diff options
| -rw-r--r-- | kgramstats.cpp | 86 | 
1 files changed, 55 insertions, 31 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index b0c3940..16bf598 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -66,7 +66,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 66 | newSentence = false; | 66 | newSentence = false; | 
| 67 | } | 67 | } | 
| 68 | 68 | ||
| 69 | if (newClause) | 69 | if (newClause || newSentence) | 
| 70 | { | 70 | { | 
| 71 | kgram commaKgram(1, ","); | 71 | kgram commaKgram(1, ","); | 
| 72 | if (tstats[commaKgram] == NULL) | 72 | if (tstats[commaKgram] == NULL) | 
| @@ -78,12 +78,36 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 78 | 78 | ||
| 79 | newClause = false; | 79 | newClause = false; | 
| 80 | } | 80 | } | 
| 81 | 81 | ||
| 82 | if ((f.length() > 0) && ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))) | 82 | if ((f.length() > 0) && (f[f.length()-1] == '\n')) | 
| 83 | { | 83 | { | 
| 84 | td->period++; | 84 | td->period++; | 
| 85 | newSentence = true; | 85 | newSentence = true; | 
| 86 | } | 86 | f.resize(f.length()-1); | 
| 87 | } | ||
| 88 | |||
| 89 | if (f.length() > 0) | ||
| 90 | { | ||
| 91 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) | ||
| 92 | { | ||
| 93 | if (!newSentence) | ||
| 94 | { | ||
| 95 | td->period++; | ||
| 96 | newSentence = true; | ||
| 97 | } | ||
| 98 | |||
| 99 | f.resize(f.length()-1); | ||
| 100 | } else if (f[f.length()-1] == ',') | ||
| 101 | { | ||
| 102 | if (!newSentence) | ||
| 103 | { | ||
| 104 | td->comma++; | ||
| 105 | newClause = true; | ||
| 106 | } | ||
| 107 | |||
| 108 | f.resize(f.length()-1); | ||
| 109 | } | ||
| 110 | } | ||
| 87 | 111 | ||
| 88 | if (f.length() > 0) | 112 | if (f.length() > 0) | 
| 89 | { | 113 | { | 
| @@ -92,42 +116,42 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 92 | td->startquote++; | 116 | td->startquote++; | 
| 93 | } | 117 | } | 
| 94 | 118 | ||
| 95 | if (f[f.length()-1] == '"') | 119 | if (f[0] == '(') | 
| 96 | { | 120 | { | 
| 97 | td->endquote++; | 121 | td->startparen++; | 
| 98 | |||
| 99 | if ((f.length() > 1) && (f[f.length()-2] == ',')) | ||
| 100 | { | ||
| 101 | td->comma++; | ||
| 102 | newClause = true; | ||
| 103 | } | ||
| 104 | } | 122 | } | 
| 105 | 123 | ||
| 106 | if (f[f.length()-1] == ',') | 124 | if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')')) | 
| 107 | { | 125 | { | 
| 108 | td->comma++; | 126 | if (f[f.length()-1] == '"') | 
| 109 | newClause = true; | ||
| 110 | |||
| 111 | if ((f.length() > 1) && (f[f.length()-2] == '"')) | ||
| 112 | { | 127 | { | 
| 113 | td->endquote++; | 128 | td->endquote++; | 
| 129 | } else if (f[f.length()-1] == ')') | ||
| 130 | { | ||
| 131 | td->endparen++; | ||
| 114 | } | 132 | } | 
| 115 | 133 | ||
| 116 | if ((f.length() > 1) && (f[f.length()-2] == ')')) | 134 | f.resize(f.length()-1); | 
| 135 | |||
| 136 | if (f.length() > 0) | ||
| 117 | { | 137 | { | 
| 118 | td->endparen++; | 138 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) | 
| 139 | { | ||
| 140 | if (!newSentence) | ||
| 141 | { | ||
| 142 | td->period++; | ||
| 143 | newSentence = true; | ||
| 144 | } | ||
| 145 | } else if (f[f.length()-1] == ',') | ||
| 146 | { | ||
| 147 | if (!newSentence && !newClause) | ||
| 148 | { | ||
| 149 | td->comma++; | ||
| 150 | newClause = true; | ||
| 151 | } | ||
| 152 | } | ||
| 119 | } | 153 | } | 
| 120 | } | 154 | } | 
| 121 | |||
| 122 | if (f[0] == '(') | ||
| 123 | { | ||
| 124 | td->startparen++; | ||
| 125 | } | ||
| 126 | |||
| 127 | if (f[f.length()-1] == ')') | ||
| 128 | { | ||
| 129 | td->endparen++; | ||
| 130 | } | ||
| 131 | } | 155 | } | 
| 132 | 156 | ||
| 133 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 157 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 
