diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2015-07-24 16:17:42 -0400 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2015-07-24 16:17:42 -0400 |
| commit | 1a1d503dbb9530a9d3518deef1ad40727edc1736 (patch) | |
| tree | 4757b2cd0fd3a979fa1fad590fdbb1bba2ae9a8d | |
| parent | d4db2fd99715dd731327cf485bd219331b6af781 (diff) | |
| download | rawr-ebooks-1a1d503dbb9530a9d3518deef1ad40727edc1736.tar.gz rawr-ebooks-1a1d503dbb9530a9d3518deef1ad40727edc1736.tar.bz2 rawr-ebooks-1a1d503dbb9530a9d3518deef1ad40727edc1736.zip | |
Added some newline recognition
| -rw-r--r-- | kgramstats.cpp | 86 |
1 file changed, 55 insertions, 31 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index b0c3940..16bf598 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -66,7 +66,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 66 | newSentence = false; | 66 | newSentence = false; |
| 67 | } | 67 | } |
| 68 | 68 | ||
| 69 | if (newClause) | 69 | if (newClause || newSentence) |
| 70 | { | 70 | { |
| 71 | kgram commaKgram(1, ","); | 71 | kgram commaKgram(1, ","); |
| 72 | if (tstats[commaKgram] == NULL) | 72 | if (tstats[commaKgram] == NULL) |
| @@ -78,12 +78,36 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 78 | 78 | ||
| 79 | newClause = false; | 79 | newClause = false; |
| 80 | } | 80 | } |
| 81 | 81 | ||
| 82 | if ((f.length() > 0) && ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))) | 82 | if ((f.length() > 0) && (f[f.length()-1] == '\n')) |
| 83 | { | 83 | { |
| 84 | td->period++; | 84 | td->period++; |
| 85 | newSentence = true; | 85 | newSentence = true; |
| 86 | } | 86 | f.resize(f.length()-1); |
| 87 | } | ||
| 88 | |||
| 89 | if (f.length() > 0) | ||
| 90 | { | ||
| 91 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) | ||
| 92 | { | ||
| 93 | if (!newSentence) | ||
| 94 | { | ||
| 95 | td->period++; | ||
| 96 | newSentence = true; | ||
| 97 | } | ||
| 98 | |||
| 99 | f.resize(f.length()-1); | ||
| 100 | } else if (f[f.length()-1] == ',') | ||
| 101 | { | ||
| 102 | if (!newSentence) | ||
| 103 | { | ||
| 104 | td->comma++; | ||
| 105 | newClause = true; | ||
| 106 | } | ||
| 107 | |||
| 108 | f.resize(f.length()-1); | ||
| 109 | } | ||
| 110 | } | ||
| 87 | 111 | ||
| 88 | if (f.length() > 0) | 112 | if (f.length() > 0) |
| 89 | { | 113 | { |
| @@ -92,42 +116,42 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 92 | td->startquote++; | 116 | td->startquote++; |
| 93 | } | 117 | } |
| 94 | 118 | ||
| 95 | if (f[f.length()-1] == '"') | 119 | if (f[0] == '(') |
| 96 | { | 120 | { |
| 97 | td->endquote++; | 121 | td->startparen++; |
| 98 | |||
| 99 | if ((f.length() > 1) && (f[f.length()-2] == ',')) | ||
| 100 | { | ||
| 101 | td->comma++; | ||
| 102 | newClause = true; | ||
| 103 | } | ||
| 104 | } | 122 | } |
| 105 | 123 | ||
| 106 | if (f[f.length()-1] == ',') | 124 | if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')')) |
| 107 | { | 125 | { |
| 108 | td->comma++; | 126 | if (f[f.length()-1] == '"') |
| 109 | newClause = true; | ||
| 110 | |||
| 111 | if ((f.length() > 1) && (f[f.length()-2] == '"')) | ||
| 112 | { | 127 | { |
| 113 | td->endquote++; | 128 | td->endquote++; |
| 129 | } else if (f[f.length()-1] == ')') | ||
| 130 | { | ||
| 131 | td->endparen++; | ||
| 114 | } | 132 | } |
| 115 | 133 | ||
| 116 | if ((f.length() > 1) && (f[f.length()-2] == ')')) | 134 | f.resize(f.length()-1); |
| 135 | |||
| 136 | if (f.length() > 0) | ||
| 117 | { | 137 | { |
| 118 | td->endparen++; | 138 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) |
| 139 | { | ||
| 140 | if (!newSentence) | ||
| 141 | { | ||
| 142 | td->period++; | ||
| 143 | newSentence = true; | ||
| 144 | } | ||
| 145 | } else if (f[f.length()-1] == ',') | ||
| 146 | { | ||
| 147 | if (!newSentence && !newClause) | ||
| 148 | { | ||
| 149 | td->comma++; | ||
| 150 | newClause = true; | ||
| 151 | } | ||
| 152 | } | ||
| 119 | } | 153 | } |
| 120 | } | 154 | } |
| 121 | |||
| 122 | if (f[0] == '(') | ||
| 123 | { | ||
| 124 | td->startparen++; | ||
| 125 | } | ||
| 126 | |||
| 127 | if (f[f.length()-1] == ')') | ||
| 128 | { | ||
| 129 | td->endparen++; | ||
| 130 | } | ||
| 131 | } | 155 | } |
| 132 | 156 | ||
| 133 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 157 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) |
