author     Kelly Rauchenberger <fefferburbia@gmail.com>  2016-01-29 12:43:00 -0500
committer  Kelly Rauchenberger <fefferburbia@gmail.com>  2016-01-29 12:43:00 -0500
commit     b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch)
tree       f21bd883ef7c4255a91d096ea105feaad135ee52 /kgramstats.cpp
parent     fd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff)
Rewrote how tokens are handled
A 'word' is now an object that contains a distribution of the forms that word can take. For now, most words contain just one form, the canonical one. The only special use currently is hashtags. Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace them.
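For orientation, the following is a minimal sketch, not part of this commit, of the interface the rewritten kgramstats.cpp appears to rely on: a small frequency-distribution helper with add/compile/next, and the new 'word' object holding one distribution of surface forms and one of terminating suffixes. The member names (canon, forms, terms) mirror their usage in the diff below; the class name "histogram" and everything else here are assumptions, since the real definitions live in kgramstats.h, which is not part of this diff.

#include <cstdlib>
#include <map>
#include <string>

class histogram  // assumed name; sketch only
{
  public:
    // Record one observation of the form f.
    void add(const std::string& f) { freqtable[f]++; }

    // Flatten the frequency table into a cumulative distribution so that
    // next() can sample with a single upper_bound lookup.
    void compile()
    {
      distribution.clear();
      int total = 0;
      for (auto& it : freqtable)
      {
        total += it.second;
        distribution.emplace(total, it.first);
      }
      freqtable.clear();
    }

    // Sample a form proportionally to how often it was observed.
    // Assumes compile() has been called and at least one form was added.
    std::string next() const
    {
      int r = rand() % distribution.rbegin()->first;
      return distribution.upper_bound(r)->second;
    }

  private:
    std::map<std::string, int> freqtable;
    std::map<int, std::string> distribution;
};

struct word
{
  explicit word(std::string canon) : canon(std::move(canon)) {}

  std::string canon; // canonical (lowercased, punctuation-stripped) spelling
  histogram forms;   // surface forms this word can take (e.g. concrete hashtags)
  histogram terms;   // terminating punctuation observed after this word
};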
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r--  kgramstats.cpp  453
1 file changed, 277 insertions(+), 176 deletions(-)
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 4bb7f15..0ab0c99 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -37,35 +37,11 @@
37 | #include <iostream> | 37 | #include <iostream> |
38 | #include <cstdlib> | 38 | #include <cstdlib> |
39 | #include <algorithm> | 39 | #include <algorithm> |
40 | #include "malaprop.h" | 40 | #include <set> |
41 | #include <stack> | ||
41 | 42 | ||
42 | query wildcardQuery(querytype_sentence); | 43 | query wildcardQuery {querytype::sentence}; |
43 | 44 | word blank_word {""}; | |
44 | std::string canonize(std::string f); | ||
45 | |||
46 | token token_from_string(std::string in) | ||
47 | { | ||
48 | if (in[0] == '#') | ||
49 | { | ||
50 | token word(tokentype_hashtag); | ||
51 | |||
52 | if (in.find_first_of(".?!,") != std::string::npos) | ||
53 | { | ||
54 | word.terminating = true; | ||
55 | } | ||
56 | |||
57 | return word; | ||
58 | } else { | ||
59 | token word(canonize(in)); | ||
60 | |||
61 | if (in.find_first_of(".?!,") != std::string::npos) | ||
62 | { | ||
63 | word.terminating = true; | ||
64 | } | ||
65 | |||
66 | return word; | ||
67 | } | ||
68 | } | ||
69 | 45 | ||
70 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 46 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
71 | // We consider maxK to be fairly constant | 47 | // We consider maxK to be fairly constant |
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
73 | { | 49 | { |
74 | this->maxK = maxK; | 50 | this->maxK = maxK; |
75 | 51 | ||
76 | std::vector<std::string> tokens; | 52 | std::vector<token> tokens; |
77 | size_t start = 0; | 53 | size_t start = 0; |
78 | int end = 0; | 54 | int end = 0; |
79 | std::set<std::string> thashtags; | 55 | std::set<std::string> thashtags; |
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
82 | { | 58 | { |
83 | end = corpus.find(" ", start); | 59 | end = corpus.find(" ", start); |
84 | 60 | ||
85 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 61 | std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
86 | if (token[token.length()-1] == '\n') | 62 | if (t.compare("") && t.compare(".")) |
87 | { | 63 | { |
88 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) | 64 | std::string tc(t), canonical; |
65 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | ||
66 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | ||
67 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
68 | }); | ||
69 | |||
70 | word& w = ([&] () -> word& { | ||
71 | // Hashtag freevar | ||
72 | if (canonical[0] == '#') | ||
73 | { | ||
74 | thashtags.insert(canonical); | ||
75 | canonical = "#hashtag"; | ||
76 | |||
77 | return hashtags; | ||
78 | } | ||
79 | |||
80 | // Basically any other word | ||
81 | if (words.count(canonical) == 0) | ||
82 | { | ||
83 | words.emplace(canonical, canonical); | ||
84 | } | ||
85 | |||
86 | word& tw = words.at(canonical); | ||
87 | tw.forms.add(canonical); | ||
88 | |||
89 | return tw; | ||
90 | })(); | ||
91 | |||
92 | token tk(w); | ||
93 | tk.raw = t; | ||
94 | |||
95 | for (char c : t) | ||
89 | { | 96 | { |
90 | token.insert(token.length()-1, "."); | 97 | if (c == '*') |
98 | { | ||
99 | tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; | ||
100 | } else if (c == '[') | ||
101 | { | ||
102 | tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; | ||
103 | } else if (c == '(') | ||
104 | { | ||
105 | tk.delimiters[{parentype::paren, doublestatus::opening}]++; | ||
106 | } else if (c == '"') | ||
107 | { | ||
108 | tk.delimiters[{parentype::quote, doublestatus::opening}]++; | ||
109 | } else { | ||
110 | break; | ||
111 | } | ||
91 | } | 112 | } |
92 | |||
93 | token.resize(token.length()-1); | ||
94 | } | ||
95 | |||
96 | if (token.compare("") && token.compare(".")) | ||
97 | { | ||
98 | mstats.addWord(token); | ||
99 | tokens.push_back(token); | ||
100 | 113 | ||
101 | if (token[0] == '#') | 114 | int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; |
115 | if (backtrack != t.length()) | ||
102 | { | 116 | { |
103 | thashtags.insert(canonize(token)); | 117 | std::string ending = t.substr(backtrack); |
118 | std::string suffix; | ||
119 | |||
120 | for (char c : ending) | ||
121 | { | ||
122 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) | ||
123 | { | ||
124 | suffix += c; | ||
125 | |||
126 | continue; | ||
127 | } else if (c == '\n') | ||
128 | { | ||
129 | // At least the end is coming | ||
130 | if (suffix.empty()) | ||
131 | { | ||
132 | suffix = "."; | ||
133 | } | ||
134 | |||
135 | break; | ||
136 | } | ||
137 | |||
138 | parentype pt = ([&] { | ||
139 | switch (c) | ||
140 | { | ||
141 | case ']': return parentype::square_bracket; | ||
142 | case ')': return parentype::paren; | ||
143 | case '*': return parentype::asterisk; | ||
144 | case '"': return parentype::quote; | ||
145 | } | ||
146 | })(); | ||
147 | |||
148 | if (tk.delimiters[{pt, doublestatus::opening}] > 0) | ||
149 | { | ||
150 | tk.delimiters[{pt, doublestatus::opening}]--; | ||
151 | tk.delimiters[{pt, doublestatus::both}]++; | ||
152 | } else { | ||
153 | tk.delimiters[{pt, doublestatus::closing}]++; | ||
154 | } | ||
155 | } | ||
156 | |||
157 | if (suffix == ",") | ||
158 | { | ||
159 | tk.suffix = suffixtype::comma; | ||
160 | } else if (!suffix.empty()) { | ||
161 | tk.suffix = suffixtype::terminating; | ||
162 | |||
163 | w.terms.add(suffix); | ||
164 | } | ||
104 | } | 165 | } |
166 | |||
167 | tokens.push_back(tk); | ||
105 | } | 168 | } |
106 | 169 | ||
107 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 170 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
108 | } | 171 | } |
109 | 172 | ||
110 | for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) | 173 | // Time to condense the distribution stuff for the words |
174 | for (auto& it : words) | ||
111 | { | 175 | { |
112 | hashtags.push_back(*it); | 176 | it.second.forms.compile(); |
177 | it.second.terms.compile(); | ||
113 | } | 178 | } |
114 | 179 | ||
180 | // Hashtag freevar is not frequency distributed | ||
181 | for (auto& it : thashtags) | ||
182 | { | ||
183 | hashtags.forms.add(it); | ||
184 | } | ||
185 | |||
186 | hashtags.forms.compile(); | ||
187 | hashtags.terms.compile(); | ||
188 | |||
189 | // kgram distribution | ||
115 | std::map<kgram, std::map<token, token_data> > tstats; | 190 | std::map<kgram, std::map<token, token_data> > tstats; |
116 | std::map<token, std::map<termstats, int> > tendings; | ||
117 | for (int k=1; k<maxK; k++) | 191 | for (int k=1; k<maxK; k++) |
118 | { | 192 | { |
119 | for (int i=0; i<(tokens.size() - k); i++) | 193 | for (int i=0; i<(tokens.size() - k); i++) |
120 | { | 194 | { |
121 | std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); | 195 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); |
122 | kgram prefix; | 196 | token f = tokens[i+k]; |
123 | 197 | ||
124 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) | 198 | if (tstats[prefix].count(f) == 0) |
125 | { | ||
126 | prefix.push_back(token_from_string(*it)); | ||
127 | } | ||
128 | |||
129 | std::string f = tokens[i+k]; | ||
130 | std::string canonical = canonize(f); | ||
131 | |||
132 | token word(token_from_string(canonical)); | ||
133 | if (f.find_first_of(".?!,") != std::string::npos) | ||
134 | { | 199 | { |
135 | word.terminating = true; | 200 | tstats[prefix].emplace(f, f); |
136 | |||
137 | char terminator = f[f.find_last_of(".?!,")]; | ||
138 | int occurrences = std::count(f.begin(), f.end(), terminator); | ||
139 | |||
140 | tendings[word][termstats(terminator, occurrences)]++; | ||
141 | } | 201 | } |
142 | 202 | ||
143 | token_data& td = tstats[prefix][word]; | 203 | token_data& td = tstats[prefix].at(f); |
144 | td.word = word; | ||
145 | td.all++; | 204 | td.all++; |
146 | 205 | ||
147 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 206 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
148 | { | 207 | { |
149 | td.uppercase++; | 208 | td.uppercase++; |
150 | } else if (isupper(f[0])) | 209 | } else if (isupper(f.raw[0])) |
151 | { | 210 | { |
152 | td.titlecase++; | 211 | td.titlecase++; |
153 | } | 212 | } |
154 | 213 | ||
155 | if (prefix.front().word.terminating) | 214 | kgram term_prefix; |
215 | bool changed = false; | ||
216 | std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) { | ||
217 | if (q.tok.suffix == suffixtype::terminating) | ||
218 | { | ||
219 | changed = true; | ||
220 | |||
221 | return wildcardQuery; | ||
222 | } else { | ||
223 | return q; | ||
224 | } | ||
225 | }); | ||
226 | |||
227 | if (changed) | ||
156 | { | 228 | { |
157 | prefix.front() = wildcardQuery; | 229 | if (tstats[term_prefix].count(f) == 0) |
230 | { | ||
231 | tstats[term_prefix].emplace(f, f); | ||
232 | } | ||
158 | 233 | ||
159 | token_data& td2 = tstats[prefix][word]; | 234 | token_data& td2 = tstats[term_prefix].at(f); |
160 | td2.word = word; | ||
161 | td2.all++; | 235 | td2.all++; |
162 | 236 | ||
163 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 237 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
164 | { | 238 | { |
165 | td2.uppercase++; | 239 | td2.uppercase++; |
166 | } else if (isupper(f[0])) | 240 | } else if (isupper(f.raw[0])) |
167 | { | 241 | { |
168 | td2.titlecase++; | 242 | td2.titlecase++; |
169 | } | 243 | } |
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
171 | } | 245 | } |
172 | } | 246 | } |
173 | 247 | ||
174 | for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) | 248 | // Condense the kgram distribution |
249 | for (auto& it : tstats) | ||
175 | { | 250 | { |
176 | kgram klist = it->first; | 251 | kgram klist = it.first; |
177 | std::map<token, token_data>& probtable = it->second; | 252 | auto& probtable = it.second; |
178 | std::map<int, token_data>& distribution = stats[klist]; | 253 | auto& distribution = stats[klist]; |
179 | int max = 0; | 254 | int max = 0; |
180 | 255 | ||
181 | for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | 256 | for (auto& kt : probtable) |
182 | { | 257 | { |
183 | max += kt->second.all; | 258 | max += kt.second.all; |
184 | 259 | ||
185 | distribution[max] = kt->second; | 260 | distribution.emplace(max, kt.second); |
186 | } | ||
187 | } | ||
188 | |||
189 | for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++) | ||
190 | { | ||
191 | token word = it->first; | ||
192 | std::map<termstats, int>& probtable = it->second; | ||
193 | std::map<int, termstats>& distribution = endings[word]; | ||
194 | int max = 0; | ||
195 | |||
196 | for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | ||
197 | { | ||
198 | max += kt->second; | ||
199 | |||
200 | distribution[max] = kt->first; | ||
201 | } | 261 | } |
202 | } | 262 | } |
203 | } | 263 | } |
204 | 264 | ||
205 | void printKgram(kgram k) | 265 | void printKgram(kgram k) |
206 | { | 266 | { |
207 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 267 | for (auto& q : k) |
208 | { | 268 | { |
209 | query& q = *it; | 269 | if (q.type == querytype::sentence) |
210 | if (q.type == querytype_sentence) | ||
211 | { | 270 | { |
212 | std::cout << "#.# "; | 271 | std::cout << "#.# "; |
213 | } else if (q.type == querytype_literal) | 272 | } else if (q.type == querytype::literal) |
214 | { | 273 | { |
215 | if (q.word.type == tokentype_hashtag) | 274 | if (q.tok.suffix == suffixtype::terminating) |
216 | { | 275 | { |
217 | if (q.word.terminating) | 276 | std::cout << q.tok.w.canon << ". "; |
218 | { | 277 | } else if (q.tok.suffix == suffixtype::comma) |
219 | std::cout << "#hashtag. "; | ||
220 | } else { | ||
221 | std::cout << "#hashtag "; | ||
222 | } | ||
223 | } else if (q.word.type == tokentype_literal) | ||
224 | { | 278 | { |
225 | if (q.word.terminating) | 279 | std::cout << q.tok.w.canon << ", "; |
226 | { | 280 | } else { |
227 | std::cout << q.word.canon << ". "; | 281 | std::cout << q.tok.w.canon << " "; |
228 | } else { | ||
229 | std::cout << q.word.canon << " "; | ||
230 | } | ||
231 | } | 282 | } |
232 | } | 283 | } |
233 | } | 284 | } |
234 | } | 285 | } |
235 | 286 | ||
236 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 287 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
237 | std::vector<std::string> kgramstats::randomSentence(int n) | 288 | std::string kgramstats::randomSentence(int n) |
238 | { | 289 | { |
239 | std::vector<std::string> result; | 290 | std::string result; |
240 | kgram cur(1, wildcardQuery); | 291 | kgram cur(1, wildcardQuery); |
241 | int cuts = 0; | 292 | int cuts = 0; |
293 | std::stack<parentype> open_delimiters; | ||
242 | 294 | ||
243 | for (int i=0; i<n; i++) | 295 | for (int i=0; i<n; i++) |
244 | { | 296 | { |
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
273 | cur = kgram(1, wildcardQuery); | 325 | cur = kgram(1, wildcardQuery); |
274 | } | 326 | } |
275 | 327 | ||
276 | std::map<int, token_data>& distribution = stats[cur]; | 328 | auto& distribution = stats[cur]; |
277 | int max = distribution.rbegin()->first; | 329 | int max = distribution.rbegin()->first; |
278 | int r = rand() % max; | 330 | int r = rand() % max; |
279 | token_data& next = distribution.upper_bound(r)->second; | 331 | token_data& next = distribution.upper_bound(r)->second; |
280 | std::string nextToken; | 332 | std::string nextToken = next.tok.w.forms.next(); |
281 | bool mess = false; | 333 | |
282 | 334 | // Determine the casing of the next token. We randomly make the token all | |
283 | if (next.word.type == tokentype_literal) | 335 | // caps based on the markov chain. Otherwise, we check if the previous |
336 | // token is the end of a sentence (terminating token or a wildcard query). | ||
337 | int casing = rand() % next.all; | ||
338 | if (casing < next.uppercase) | ||
339 | { | ||
340 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
341 | } else if ((((cur.rbegin()->type == querytype::sentence) | ||
342 | || ((cur.rbegin()->type == querytype::literal) | ||
343 | && (cur.rbegin()->tok.suffix == suffixtype::terminating))) | ||
344 | && (rand() % 2 > 0)) | ||
345 | || (casing - next.uppercase < next.titlecase)) | ||
284 | { | 346 | { |
285 | nextToken = next.word.canon; | 347 | nextToken[0] = toupper(nextToken[0]); |
348 | } | ||
286 | 349 | ||
287 | mess = (rand() % 100) == 0; | 350 | // Delimiters |
288 | if (mess) | 351 | for (auto& dt : next.tok.delimiters) |
352 | { | ||
353 | if (dt.first.status == doublestatus::both) | ||
289 | { | 354 | { |
290 | nextToken = mstats.alternate(nextToken); | 355 | switch (dt.first.type) |
291 | } | 356 | { |
292 | 357 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break; | |
293 | // Determine the casing of the next token. We randomly make the token all | 358 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break; |
294 | // caps based on the markov chain. Otherwise, we check if the previous | 359 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break; |
295 | // token is the end of a sentence (terminating token or a wildcard query). | 360 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break; |
296 | int casing = rand() % next.all; | 361 | } |
297 | if (casing < next.uppercase) | 362 | } else if (dt.first.status == doublestatus::opening) |
298 | { | 363 | { |
299 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 364 | for (int i=0; i<dt.second; i++) |
300 | } else if ((((cur.rbegin()->type == querytype_sentence) | 365 | { |
301 | || ((cur.rbegin()->type == querytype_literal) | 366 | open_delimiters.push(dt.first.type); |
302 | && (cur.rbegin()->word.terminating))) | 367 | } |
303 | && (rand() % 2 > 0)) | 368 | |
304 | || (casing - next.uppercase < next.titlecase)) | 369 | switch (dt.first.type) |
370 | { | ||
371 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break; | ||
372 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break; | ||
373 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break; | ||
374 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break; | ||
375 | } | ||
376 | } else if (dt.first.status == doublestatus::closing) | ||
305 | { | 377 | { |
306 | nextToken[0] = toupper(nextToken[0]); | 378 | for (int i=0; i<dt.second; i++) |
379 | { | ||
380 | while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type)) | ||
381 | { | ||
382 | switch (open_delimiters.top()) | ||
383 | { | ||
384 | case parentype::paren: nextToken.append(")"); break; | ||
385 | case parentype::square_bracket: nextToken.append("]"); break; | ||
386 | case parentype::asterisk: nextToken.append("*"); break; | ||
387 | case parentype::quote: nextToken.append("\""); break; | ||
388 | } | ||
389 | |||
390 | open_delimiters.pop(); | ||
391 | } | ||
392 | |||
393 | if (open_delimiters.empty()) | ||
394 | { | ||
395 | switch (dt.first.type) | ||
396 | { | ||
397 | case parentype::paren: result = "(" + result; break; | ||
398 | case parentype::square_bracket: result = "[" + result; break; | ||
399 | case parentype::asterisk: result = "*" + result; break; | ||
400 | case parentype::quote: result = "\"" + result; break; | ||
401 | } | ||
402 | } | ||
403 | |||
404 | switch (dt.first.type) | ||
405 | { | ||
406 | case parentype::paren: nextToken.append(")"); break; | ||
407 | case parentype::square_bracket: nextToken.append("]"); break; | ||
408 | case parentype::asterisk: nextToken.append("*"); break; | ||
409 | case parentype::quote: nextToken.append("\""); break; | ||
410 | } | ||
411 | } | ||
307 | } | 412 | } |
308 | } else if (next.word.type == tokentype_hashtag) | ||
309 | { | ||
310 | int rhash = rand() % hashtags.size(); | ||
311 | nextToken = hashtags[rhash]; | ||
312 | } | 413 | } |
313 | 414 | ||
314 | if (next.word.terminating) | 415 | // Terminators |
416 | if (next.tok.suffix == suffixtype::terminating) | ||
315 | { | 417 | { |
316 | std::map<int, termstats>& ending = endings[next.word]; | 418 | nextToken.append(next.tok.w.terms.next()); |
317 | int emax = ending.rbegin()->first; | 419 | } else if (next.tok.suffix == suffixtype::comma) |
318 | int er = rand() % emax; | 420 | { |
319 | termstats& nextend = ending.upper_bound(er)->second; | 421 | nextToken.append(","); |
320 | |||
321 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); | ||
322 | } | 422 | } |
323 | 423 | ||
324 | /* DEBUG */ | 424 | /* DEBUG */ |
325 | printKgram(cur); | 425 | printKgram(cur); |
426 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | ||
427 | |||
428 | cur.push_back(next.tok); | ||
326 | 429 | ||
327 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; | 430 | result.append(nextToken + " "); |
328 | 431 | ||
329 | if (mess) | 432 | if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0)) |
330 | { | 433 | { |
331 | std::cout << " mala " << next.word.canon; | 434 | break; |
332 | } | 435 | } |
333 | |||
334 | std::cout << std::endl; | ||
335 | |||
336 | cur.push_back(next.word); | ||
337 | |||
338 | result.push_back(nextToken); | ||
339 | } | 436 | } |
340 | |||
341 | return result; | ||
342 | } | ||
343 | |||
344 | bool removeIf(char c) | ||
345 | { | ||
346 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/); | ||
347 | } | ||
348 | |||
349 | std::string canonize(std::string f) | ||
350 | { | ||
351 | std::string canonical(f); | ||
352 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | ||
353 | 437 | ||
354 | std::string result; | 438 | // Remove the trailing space |
355 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 439 | if (result.back() == ' ') |
440 | { | ||
441 | result.pop_back(); | ||
442 | } | ||
443 | |||
444 | // Close any open delimiters | ||
445 | while (!open_delimiters.empty()) | ||
446 | { | ||
447 | switch (open_delimiters.top()) | ||
448 | { | ||
449 | case parentype::paren: result.append(")"); break; | ||
450 | case parentype::square_bracket: result.append("]"); break; | ||
451 | case parentype::asterisk: result.append("*"); break; | ||
452 | case parentype::quote: result.append("\""); break; | ||
453 | } | ||
454 | |||
455 | open_delimiters.pop(); | ||
456 | } | ||
356 | 457 | ||
357 | return result; | 458 | return result; |
358 | } | 459 | } |