about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-29 12:43:00 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-29 12:43:00 -0500
commit	b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch)
tree	f21bd883ef7c4255a91d096ea105feaad135ee52 /kgramstats.cpp
parent	fd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff)
downloadrawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip
Rewrote how tokens are handled
A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags.

Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace it.
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r--kgramstats.cpp453
1 files changed, 277 insertions, 176 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 4bb7f15..0ab0c99 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -37,35 +37,11 @@
37#include <iostream> 37#include <iostream>
38#include <cstdlib> 38#include <cstdlib>
39#include <algorithm> 39#include <algorithm>
40#include "malaprop.h" 40#include <set>
41#include <stack>
41 42
42query wildcardQuery(querytype_sentence); 43query wildcardQuery {querytype::sentence};
43 44word blank_word {""};
44std::string canonize(std::string f);
45
46token token_from_string(std::string in)
47{
48 if (in[0] == '#')
49 {
50 token word(tokentype_hashtag);
51
52 if (in.find_first_of(".?!,") != std::string::npos)
53 {
54 word.terminating = true;
55 }
56
57 return word;
58 } else {
59 token word(canonize(in));
60
61 if (in.find_first_of(".?!,") != std::string::npos)
62 {
63 word.terminating = true;
64 }
65
66 return word;
67 }
68}
69 45
70// runs in O(t^2) time where t is the number of tokens in the input corpus 46// runs in O(t^2) time where t is the number of tokens in the input corpus
71// We consider maxK to be fairly constant 47// We consider maxK to be fairly constant
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
73{ 49{
74 this->maxK = maxK; 50 this->maxK = maxK;
75 51
76 std::vector<std::string> tokens; 52 std::vector<token> tokens;
77 size_t start = 0; 53 size_t start = 0;
78 int end = 0; 54 int end = 0;
79 std::set<std::string> thashtags; 55 std::set<std::string> thashtags;
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
82 { 58 {
83 end = corpus.find(" ", start); 59 end = corpus.find(" ", start);
84 60
85 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 61 std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
86 if (token[token.length()-1] == '\n') 62 if (t.compare("") && t.compare("."))
87 { 63 {
88 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) 64 std::string tc(t), canonical;
65 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
66 std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
67 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
68 });
69
70 word& w = ([&] () -> word& {
71 // Hashtag freevar
72 if (canonical[0] == '#')
73 {
74 thashtags.insert(canonical);
75 canonical = "#hashtag";
76
77 return hashtags;
78 }
79
80 // Basically any other word
81 if (words.count(canonical) == 0)
82 {
83 words.emplace(canonical, canonical);
84 }
85
86 word& tw = words.at(canonical);
87 tw.forms.add(canonical);
88
89 return tw;
90 })();
91
92 token tk(w);
93 tk.raw = t;
94
95 for (char c : t)
89 { 96 {
90 token.insert(token.length()-1, "."); 97 if (c == '*')
98 {
99 tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
100 } else if (c == '[')
101 {
102 tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
103 } else if (c == '(')
104 {
105 tk.delimiters[{parentype::paren, doublestatus::opening}]++;
106 } else if (c == '"')
107 {
108 tk.delimiters[{parentype::quote, doublestatus::opening}]++;
109 } else {
110 break;
111 }
91 } 112 }
92
93 token.resize(token.length()-1);
94 }
95
96 if (token.compare("") && token.compare("."))
97 {
98 mstats.addWord(token);
99 tokens.push_back(token);
100 113
101 if (token[0] == '#') 114 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
115 if (backtrack != t.length())
102 { 116 {
103 thashtags.insert(canonize(token)); 117 std::string ending = t.substr(backtrack);
118 std::string suffix;
119
120 for (char c : ending)
121 {
122 if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
123 {
124 suffix += c;
125
126 continue;
127 } else if (c == '\n')
128 {
129 // At least the end is coming
130 if (suffix.empty())
131 {
132 suffix = ".";
133 }
134
135 break;
136 }
137
138 parentype pt = ([&] {
139 switch (c)
140 {
141 case ']': return parentype::square_bracket;
142 case ')': return parentype::paren;
143 case '*': return parentype::asterisk;
144 case '"': return parentype::quote;
145 }
146 })();
147
148 if (tk.delimiters[{pt, doublestatus::opening}] > 0)
149 {
150 tk.delimiters[{pt, doublestatus::opening}]--;
151 tk.delimiters[{pt, doublestatus::both}]++;
152 } else {
153 tk.delimiters[{pt, doublestatus::closing}]++;
154 }
155 }
156
157 if (suffix == ",")
158 {
159 tk.suffix = suffixtype::comma;
160 } else if (!suffix.empty()) {
161 tk.suffix = suffixtype::terminating;
162
163 w.terms.add(suffix);
164 }
104 } 165 }
166
167 tokens.push_back(tk);
105 } 168 }
106 169
107 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 170 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
108 } 171 }
109 172
110 for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) 173 // Time to condense the distribution stuff for the words
174 for (auto& it : words)
111 { 175 {
112 hashtags.push_back(*it); 176 it.second.forms.compile();
177 it.second.terms.compile();
113 } 178 }
114 179
180 // Hashtag freevar is not frequency distributed
181 for (auto& it : thashtags)
182 {
183 hashtags.forms.add(it);
184 }
185
186 hashtags.forms.compile();
187 hashtags.terms.compile();
188
189 // kgram distribution
115 std::map<kgram, std::map<token, token_data> > tstats; 190 std::map<kgram, std::map<token, token_data> > tstats;
116 std::map<token, std::map<termstats, int> > tendings;
117 for (int k=1; k<maxK; k++) 191 for (int k=1; k<maxK; k++)
118 { 192 {
119 for (int i=0; i<(tokens.size() - k); i++) 193 for (int i=0; i<(tokens.size() - k); i++)
120 { 194 {
121 std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); 195 kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
122 kgram prefix; 196 token f = tokens[i+k];
123 197
124 for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) 198 if (tstats[prefix].count(f) == 0)
125 {
126 prefix.push_back(token_from_string(*it));
127 }
128
129 std::string f = tokens[i+k];
130 std::string canonical = canonize(f);
131
132 token word(token_from_string(canonical));
133 if (f.find_first_of(".?!,") != std::string::npos)
134 { 199 {
135 word.terminating = true; 200 tstats[prefix].emplace(f, f);
136
137 char terminator = f[f.find_last_of(".?!,")];
138 int occurrences = std::count(f.begin(), f.end(), terminator);
139
140 tendings[word][termstats(terminator, occurrences)]++;
141 } 201 }
142 202
143 token_data& td = tstats[prefix][word]; 203 token_data& td = tstats[prefix].at(f);
144 td.word = word;
145 td.all++; 204 td.all++;
146 205
147 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 206 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
148 { 207 {
149 td.uppercase++; 208 td.uppercase++;
150 } else if (isupper(f[0])) 209 } else if (isupper(f.raw[0]))
151 { 210 {
152 td.titlecase++; 211 td.titlecase++;
153 } 212 }
154 213
155 if (prefix.front().word.terminating) 214 kgram term_prefix;
215 bool changed = false;
216 std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) {
217 if (q.tok.suffix == suffixtype::terminating)
218 {
219 changed = true;
220
221 return wildcardQuery;
222 } else {
223 return q;
224 }
225 });
226
227 if (changed)
156 { 228 {
157 prefix.front() = wildcardQuery; 229 if (tstats[term_prefix].count(f) == 0)
230 {
231 tstats[term_prefix].emplace(f, f);
232 }
158 233
159 token_data& td2 = tstats[prefix][word]; 234 token_data& td2 = tstats[term_prefix].at(f);
160 td2.word = word;
161 td2.all++; 235 td2.all++;
162 236
163 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 237 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
164 { 238 {
165 td2.uppercase++; 239 td2.uppercase++;
166 } else if (isupper(f[0])) 240 } else if (isupper(f.raw[0]))
167 { 241 {
168 td2.titlecase++; 242 td2.titlecase++;
169 } 243 }
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
171 } 245 }
172 } 246 }
173 247
174 for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) 248 // Condense the kgram distribution
249 for (auto& it : tstats)
175 { 250 {
176 kgram klist = it->first; 251 kgram klist = it.first;
177 std::map<token, token_data>& probtable = it->second; 252 auto& probtable = it.second;
178 std::map<int, token_data>& distribution = stats[klist]; 253 auto& distribution = stats[klist];
179 int max = 0; 254 int max = 0;
180 255
181 for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) 256 for (auto& kt : probtable)
182 { 257 {
183 max += kt->second.all; 258 max += kt.second.all;
184 259
185 distribution[max] = kt->second; 260 distribution.emplace(max, kt.second);
186 }
187 }
188
189 for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
190 {
191 token word = it->first;
192 std::map<termstats, int>& probtable = it->second;
193 std::map<int, termstats>& distribution = endings[word];
194 int max = 0;
195
196 for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
197 {
198 max += kt->second;
199
200 distribution[max] = kt->first;
201 } 261 }
202 } 262 }
203} 263}
204 264
205void printKgram(kgram k) 265void printKgram(kgram k)
206{ 266{
207 for (kgram::iterator it = k.begin(); it != k.end(); it++) 267 for (auto& q : k)
208 { 268 {
209 query& q = *it; 269 if (q.type == querytype::sentence)
210 if (q.type == querytype_sentence)
211 { 270 {
212 std::cout << "#.# "; 271 std::cout << "#.# ";
213 } else if (q.type == querytype_literal) 272 } else if (q.type == querytype::literal)
214 { 273 {
215 if (q.word.type == tokentype_hashtag) 274 if (q.tok.suffix == suffixtype::terminating)
216 { 275 {
217 if (q.word.terminating) 276 std::cout << q.tok.w.canon << ". ";
218 { 277 } else if (q.tok.suffix == suffixtype::comma)
219 std::cout << "#hashtag. ";
220 } else {
221 std::cout << "#hashtag ";
222 }
223 } else if (q.word.type == tokentype_literal)
224 { 278 {
225 if (q.word.terminating) 279 std::cout << q.tok.w.canon << ", ";
226 { 280 } else {
227 std::cout << q.word.canon << ". "; 281 std::cout << q.tok.w.canon << " ";
228 } else {
229 std::cout << q.word.canon << " ";
230 }
231 } 282 }
232 } 283 }
233 } 284 }
234} 285}
235 286
236// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 287// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
237std::vector<std::string> kgramstats::randomSentence(int n) 288std::string kgramstats::randomSentence(int n)
238{ 289{
239 std::vector<std::string> result; 290 std::string result;
240 kgram cur(1, wildcardQuery); 291 kgram cur(1, wildcardQuery);
241 int cuts = 0; 292 int cuts = 0;
293 std::stack<parentype> open_delimiters;
242 294
243 for (int i=0; i<n; i++) 295 for (int i=0; i<n; i++)
244 { 296 {
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
273 cur = kgram(1, wildcardQuery); 325 cur = kgram(1, wildcardQuery);
274 } 326 }
275 327
276 std::map<int, token_data>& distribution = stats[cur]; 328 auto& distribution = stats[cur];
277 int max = distribution.rbegin()->first; 329 int max = distribution.rbegin()->first;
278 int r = rand() % max; 330 int r = rand() % max;
279 token_data& next = distribution.upper_bound(r)->second; 331 token_data& next = distribution.upper_bound(r)->second;
280 std::string nextToken; 332 std::string nextToken = next.tok.w.forms.next();
281 bool mess = false; 333
282 334 // Determine the casing of the next token. We randomly make the token all
283 if (next.word.type == tokentype_literal) 335 // caps based on the markov chain. Otherwise, we check if the previous
336 // token is the end of a sentence (terminating token or a wildcard query).
337 int casing = rand() % next.all;
338 if (casing < next.uppercase)
339 {
340 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
341 } else if ((((cur.rbegin()->type == querytype::sentence)
342 || ((cur.rbegin()->type == querytype::literal)
343 && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
344 && (rand() % 2 > 0))
345 || (casing - next.uppercase < next.titlecase))
284 { 346 {
285 nextToken = next.word.canon; 347 nextToken[0] = toupper(nextToken[0]);
348 }
286 349
287 mess = (rand() % 100) == 0; 350 // Delimiters
288 if (mess) 351 for (auto& dt : next.tok.delimiters)
352 {
353 if (dt.first.status == doublestatus::both)
289 { 354 {
290 nextToken = mstats.alternate(nextToken); 355 switch (dt.first.type)
291 } 356 {
292 357 case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break;
293 // Determine the casing of the next token. We randomly make the token all 358 case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break;
294 // caps based on the markov chain. Otherwise, we check if the previous 359 case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break;
295 // token is the end of a sentence (terminating token or a wildcard query). 360 case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break;
296 int casing = rand() % next.all; 361 }
297 if (casing < next.uppercase) 362 } else if (dt.first.status == doublestatus::opening)
298 { 363 {
299 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 364 for (int i=0; i<dt.second; i++)
300 } else if ((((cur.rbegin()->type == querytype_sentence) 365 {
301 || ((cur.rbegin()->type == querytype_literal) 366 open_delimiters.push(dt.first.type);
302 && (cur.rbegin()->word.terminating))) 367 }
303 && (rand() % 2 > 0)) 368
304 || (casing - next.uppercase < next.titlecase)) 369 switch (dt.first.type)
370 {
371 case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break;
372 case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break;
373 case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break;
374 case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break;
375 }
376 } else if (dt.first.status == doublestatus::closing)
305 { 377 {
306 nextToken[0] = toupper(nextToken[0]); 378 for (int i=0; i<dt.second; i++)
379 {
380 while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type))
381 {
382 switch (open_delimiters.top())
383 {
384 case parentype::paren: nextToken.append(")"); break;
385 case parentype::square_bracket: nextToken.append("]"); break;
386 case parentype::asterisk: nextToken.append("*"); break;
387 case parentype::quote: nextToken.append("\""); break;
388 }
389
390 open_delimiters.pop();
391 }
392
393 if (open_delimiters.empty())
394 {
395 switch (dt.first.type)
396 {
397 case parentype::paren: result = "(" + result; break;
398 case parentype::square_bracket: result = "[" + result; break;
399 case parentype::asterisk: result = "*" + result; break;
400 case parentype::quote: result = "\"" + result; break;
401 }
402 }
403
404 switch (dt.first.type)
405 {
406 case parentype::paren: nextToken.append(")"); break;
407 case parentype::square_bracket: nextToken.append("]"); break;
408 case parentype::asterisk: nextToken.append("*"); break;
409 case parentype::quote: nextToken.append("\""); break;
410 }
411 }
307 } 412 }
308 } else if (next.word.type == tokentype_hashtag)
309 {
310 int rhash = rand() % hashtags.size();
311 nextToken = hashtags[rhash];
312 } 413 }
313 414
314 if (next.word.terminating) 415 // Terminators
416 if (next.tok.suffix == suffixtype::terminating)
315 { 417 {
316 std::map<int, termstats>& ending = endings[next.word]; 418 nextToken.append(next.tok.w.terms.next());
317 int emax = ending.rbegin()->first; 419 } else if (next.tok.suffix == suffixtype::comma)
318 int er = rand() % emax; 420 {
319 termstats& nextend = ending.upper_bound(er)->second; 421 nextToken.append(",");
320
321 nextToken.append(std::string(nextend.occurrences, nextend.terminator));
322 } 422 }
323 423
324 /* DEBUG */ 424 /* DEBUG */
325 printKgram(cur); 425 printKgram(cur);
426 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
427
428 cur.push_back(next.tok);
326 429
327 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; 430 result.append(nextToken + " ");
328 431
329 if (mess) 432 if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0))
330 { 433 {
331 std::cout << " mala " << next.word.canon; 434 break;
332 } 435 }
333
334 std::cout << std::endl;
335
336 cur.push_back(next.word);
337
338 result.push_back(nextToken);
339 } 436 }
340
341 return result;
342}
343
344bool removeIf(char c)
345{
346 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
347}
348
349std::string canonize(std::string f)
350{
351 std::string canonical(f);
352 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
353 437
354 std::string result; 438 // Remove the trailing space
355 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 439 if (result.back() == ' ')
440 {
441 result.pop_back();
442 }
443
444 // Close any open delimiters
445 while (!open_delimiters.empty())
446 {
447 switch (open_delimiters.top())
448 {
449 case parentype::paren: result.append(")"); break;
450 case parentype::square_bracket: result.append("]"); break;
451 case parentype::asterisk: result.append("*"); break;
452 case parentype::quote: result.append("\""); break;
453 }
454
455 open_delimiters.pop();
456 }
356 457
357 return result; 458 return result;
358} 459}