about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
authorKelly Rauchenberger <fefferburbia@gmail.com>2016-05-20 23:14:06 -0400
committerKelly Rauchenberger <fefferburbia@gmail.com>2016-05-20 23:15:10 -0400
commit8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch)
tree0d9a8a12616d6ea335fdc687049b05f679e8ccc6 /kgramstats.cpp
parenta9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff)
downloadrawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz
rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2
rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip
Pulled the ebooks functionality out into a library
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r--kgramstats.cpp465
1 files changed, 252 insertions, 213 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index a44bf2b..47f3bc0 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -33,32 +33,47 @@
33// 33//
34 34
35#include "kgramstats.h" 35#include "kgramstats.h"
36#include <vector>
37#include <iostream> 36#include <iostream>
38#include <cstdlib>
39#include <cstring> 37#include <cstring>
40#include <algorithm> 38#include <algorithm>
41#include <set> 39#include <set>
42#include <stack> 40#include <stack>
43#include "freevars.h"
44#include <fstream>
45#include "prefix_search.h" 41#include "prefix_search.h"
46#include <aspell.h> 42#include <aspell.h>
43#include <fstream>
44
45const rawr::query rawr::wildcardQuery = {querytype::sentence};
46const rawr::word rawr::blank_word = {""};
47 47
48query wildcardQuery {querytype::sentence}; 48void rawr::addCorpus(std::string corpus)
49word blank_word {""}; 49{
50 _corpora.push_back(corpus);
51}
50 52
51// runs in O(t^2) time where t is the number of tokens in the input corpus 53// runs in O(t^2) time where t is the number of tokens in the input corpus
52// We consider maxK to be fairly constant 54// We consider maxK to be fairly constant
53kgramstats::kgramstats(std::string corpus, int maxK) 55void rawr::compile(int maxK)
54{ 56{
55 this->maxK = maxK; 57 _maxK = maxK;
56 58
57 std::vector<token> tokens; 59 std::vector<token> tokens;
58 size_t start = 0; 60 size_t start = 0;
59 int end = 0;
60 std::set<std::string> thashtags; 61 std::set<std::string> thashtags;
61 freevar fv_emoticons {emoticons, "emoticons.txt"}; 62 std::set<std::string> fv_emoticons;
63
64 std::ifstream fvefile("emoticons.txt");
65 if (fvefile)
66 {
67 std::string line;
68 while (getline(fvefile, line))
69 {
70 fv_emoticons.insert(line);
71 emoticons.forms.add(line);
72 }
73 }
74
75 fvefile.close();
76
62 std::map<std::string, std::string> canonical_form; 77 std::map<std::string, std::string> canonical_form;
63 78
64 AspellConfig* spell_config = new_aspell_config(); 79 AspellConfig* spell_config = new_aspell_config();
@@ -92,216 +107,229 @@ kgramstats::kgramstats(std::string corpus, int maxK)
92 } 107 }
93 108
94 std::cout << "Tokenizing corpus... 0%" << std::flush; 109 std::cout << "Tokenizing corpus... 0%" << std::flush;
95 int len = corpus.length(); 110 int len = 0;
111 for (auto c : _corpora)
112 {
113 len += c.length();
114 }
115
116 int startper = 0;
96 int per = 0; 117 int per = 0;
97 int perprime = 0; 118 int perprime = 0;
98 std::cout.fill(' '); 119 std::cout.fill(' ');
99 while (end != std::string::npos) 120 for (int i = 0; i < _corpora.size(); i++)
100 { 121 {
101 perprime = end * 100 / len; 122 int end = 0;
102 if (perprime != per) 123
124 while (end != std::string::npos)
103 { 125 {
104 per = perprime; 126 perprime = (startper + end) * 100 / len;
127 if (perprime != per)
128 {
129 per = perprime;
105 130
106 std::cout << "\b\b\b\b" << std::right; 131 std::cout << "\b\b\b\b" << std::right;
107 std::cout.width(3); 132 std::cout.width(3);
108 std::cout << per << "%" << std::flush; 133 std::cout << per << "%" << std::flush;
109 } 134 }
110 135
111 end = corpus.find(" ", start); 136 end = _corpora[i].find(" ", start);
112 137
113 bool emoji = false; 138 bool emoji = false;
114 std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 139 std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start);
115 std::string t = ""; 140 std::string t = "";
116 141
117 if (te.compare("") && te.compare(".")) 142 if (te.compare("") && te.compare("."))
118 {
119 // Extract strings of emojis into their own tokens even if they're not space delimited
120 int m = emojis.match(te);
121 emoji = m > 0;
122 if (m == 0) m = 1;
123 t = te.substr(0,m);
124 te = te.substr(m);
125
126 while (!te.empty())
127 { 143 {
128 m = emojis.match(te); 144 // Extract strings of emojis into their own tokens even if they're not space delimited
129 if (emoji == (m > 0)) 145 int m = emojis.match(te);
146 emoji = m > 0;
147 if (m == 0) m = 1;
148 t = te.substr(0,m);
149 te = te.substr(m);
150
151 while (!te.empty())
130 { 152 {
131 if (m == 0) m = 1; 153 m = emojis.match(te);
132 t += te.substr(0,m); 154 if (emoji == (m > 0))
133 te = te.substr(m); 155 {
134 } else { 156 if (m == 0) m = 1;
135 end = start + t.length() - 1; 157 t += te.substr(0,m);
136 break; 158 te = te.substr(m);
159 } else {
160 end = start + t.length() - 1;
161 break;
162 }
137 } 163 }
138 }
139 164
140 std::string tc(t); 165 std::string tc(t);
141 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); 166 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
142 167
143 int pst = tc.find_first_not_of("\"([*"); 168 int pst = tc.find_first_not_of("\"([*");
144 int dst = tc.find_last_not_of("\")]*.,?!\n"); 169 int dst = tc.find_last_not_of("\")]*.,?!\n");
145 std::string canonical(""); 170 std::string canonical("");
146 if ((pst != std::string::npos) && (dst != std::string::npos)) 171 if ((pst != std::string::npos) && (dst != std::string::npos))
147 {
148 canonical = std::string(tc, pst, dst - pst + 1);
149 }
150
151 word& w = ([&] () -> word& {
152 // Hashtag freevar
153 if (canonical[0] == '#')
154 { 172 {
155 thashtags.insert(canonical); 173 canonical = std::string(tc, pst, dst - pst + 1);
156
157 return hashtags;
158 } 174 }
159 175
160 // Emoticon freevar 176 word& w = ([&] () -> word& {
161 if (emoji) 177 // Hashtag freevar
162 { 178 if (canonical[0] == '#')
163 emoticons.forms.add(canonical); 179 {
180 thashtags.insert(canonical);
164 181
165 return emoticons; 182 return hashtags;
166 } 183 }
167 184
168 if ((pst != std::string::npos) && (dst != std::string::npos)) 185 // Emoticon freevar
169 { 186 if (emoji)
170 std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
171 if (fv_emoticons.check(emoticon_canon))
172 { 187 {
173 emoticons.forms.add(emoticon_canon); 188 emoticons.forms.add(canonical);
174 189
175 return emoticons; 190 return emoticons;
176 } 191 }
177 }
178 192
179 // Basically any other word 193 if ((pst != std::string::npos) && (dst != std::string::npos))
180 if (canonical_form.count(canonical) == 0)
181 {
182 if (
183 // Legacy freevars should be distinct from tokens containing similar words
184 (canonical.find("$name$") != std::string::npos)
185 // Words with no letters will be mangled by the spell checker
186 || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
187 )
188 { 194 {
189 canonical_form[canonical] = canonical; 195 std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
190 words.emplace(canonical, canonical); 196 if (fv_emoticons.count(emoticon_canon) == 1)
191 } else { 197 {
192 int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size()); 198 emoticons.forms.add(emoticon_canon);
193 if (correct) 199
200 return emoticons;
201 }
202 }
203
204 // Basically any other word
205 if (canonical_form.count(canonical) == 0)
206 {
207 if (
208 // Legacy freevars should be distinct from tokens containing similar words
209 (canonical.find("$name$") != std::string::npos)
210 // Words with no letters will be mangled by the spell checker
211 || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
212 )
194 { 213 {
195 words.emplace(canonical, canonical);
196 canonical_form[canonical] = canonical; 214 canonical_form[canonical] = canonical;
215 words.emplace(canonical, canonical);
197 } else { 216 } else {
198 const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size()); 217 int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
199 AspellStringEnumeration* elements = aspell_word_list_elements(suggestions); 218 if (correct)
200 const char* replacement = aspell_string_enumeration_next(elements);
201 if (replacement != NULL)
202 { 219 {
203 std::string sugrep(replacement);
204 canonical_form[canonical] = sugrep;
205
206 if (words.count(sugrep) == 0)
207 {
208 words.emplace(sugrep, sugrep);
209 }
210 } else {
211 words.emplace(canonical, canonical); 220 words.emplace(canonical, canonical);
212 canonical_form[canonical] = canonical; 221 canonical_form[canonical] = canonical;
213 } 222 } else {
223 const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
224 AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
225 const char* replacement = aspell_string_enumeration_next(elements);
226 if (replacement != NULL)
227 {
228 std::string sugrep(replacement);
229 canonical_form[canonical] = sugrep;
214 230
215 delete_aspell_string_enumeration(elements); 231 if (words.count(sugrep) == 0)
232 {
233 words.emplace(sugrep, sugrep);
234 }
235 } else {
236 words.emplace(canonical, canonical);
237 canonical_form[canonical] = canonical;
238 }
239
240 delete_aspell_string_enumeration(elements);
241 }
216 } 242 }
217 } 243 }
218 }
219 244
220 word& tw = words.at(canonical_form.at(canonical)); 245 word& tw = words.at(canonical_form.at(canonical));
221 tw.forms.add(canonical); 246 tw.forms.add(canonical);
222 247
223 return tw; 248 return tw;
224 })(); 249 })();
225 250
226 token tk(w); 251 token tk(w);
227 tk.raw = t; 252 tk.raw = t;
228 253
229 for (char c : t) 254 for (char c : t)
230 {
231 if (c == '*')
232 { 255 {
233 tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; 256 if (c == '*')
234 } else if (c == '[') 257 {
235 { 258 tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
236 tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; 259 } else if (c == '[')
237 } else if (c == '(') 260 {
238 { 261 tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
239 tk.delimiters[{parentype::paren, doublestatus::opening}]++; 262 } else if (c == '(')
240 } else if (c == '"') 263 {
241 { 264 tk.delimiters[{parentype::paren, doublestatus::opening}]++;
242 tk.delimiters[{parentype::quote, doublestatus::opening}]++; 265 } else if (c == '"')
243 } else { 266 {
244 break; 267 tk.delimiters[{parentype::quote, doublestatus::opening}]++;
268 } else {
269 break;
270 }
245 } 271 }
246 }
247 272
248 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; 273 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
249 if (backtrack != t.length()) 274 if (backtrack != t.length())
250 {
251 std::string ending = t.substr(backtrack);
252 std::string suffix;
253
254 for (char c : ending)
255 { 275 {
256 if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) 276 std::string ending = t.substr(backtrack);
277 std::string suffix;
278
279 for (char c : ending)
257 { 280 {
258 suffix += c; 281 if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
282 {
283 suffix += c;
259 284
260 continue; 285 continue;
261 } else if (c == '\n') 286 } else if (c == '\n')
262 {
263 // At least the end is coming
264 if (suffix.empty())
265 { 287 {
266 suffix = "."; 288 // At least the end is coming
267 } 289 if (suffix.empty())
290 {
291 suffix = ".";
292 }
268 293
269 break; 294 break;
270 } 295 }
296
297 parentype pt = ([&] {
298 switch (c)
299 {
300 case ']': return parentype::square_bracket;
301 case ')': return parentype::paren;
302 case '*': return parentype::asterisk;
303 case '"': return parentype::quote;
304 }
305 })();
271 306
272 parentype pt = ([&] { 307 if (tk.delimiters[{pt, doublestatus::opening}] > 0)
273 switch (c)
274 { 308 {
275 case ']': return parentype::square_bracket; 309 tk.delimiters[{pt, doublestatus::opening}]--;
276 case ')': return parentype::paren; 310 tk.delimiters[{pt, doublestatus::both}]++;
277 case '*': return parentype::asterisk; 311 } else {
278 case '"': return parentype::quote; 312 tk.delimiters[{pt, doublestatus::closing}]++;
279 } 313 }
280 })();
281
282 if (tk.delimiters[{pt, doublestatus::opening}] > 0)
283 {
284 tk.delimiters[{pt, doublestatus::opening}]--;
285 tk.delimiters[{pt, doublestatus::both}]++;
286 } else {
287 tk.delimiters[{pt, doublestatus::closing}]++;
288 } 314 }
289 }
290 315
291 if (suffix == ",") 316 if (suffix == ",")
292 { 317 {
293 tk.suffix = suffixtype::comma; 318 tk.suffix = suffixtype::comma;
294 } else if (!suffix.empty()) { 319 } else if (!suffix.empty()) {
295 tk.suffix = suffixtype::terminating; 320 tk.suffix = suffixtype::terminating;
296 321
297 w.terms.add(suffix); 322 w.terms.add(suffix);
323 }
298 } 324 }
299 }
300 325
301 tokens.push_back(tk); 326 tokens.push_back(tk);
302 } 327 }
303 328
304 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 329 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
330 }
331
332 startper += _corpora[i].length();
305 } 333 }
306 334
307 std::cout << "\b\b\b\b100%" << std::endl; 335 std::cout << "\b\b\b\b100%" << std::endl;
@@ -420,7 +448,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
420 448
421 kgram klist = it.first; 449 kgram klist = it.first;
422 auto& probtable = it.second; 450 auto& probtable = it.second;
423 auto& distribution = stats[klist]; 451 auto& distribution = _stats[klist];
424 int max = 0; 452 int max = 0;
425 453
426 for (auto& kt : probtable) 454 for (auto& kt : probtable)
@@ -432,33 +460,61 @@ kgramstats::kgramstats(std::string corpus, int maxK)
432 } 460 }
433 461
434 std::cout << "\b\b\b\b100%" << std::endl; 462 std::cout << "\b\b\b\b100%" << std::endl;
463
464 _compiled = true;
435} 465}
436 466
437void printKgram(kgram k) 467std::ostream& operator<<(std::ostream& os, rawr::kgram k)
438{ 468{
439 for (auto& q : k) 469 for (auto& q : k)
440 { 470 {
441 if (q.type == querytype::sentence) 471 os << q << " ";
442 {
443 std::cout << "#.# ";
444 } else if (q.type == querytype::literal)
445 {
446 if (q.tok.suffix == suffixtype::terminating)
447 {
448 std::cout << q.tok.w.canon << ". ";
449 } else if (q.tok.suffix == suffixtype::comma)
450 {
451 std::cout << q.tok.w.canon << ", ";
452 } else {
453 std::cout << q.tok.w.canon << " ";
454 }
455 }
456 } 472 }
473
474 return os;
475}
476
477std::ostream& operator<<(std::ostream& os, rawr::query q)
478{
479 if (q.type == rawr::querytype::sentence)
480 {
481 return os << "#.#";
482 } else if (q.type == rawr::querytype::literal)
483 {
484 return os << q.tok;
485 }
486
487 return os;
488}
489
490std::ostream& operator<<(std::ostream& os, rawr::token t)
491{
492 os << t.w.canon;
493
494 if (t.suffix == rawr::suffixtype::terminating)
495 {
496 return os << ".";
497 } else if (t.suffix == rawr::suffixtype::comma)
498 {
499 return os << ",";
500 } else {
501 return os;
502 }
503}
504
505void rawr::setTransformCallback(transform_callback _arg)
506{
507 _transform = _arg;
457} 508}
458 509
459// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 510// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
460std::string kgramstats::randomSentence(int maxL) 511std::string rawr::randomSentence(int maxL)
461{ 512{
513 if (!_compiled)
514 {
515 return "";
516 }
517
462 std::string result; 518 std::string result;
463 kgram cur(1, wildcardQuery); 519 kgram cur(1, wildcardQuery);
464 int cuts = 0; 520 int cuts = 0;
@@ -466,14 +522,14 @@ std::string kgramstats::randomSentence(int maxL)
466 522
467 for (;;) 523 for (;;)
468 { 524 {
469 if (cur.size() == maxK) 525 if (cur.size() == _maxK)
470 { 526 {
471 cur.pop_front(); 527 cur.pop_front();
472 } 528 }
473 529
474 if (cur.size() > 0) 530 if (cur.size() > 0)
475 { 531 {
476 if (rand() % (maxK - cur.size() + 1) == 0) 532 if (rand() % (_maxK - cur.size() + 1) == 0)
477 { 533 {
478 while ((cur.size() > 2) && (cuts > 0)) 534 while ((cur.size() > 2) && (cuts > 0))
479 { 535 {
@@ -490,16 +546,22 @@ std::string kgramstats::randomSentence(int maxL)
490 546
491 // Gotta circumvent the last line of the input corpus 547 // Gotta circumvent the last line of the input corpus
492 // https://twitter.com/starla4444/status/684222271339237376 548 // https://twitter.com/starla4444/status/684222271339237376
493 if (stats.count(cur) == 0) 549 if (_stats.count(cur) == 0)
494 { 550 {
495 cur = kgram(1, wildcardQuery); 551 cur = kgram(1, wildcardQuery);
496 } 552 }
497 553
498 auto& distribution = stats[cur]; 554 auto& distribution = _stats[cur];
499 int max = distribution.rbegin()->first; 555 int max = distribution.rbegin()->first;
500 int r = rand() % max; 556 int r = rand() % max;
501 token_data& next = distribution.upper_bound(r)->second; 557 token_data& next = distribution.upper_bound(r)->second;
502 std::string nextToken = next.tok.w.forms.next(); 558 std::string nextToken = next.tok.w.forms.next();
559
560 // Apply user-specified transforms
561 if (_transform)
562 {
563 nextToken = _transform(next.tok.w.canon, nextToken);
564 }
503 565
504 // Determine the casing of the next token. We randomly make the token all 566 // Determine the casing of the next token. We randomly make the token all
505 // caps based on the markov chain. Otherwise, we check if the previous 567 // caps based on the markov chain. Otherwise, we check if the previous
@@ -600,8 +662,7 @@ std::string kgramstats::randomSentence(int maxL)
600 } 662 }
601 663
602 /* DEBUG */ 664 /* DEBUG */
603 printKgram(cur); 665 std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
604 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
605 666
606 cur.push_back(next.tok); 667 cur.push_back(next.tok);
607 668
@@ -633,29 +694,7 @@ std::string kgramstats::randomSentence(int maxL)
633 open_delimiters.pop(); 694 open_delimiters.pop();
634 } 695 }
635 696
636 // Replace old-style freevars while I can't be bothered to remake the corpus yet 697 result.resize(maxL);
637 std::vector<std::string> fv_names;
638 std::ifstream namefile("names.txt");
639 if (namefile.is_open())
640 {
641 while (!namefile.eof())
642 {
643 std::string l;
644 getline(namefile, l);
645 if (l.back() == '\r')
646 {
647 l.pop_back();
648 }
649
650 fv_names.push_back(l);
651 }
652
653 int cpos;
654 while ((cpos = result.find("$name$")) != std::string::npos)
655 {
656 result.replace(cpos, 6, fv_names[rand() % fv_names.size()]);
657 }
658 }
659 698
660 return result; 699 return result;
661} 700}