about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-29 21:01:07 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-29 21:01:07 -0400
commit accbd7647de118cca7503a1bf0992529a0a76df8 (patch)
tree 0ba1cb8105a21472d0b3beacd0aca699ff9c7ad1 /kgramstats.cpp
parent 4d217ac6122120d9e86248432594864e114e3a46 (diff)
download rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.tar.gz
rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.tar.bz2
rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.zip
Newlines, colons, and semicolons are now valid terminators
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- kgramstats.cpp 77
1 files changed, 53 insertions, 24 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 47f3bc0..e0c2eac 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -133,14 +133,19 @@ void rawr::compile(int maxK)
133 std::cout << per << "%" << std::flush; 133 std::cout << per << "%" << std::flush;
134 } 134 }
135 135
136 end = _corpora[i].find(" ", start); 136 end = _corpora[i].find_first_of(" \n", start);
137 137
138 bool emoji = false; 138 bool emoji = false;
139 std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start); 139 std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
140 std::string t = ""; 140 std::string t = "";
141 141
142 if (te.compare("") && te.compare(".")) 142 if (te.compare("") && te.compare(".") && te.compare(" "))
143 { 143 {
144 if (te.back() == ' ')
145 {
146 te.pop_back();
147 }
148
144 // Extract strings of emojis into their own tokens even if they're not space delimited 149 // Extract strings of emojis into their own tokens even if they're not space delimited
145 int m = emojis.match(te); 150 int m = emojis.match(te);
146 emoji = m > 0; 151 emoji = m > 0;
@@ -166,7 +171,7 @@ void rawr::compile(int maxK)
166 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); 171 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
167 172
168 int pst = tc.find_first_not_of("\"([*"); 173 int pst = tc.find_first_not_of("\"([*");
169 int dst = tc.find_last_not_of("\")]*.,?!\n"); 174 int dst = tc.find_last_not_of("\")]*.,?!\n;:");
170 std::string canonical(""); 175 std::string canonical("");
171 if ((pst != std::string::npos) && (dst != std::string::npos)) 176 if ((pst != std::string::npos) && (dst != std::string::npos))
172 { 177 {
@@ -270,28 +275,28 @@ void rawr::compile(int maxK)
270 } 275 }
271 } 276 }
272 277
273 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; 278 int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
274 if (backtrack != t.length()) 279 if (backtrack != t.length())
275 { 280 {
276 std::string ending = t.substr(backtrack); 281 std::string ending = t.substr(backtrack);
277 std::string suffix; 282 std::string suffix;
283 bool newline = false;
284 bool terminating = false;
278 285
279 for (char c : ending) 286 for (char c : ending)
280 { 287 {
281 if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) 288 if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
282 { 289 {
283 suffix += c; 290 suffix += c;
291 terminating = true;
284 292
285 continue; 293 continue;
286 } else if (c == '\n') 294 } else if (c == '\n')
287 { 295 {
288 // At least the end is coming 296 newline = true;
289 if (suffix.empty()) 297 terminating = true;
290 { 298
291 suffix = "."; 299 continue;
292 }
293
294 break;
295 } 300 }
296 301
297 parentype pt = ([&] { 302 parentype pt = ([&] {
@@ -313,13 +318,16 @@ void rawr::compile(int maxK)
313 } 318 }
314 } 319 }
315 320
316 if (suffix == ",") 321 if (terminating)
317 { 322 {
318 tk.suffix = suffixtype::comma; 323 if ((suffix == ",") && (!newline))
319 } else if (!suffix.empty()) { 324 {
320 tk.suffix = suffixtype::terminating; 325 tk.suffix = suffixtype::comma;
321 326 } else {
322 w.terms.add(suffix); 327 tk.suffix = suffixtype::terminating;
328
329 w.terms.add({suffix, newline});
330 }
323 } 331 }
324 } 332 }
325 333
@@ -502,6 +510,18 @@ std::ostream& operator<<(std::ostream& os, rawr::token t)
502 } 510 }
503} 511}
504 512
513std::ostream& operator<<(std::ostream& os, rawr::terminator t)
514{
515 os << t.form;
516
517 if (t.newline)
518 {
519 os << "↵";
520 }
521
522 return os;
523}
524
505void rawr::setTransformCallback(transform_callback _arg) 525void rawr::setTransformCallback(transform_callback _arg)
506{ 526{
507 _transform = _arg; 527 _transform = _arg;
@@ -649,10 +669,20 @@ std::string rawr::randomSentence(int maxL)
649 // Terminators 669 // Terminators
650 if (next.tok.suffix == suffixtype::terminating) 670 if (next.tok.suffix == suffixtype::terminating)
651 { 671 {
652 nextToken.append(next.tok.w.terms.next()); 672 auto term = next.tok.w.terms.next();
673 nextToken.append(term.form);
674
675 if (term.newline)
676 {
677 nextToken.append("\n");
678 } else {
679 nextToken.append(" ");
680 }
653 } else if (next.tok.suffix == suffixtype::comma) 681 } else if (next.tok.suffix == suffixtype::comma)
654 { 682 {
655 nextToken.append(","); 683 nextToken.append(", ");
684 } else {
685 nextToken.append(" ");
656 } 686 }
657 687
658 // If this pick was guaranteed, increase cut chance 688 // If this pick was guaranteed, increase cut chance
@@ -665,9 +695,8 @@ std::string rawr::randomSentence(int maxL)
665 std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; 695 std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
666 696
667 cur.push_back(next.tok); 697 cur.push_back(next.tok);
668 698 result.append(nextToken);
669 result.append(nextToken + " "); 699
670
671 if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) 700 if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
672 { 701 {
673 break; 702 break;