diff options
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- | kgramstats.cpp | 77 |
1 files changed, 53 insertions, 24 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 47f3bc0..e0c2eac 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -133,14 +133,19 @@ void rawr::compile(int maxK) | |||
133 | std::cout << per << "%" << std::flush; | 133 | std::cout << per << "%" << std::flush; |
134 | } | 134 | } |
135 | 135 | ||
136 | end = _corpora[i].find(" ", start); | 136 | end = _corpora[i].find_first_of(" \n", start); |
137 | 137 | ||
138 | bool emoji = false; | 138 | bool emoji = false; |
139 | std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 139 | std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1); |
140 | std::string t = ""; | 140 | std::string t = ""; |
141 | 141 | ||
142 | if (te.compare("") && te.compare(".")) | 142 | if (te.compare("") && te.compare(".") && te.compare(" ")) |
143 | { | 143 | { |
144 | if (te.back() == ' ') | ||
145 | { | ||
146 | te.pop_back(); | ||
147 | } | ||
148 | |||
144 | // Extract strings of emojis into their own tokens even if they're not space delimited | 149 | // Extract strings of emojis into their own tokens even if they're not space delimited |
145 | int m = emojis.match(te); | 150 | int m = emojis.match(te); |
146 | emoji = m > 0; | 151 | emoji = m > 0; |
@@ -166,7 +171,7 @@ void rawr::compile(int maxK) | |||
166 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 171 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
167 | 172 | ||
168 | int pst = tc.find_first_not_of("\"([*"); | 173 | int pst = tc.find_first_not_of("\"([*"); |
169 | int dst = tc.find_last_not_of("\")]*.,?!\n"); | 174 | int dst = tc.find_last_not_of("\")]*.,?!\n;:"); |
170 | std::string canonical(""); | 175 | std::string canonical(""); |
171 | if ((pst != std::string::npos) && (dst != std::string::npos)) | 176 | if ((pst != std::string::npos) && (dst != std::string::npos)) |
172 | { | 177 | { |
@@ -270,28 +275,28 @@ void rawr::compile(int maxK) | |||
270 | } | 275 | } |
271 | } | 276 | } |
272 | 277 | ||
273 | int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; | 278 | int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1; |
274 | if (backtrack != t.length()) | 279 | if (backtrack != t.length()) |
275 | { | 280 | { |
276 | std::string ending = t.substr(backtrack); | 281 | std::string ending = t.substr(backtrack); |
277 | std::string suffix; | 282 | std::string suffix; |
283 | bool newline = false; | ||
284 | bool terminating = false; | ||
278 | 285 | ||
279 | for (char c : ending) | 286 | for (char c : ending) |
280 | { | 287 | { |
281 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) | 288 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':')) |
282 | { | 289 | { |
283 | suffix += c; | 290 | suffix += c; |
291 | terminating = true; | ||
284 | 292 | ||
285 | continue; | 293 | continue; |
286 | } else if (c == '\n') | 294 | } else if (c == '\n') |
287 | { | 295 | { |
288 | // At least the end is coming | 296 | newline = true; |
289 | if (suffix.empty()) | 297 | terminating = true; |
290 | { | 298 | |
291 | suffix = "."; | 299 | continue; |
292 | } | ||
293 | |||
294 | break; | ||
295 | } | 300 | } |
296 | 301 | ||
297 | parentype pt = ([&] { | 302 | parentype pt = ([&] { |
@@ -313,13 +318,16 @@ void rawr::compile(int maxK) | |||
313 | } | 318 | } |
314 | } | 319 | } |
315 | 320 | ||
316 | if (suffix == ",") | 321 | if (terminating) |
317 | { | 322 | { |
318 | tk.suffix = suffixtype::comma; | 323 | if ((suffix == ",") && (!newline)) |
319 | } else if (!suffix.empty()) { | 324 | { |
320 | tk.suffix = suffixtype::terminating; | 325 | tk.suffix = suffixtype::comma; |
321 | 326 | } else { | |
322 | w.terms.add(suffix); | 327 | tk.suffix = suffixtype::terminating; |
328 | |||
329 | w.terms.add({suffix, newline}); | ||
330 | } | ||
323 | } | 331 | } |
324 | } | 332 | } |
325 | 333 | ||
@@ -502,6 +510,18 @@ std::ostream& operator<<(std::ostream& os, rawr::token t) | |||
502 | } | 510 | } |
503 | } | 511 | } |
504 | 512 | ||
513 | std::ostream& operator<<(std::ostream& os, rawr::terminator t) | ||
514 | { | ||
515 | os << t.form; | ||
516 | |||
517 | if (t.newline) | ||
518 | { | ||
519 | os << "↵"; | ||
520 | } | ||
521 | |||
522 | return os; | ||
523 | } | ||
524 | |||
505 | void rawr::setTransformCallback(transform_callback _arg) | 525 | void rawr::setTransformCallback(transform_callback _arg) |
506 | { | 526 | { |
507 | _transform = _arg; | 527 | _transform = _arg; |
@@ -649,10 +669,20 @@ std::string rawr::randomSentence(int maxL) | |||
649 | // Terminators | 669 | // Terminators |
650 | if (next.tok.suffix == suffixtype::terminating) | 670 | if (next.tok.suffix == suffixtype::terminating) |
651 | { | 671 | { |
652 | nextToken.append(next.tok.w.terms.next()); | 672 | auto term = next.tok.w.terms.next(); |
673 | nextToken.append(term.form); | ||
674 | |||
675 | if (term.newline) | ||
676 | { | ||
677 | nextToken.append("\n"); | ||
678 | } else { | ||
679 | nextToken.append(" "); | ||
680 | } | ||
653 | } else if (next.tok.suffix == suffixtype::comma) | 681 | } else if (next.tok.suffix == suffixtype::comma) |
654 | { | 682 | { |
655 | nextToken.append(","); | 683 | nextToken.append(", "); |
684 | } else { | ||
685 | nextToken.append(" "); | ||
656 | } | 686 | } |
657 | 687 | ||
658 | // If this pick was guaranteed, increase cut chance | 688 | // If this pick was guaranteed, increase cut chance |
@@ -665,9 +695,8 @@ std::string rawr::randomSentence(int maxL) | |||
665 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | 695 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; |
666 | 696 | ||
667 | cur.push_back(next.tok); | 697 | cur.push_back(next.tok); |
668 | 698 | result.append(nextToken); | |
669 | result.append(nextToken + " "); | 699 | |
670 | |||
671 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 700 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) |
672 | { | 701 | { |
673 | break; | 702 | break; |