diff options
| -rw-r--r-- | ebooks.cpp | 2 | ||||
| -rw-r--r-- | gen.cpp | 2 | ||||
| -rw-r--r-- | histogram.cpp | 2 | ||||
| -rw-r--r-- | kgramstats.cpp | 77 | ||||
| -rw-r--r-- | kgramstats.h | 16 |
5 files changed, 72 insertions, 27 deletions
| diff --git a/ebooks.cpp b/ebooks.cpp index 1371ffc..c183923 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
| @@ -38,7 +38,7 @@ int main(int argc, char** args) | |||
| 38 | line.pop_back(); | 38 | line.pop_back(); |
| 39 | } | 39 | } |
| 40 | 40 | ||
| 41 | corpus += line + "\n "; | 41 | corpus += line + "\n"; |
| 42 | } | 42 | } |
| 43 | 43 | ||
| 44 | // Replace old-style freevars while I can't be bothered to remake the corpus yet | 44 | // Replace old-style freevars while I can't be bothered to remake the corpus yet |
| diff --git a/gen.cpp b/gen.cpp index 5e2d9db..d802b14 100644 --- a/gen.cpp +++ b/gen.cpp | |||
| @@ -42,7 +42,7 @@ int main(int argc, char** args) | |||
| 42 | line.pop_back(); | 42 | line.pop_back(); |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | corpus += line + "\n "; | 45 | corpus += line + "\n"; |
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | // Replace old-style freevars while I can't be bothered to remake the corpus yet | 48 | // Replace old-style freevars while I can't be bothered to remake the corpus yet |
| diff --git a/histogram.cpp b/histogram.cpp index 6d31cf4..77c5c3e 100644 --- a/histogram.cpp +++ b/histogram.cpp | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | #include "histogram.h" | 1 | #include "histogram.h" |
| 2 | #include "kgramstats.h" | ||
| 2 | #include <cstdlib> | 3 | #include <cstdlib> |
| 3 | #include <iostream> | 4 | #include <iostream> |
| 4 | 5 | ||
| @@ -42,3 +43,4 @@ void histogram<T>::print() const | |||
| 42 | } | 43 | } |
| 43 | 44 | ||
| 44 | template class histogram <std::string>; | 45 | template class histogram <std::string>; |
| 46 | template class histogram <rawr::terminator>; | ||
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 47f3bc0..e0c2eac 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -133,14 +133,19 @@ void rawr::compile(int maxK) | |||
| 133 | std::cout << per << "%" << std::flush; | 133 | std::cout << per << "%" << std::flush; |
| 134 | } | 134 | } |
| 135 | 135 | ||
| 136 | end = _corpora[i].find(" ", start); | 136 | end = _corpora[i].find_first_of(" \n", start); |
| 137 | 137 | ||
| 138 | bool emoji = false; | 138 | bool emoji = false; |
| 139 | std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 139 | std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1); |
| 140 | std::string t = ""; | 140 | std::string t = ""; |
| 141 | 141 | ||
| 142 | if (te.compare("") && te.compare(".")) | 142 | if (te.compare("") && te.compare(".") && te.compare(" ")) |
| 143 | { | 143 | { |
| 144 | if (te.back() == ' ') | ||
| 145 | { | ||
| 146 | te.pop_back(); | ||
| 147 | } | ||
| 148 | |||
| 144 | // Extract strings of emojis into their own tokens even if they're not space delimited | 149 | // Extract strings of emojis into their own tokens even if they're not space delimited |
| 145 | int m = emojis.match(te); | 150 | int m = emojis.match(te); |
| 146 | emoji = m > 0; | 151 | emoji = m > 0; |
| @@ -166,7 +171,7 @@ void rawr::compile(int maxK) | |||
| 166 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 171 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
| 167 | 172 | ||
| 168 | int pst = tc.find_first_not_of("\"([*"); | 173 | int pst = tc.find_first_not_of("\"([*"); |
| 169 | int dst = tc.find_last_not_of("\")]*.,?!\n"); | 174 | int dst = tc.find_last_not_of("\")]*.,?!\n;:"); |
| 170 | std::string canonical(""); | 175 | std::string canonical(""); |
| 171 | if ((pst != std::string::npos) && (dst != std::string::npos)) | 176 | if ((pst != std::string::npos) && (dst != std::string::npos)) |
| 172 | { | 177 | { |
| @@ -270,28 +275,28 @@ void rawr::compile(int maxK) | |||
| 270 | } | 275 | } |
| 271 | } | 276 | } |
| 272 | 277 | ||
| 273 | int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; | 278 | int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1; |
| 274 | if (backtrack != t.length()) | 279 | if (backtrack != t.length()) |
| 275 | { | 280 | { |
| 276 | std::string ending = t.substr(backtrack); | 281 | std::string ending = t.substr(backtrack); |
| 277 | std::string suffix; | 282 | std::string suffix; |
| 283 | bool newline = false; | ||
| 284 | bool terminating = false; | ||
| 278 | 285 | ||
| 279 | for (char c : ending) | 286 | for (char c : ending) |
| 280 | { | 287 | { |
| 281 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) | 288 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':')) |
| 282 | { | 289 | { |
| 283 | suffix += c; | 290 | suffix += c; |
| 291 | terminating = true; | ||
| 284 | 292 | ||
| 285 | continue; | 293 | continue; |
| 286 | } else if (c == '\n') | 294 | } else if (c == '\n') |
| 287 | { | 295 | { |
| 288 | // At least the end is coming | 296 | newline = true; |
| 289 | if (suffix.empty()) | 297 | terminating = true; |
| 290 | { | 298 | |
| 291 | suffix = "."; | 299 | continue; |
| 292 | } | ||
| 293 | |||
| 294 | break; | ||
| 295 | } | 300 | } |
| 296 | 301 | ||
| 297 | parentype pt = ([&] { | 302 | parentype pt = ([&] { |
| @@ -313,13 +318,16 @@ void rawr::compile(int maxK) | |||
| 313 | } | 318 | } |
| 314 | } | 319 | } |
| 315 | 320 | ||
| 316 | if (suffix == ",") | 321 | if (terminating) |
| 317 | { | 322 | { |
| 318 | tk.suffix = suffixtype::comma; | 323 | if ((suffix == ",") && (!newline)) |
| 319 | } else if (!suffix.empty()) { | 324 | { |
| 320 | tk.suffix = suffixtype::terminating; | 325 | tk.suffix = suffixtype::comma; |
| 321 | 326 | } else { | |
| 322 | w.terms.add(suffix); | 327 | tk.suffix = suffixtype::terminating; |
| 328 | |||
| 329 | w.terms.add({suffix, newline}); | ||
| 330 | } | ||
| 323 | } | 331 | } |
| 324 | } | 332 | } |
| 325 | 333 | ||
| @@ -502,6 +510,18 @@ std::ostream& operator<<(std::ostream& os, rawr::token t) | |||
| 502 | } | 510 | } |
| 503 | } | 511 | } |
| 504 | 512 | ||
| 513 | std::ostream& operator<<(std::ostream& os, rawr::terminator t) | ||
| 514 | { | ||
| 515 | os << t.form; | ||
| 516 | |||
| 517 | if (t.newline) | ||
| 518 | { | ||
| 519 | os << "↵"; | ||
| 520 | } | ||
| 521 | |||
| 522 | return os; | ||
| 523 | } | ||
| 524 | |||
| 505 | void rawr::setTransformCallback(transform_callback _arg) | 525 | void rawr::setTransformCallback(transform_callback _arg) |
| 506 | { | 526 | { |
| 507 | _transform = _arg; | 527 | _transform = _arg; |
| @@ -649,10 +669,20 @@ std::string rawr::randomSentence(int maxL) | |||
| 649 | // Terminators | 669 | // Terminators |
| 650 | if (next.tok.suffix == suffixtype::terminating) | 670 | if (next.tok.suffix == suffixtype::terminating) |
| 651 | { | 671 | { |
| 652 | nextToken.append(next.tok.w.terms.next()); | 672 | auto term = next.tok.w.terms.next(); |
| 673 | nextToken.append(term.form); | ||
| 674 | |||
| 675 | if (term.newline) | ||
| 676 | { | ||
| 677 | nextToken.append("\n"); | ||
| 678 | } else { | ||
| 679 | nextToken.append(" "); | ||
| 680 | } | ||
| 653 | } else if (next.tok.suffix == suffixtype::comma) | 681 | } else if (next.tok.suffix == suffixtype::comma) |
| 654 | { | 682 | { |
| 655 | nextToken.append(","); | 683 | nextToken.append(", "); |
| 684 | } else { | ||
| 685 | nextToken.append(" "); | ||
| 656 | } | 686 | } |
| 657 | 687 | ||
| 658 | // If this pick was guaranteed, increase cut chance | 688 | // If this pick was guaranteed, increase cut chance |
| @@ -665,9 +695,8 @@ std::string rawr::randomSentence(int maxL) | |||
| 665 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | 695 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; |
| 666 | 696 | ||
| 667 | cur.push_back(next.tok); | 697 | cur.push_back(next.tok); |
| 668 | 698 | result.append(nextToken); | |
| 669 | result.append(nextToken + " "); | 699 | |
| 670 | |||
| 671 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 700 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) |
| 672 | { | 701 | { |
| 673 | break; | 702 | break; |
| diff --git a/kgramstats.h b/kgramstats.h index ee75ada..fc01101 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -19,10 +19,22 @@ class rawr { | |||
| 19 | std::string randomSentence(int maxL); | 19 | std::string randomSentence(int maxL); |
| 20 | 20 | ||
| 21 | private: | 21 | private: |
| 22 | struct terminator { | ||
| 23 | std::string form; | ||
| 24 | bool newline = false; | ||
| 25 | |||
| 26 | terminator(std::string form, bool newline) : form(form), newline(newline) {} | ||
| 27 | |||
| 28 | bool operator<(const terminator& other) const | ||
| 29 | { | ||
| 30 | return std::tie(form, newline) < std::tie(other.form, other.newline); | ||
| 31 | } | ||
| 32 | }; | ||
| 33 | |||
| 22 | struct word { | 34 | struct word { |
| 23 | std::string canon; | 35 | std::string canon; |
| 24 | histogram<std::string> forms; | 36 | histogram<std::string> forms; |
| 25 | histogram<std::string> terms; | 37 | histogram<terminator> terms; |
| 26 | 38 | ||
| 27 | word(std::string canon) : canon(canon) {} | 39 | word(std::string canon) : canon(canon) {} |
| 28 | 40 | ||
| @@ -68,6 +80,7 @@ class rawr { | |||
| 68 | std::map<delimiter, int> delimiters; | 80 | std::map<delimiter, int> delimiters; |
| 69 | suffixtype suffix; | 81 | suffixtype suffix; |
| 70 | std::string raw; | 82 | std::string raw; |
| 83 | bool newline = false; | ||
| 71 | 84 | ||
| 72 | token(const word& w) : w(w), suffix(suffixtype::none) {} | 85 | token(const word& w) : w(w), suffix(suffixtype::none) {} |
| 73 | 86 | ||
| @@ -119,6 +132,7 @@ class rawr { | |||
| 119 | friend std::ostream& operator<<(std::ostream& os, kgram k); | 132 | friend std::ostream& operator<<(std::ostream& os, kgram k); |
| 120 | friend std::ostream& operator<<(std::ostream& os, query q); | 133 | friend std::ostream& operator<<(std::ostream& os, query q); |
| 121 | friend std::ostream& operator<<(std::ostream& os, token t); | 134 | friend std::ostream& operator<<(std::ostream& os, token t); |
| 135 | friend std::ostream& operator<<(std::ostream& os, terminator t); | ||
| 122 | 136 | ||
| 123 | int _maxK; | 137 | int _maxK; |
| 124 | bool _compiled = false; | 138 | bool _compiled = false; |
