diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-29 21:01:07 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-29 21:01:07 -0400 |
commit | accbd7647de118cca7503a1bf0992529a0a76df8 (patch) | |
tree | 0ba1cb8105a21472d0b3beacd0aca699ff9c7ad1 | |
parent | 4d217ac6122120d9e86248432594864e114e3a46 (diff) | |
download | rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.tar.gz rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.tar.bz2 rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.zip |
Newlines, colons, and semicolons are now valid terminators
-rw-r--r-- | ebooks.cpp | 2 | ||||
-rw-r--r-- | gen.cpp | 2 | ||||
-rw-r--r-- | histogram.cpp | 2 | ||||
-rw-r--r-- | kgramstats.cpp | 77 | ||||
-rw-r--r-- | kgramstats.h | 16 |
5 files changed, 72 insertions, 27 deletions
diff --git a/ebooks.cpp b/ebooks.cpp index 52247da..2e00f25 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
@@ -37,7 +37,7 @@ int main(int argc, char** args) | |||
37 | line.pop_back(); | 37 | line.pop_back(); |
38 | } | 38 | } |
39 | 39 | ||
40 | corpus += line + "\n "; | 40 | corpus += line + "\n"; |
41 | } | 41 | } |
42 | 42 | ||
43 | // Replace old-style freevars while I can't be bothered to remake the corpus yet | 43 | // Replace old-style freevars while I can't be bothered to remake the corpus yet |
diff --git a/gen.cpp b/gen.cpp index 5e2d9db..d802b14 100644 --- a/gen.cpp +++ b/gen.cpp | |||
@@ -42,7 +42,7 @@ int main(int argc, char** args) | |||
42 | line.pop_back(); | 42 | line.pop_back(); |
43 | } | 43 | } |
44 | 44 | ||
45 | corpus += line + "\n "; | 45 | corpus += line + "\n"; |
46 | } | 46 | } |
47 | 47 | ||
48 | // Replace old-style freevars while I can't be bothered to remake the corpus yet | 48 | // Replace old-style freevars while I can't be bothered to remake the corpus yet |
diff --git a/histogram.cpp b/histogram.cpp index 6d31cf4..77c5c3e 100644 --- a/histogram.cpp +++ b/histogram.cpp | |||
@@ -1,4 +1,5 @@ | |||
1 | #include "histogram.h" | 1 | #include "histogram.h" |
2 | #include "kgramstats.h" | ||
2 | #include <cstdlib> | 3 | #include <cstdlib> |
3 | #include <iostream> | 4 | #include <iostream> |
4 | 5 | ||
@@ -42,3 +43,4 @@ void histogram<T>::print() const | |||
42 | } | 43 | } |
43 | 44 | ||
44 | template class histogram <std::string>; | 45 | template class histogram <std::string>; |
46 | template class histogram <rawr::terminator>; | ||
diff --git a/kgramstats.cpp b/kgramstats.cpp index 47f3bc0..e0c2eac 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -133,14 +133,19 @@ void rawr::compile(int maxK) | |||
133 | std::cout << per << "%" << std::flush; | 133 | std::cout << per << "%" << std::flush; |
134 | } | 134 | } |
135 | 135 | ||
136 | end = _corpora[i].find(" ", start); | 136 | end = _corpora[i].find_first_of(" \n", start); |
137 | 137 | ||
138 | bool emoji = false; | 138 | bool emoji = false; |
139 | std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 139 | std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1); |
140 | std::string t = ""; | 140 | std::string t = ""; |
141 | 141 | ||
142 | if (te.compare("") && te.compare(".")) | 142 | if (te.compare("") && te.compare(".") && te.compare(" ")) |
143 | { | 143 | { |
144 | if (te.back() == ' ') | ||
145 | { | ||
146 | te.pop_back(); | ||
147 | } | ||
148 | |||
144 | // Extract strings of emojis into their own tokens even if they're not space delimited | 149 | // Extract strings of emojis into their own tokens even if they're not space delimited |
145 | int m = emojis.match(te); | 150 | int m = emojis.match(te); |
146 | emoji = m > 0; | 151 | emoji = m > 0; |
@@ -166,7 +171,7 @@ void rawr::compile(int maxK) | |||
166 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 171 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
167 | 172 | ||
168 | int pst = tc.find_first_not_of("\"([*"); | 173 | int pst = tc.find_first_not_of("\"([*"); |
169 | int dst = tc.find_last_not_of("\")]*.,?!\n"); | 174 | int dst = tc.find_last_not_of("\")]*.,?!\n;:"); |
170 | std::string canonical(""); | 175 | std::string canonical(""); |
171 | if ((pst != std::string::npos) && (dst != std::string::npos)) | 176 | if ((pst != std::string::npos) && (dst != std::string::npos)) |
172 | { | 177 | { |
@@ -270,28 +275,28 @@ void rawr::compile(int maxK) | |||
270 | } | 275 | } |
271 | } | 276 | } |
272 | 277 | ||
273 | int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; | 278 | int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1; |
274 | if (backtrack != t.length()) | 279 | if (backtrack != t.length()) |
275 | { | 280 | { |
276 | std::string ending = t.substr(backtrack); | 281 | std::string ending = t.substr(backtrack); |
277 | std::string suffix; | 282 | std::string suffix; |
283 | bool newline = false; | ||
284 | bool terminating = false; | ||
278 | 285 | ||
279 | for (char c : ending) | 286 | for (char c : ending) |
280 | { | 287 | { |
281 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) | 288 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':')) |
282 | { | 289 | { |
283 | suffix += c; | 290 | suffix += c; |
291 | terminating = true; | ||
284 | 292 | ||
285 | continue; | 293 | continue; |
286 | } else if (c == '\n') | 294 | } else if (c == '\n') |
287 | { | 295 | { |
288 | // At least the end is coming | 296 | newline = true; |
289 | if (suffix.empty()) | 297 | terminating = true; |
290 | { | 298 | |
291 | suffix = "."; | 299 | continue; |
292 | } | ||
293 | |||
294 | break; | ||
295 | } | 300 | } |
296 | 301 | ||
297 | parentype pt = ([&] { | 302 | parentype pt = ([&] { |
@@ -313,13 +318,16 @@ void rawr::compile(int maxK) | |||
313 | } | 318 | } |
314 | } | 319 | } |
315 | 320 | ||
316 | if (suffix == ",") | 321 | if (terminating) |
317 | { | 322 | { |
318 | tk.suffix = suffixtype::comma; | 323 | if ((suffix == ",") && (!newline)) |
319 | } else if (!suffix.empty()) { | 324 | { |
320 | tk.suffix = suffixtype::terminating; | 325 | tk.suffix = suffixtype::comma; |
321 | 326 | } else { | |
322 | w.terms.add(suffix); | 327 | tk.suffix = suffixtype::terminating; |
328 | |||
329 | w.terms.add({suffix, newline}); | ||
330 | } | ||
323 | } | 331 | } |
324 | } | 332 | } |
325 | 333 | ||
@@ -502,6 +510,18 @@ std::ostream& operator<<(std::ostream& os, rawr::token t) | |||
502 | } | 510 | } |
503 | } | 511 | } |
504 | 512 | ||
513 | std::ostream& operator<<(std::ostream& os, rawr::terminator t) | ||
514 | { | ||
515 | os << t.form; | ||
516 | |||
517 | if (t.newline) | ||
518 | { | ||
519 | os << "↵"; | ||
520 | } | ||
521 | |||
522 | return os; | ||
523 | } | ||
524 | |||
505 | void rawr::setTransformCallback(transform_callback _arg) | 525 | void rawr::setTransformCallback(transform_callback _arg) |
506 | { | 526 | { |
507 | _transform = _arg; | 527 | _transform = _arg; |
@@ -649,10 +669,20 @@ std::string rawr::randomSentence(int maxL) | |||
649 | // Terminators | 669 | // Terminators |
650 | if (next.tok.suffix == suffixtype::terminating) | 670 | if (next.tok.suffix == suffixtype::terminating) |
651 | { | 671 | { |
652 | nextToken.append(next.tok.w.terms.next()); | 672 | auto term = next.tok.w.terms.next(); |
673 | nextToken.append(term.form); | ||
674 | |||
675 | if (term.newline) | ||
676 | { | ||
677 | nextToken.append("\n"); | ||
678 | } else { | ||
679 | nextToken.append(" "); | ||
680 | } | ||
653 | } else if (next.tok.suffix == suffixtype::comma) | 681 | } else if (next.tok.suffix == suffixtype::comma) |
654 | { | 682 | { |
655 | nextToken.append(","); | 683 | nextToken.append(", "); |
684 | } else { | ||
685 | nextToken.append(" "); | ||
656 | } | 686 | } |
657 | 687 | ||
658 | // If this pick was guaranteed, increase cut chance | 688 | // If this pick was guaranteed, increase cut chance |
@@ -665,9 +695,8 @@ std::string rawr::randomSentence(int maxL) | |||
665 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | 695 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; |
666 | 696 | ||
667 | cur.push_back(next.tok); | 697 | cur.push_back(next.tok); |
668 | 698 | result.append(nextToken); | |
669 | result.append(nextToken + " "); | 699 | |
670 | |||
671 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 700 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) |
672 | { | 701 | { |
673 | break; | 702 | break; |
diff --git a/kgramstats.h b/kgramstats.h index ee75ada..fc01101 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -19,10 +19,22 @@ class rawr { | |||
19 | std::string randomSentence(int maxL); | 19 | std::string randomSentence(int maxL); |
20 | 20 | ||
21 | private: | 21 | private: |
22 | struct terminator { | ||
23 | std::string form; | ||
24 | bool newline = false; | ||
25 | |||
26 | terminator(std::string form, bool newline) : form(form), newline(newline) {} | ||
27 | |||
28 | bool operator<(const terminator& other) const | ||
29 | { | ||
30 | return std::tie(form, newline) < std::tie(other.form, other.newline); | ||
31 | } | ||
32 | }; | ||
33 | |||
22 | struct word { | 34 | struct word { |
23 | std::string canon; | 35 | std::string canon; |
24 | histogram<std::string> forms; | 36 | histogram<std::string> forms; |
25 | histogram<std::string> terms; | 37 | histogram<terminator> terms; |
26 | 38 | ||
27 | word(std::string canon) : canon(canon) {} | 39 | word(std::string canon) : canon(canon) {} |
28 | 40 | ||
@@ -68,6 +80,7 @@ class rawr { | |||
68 | std::map<delimiter, int> delimiters; | 80 | std::map<delimiter, int> delimiters; |
69 | suffixtype suffix; | 81 | suffixtype suffix; |
70 | std::string raw; | 82 | std::string raw; |
83 | bool newline = false; | ||
71 | 84 | ||
72 | token(const word& w) : w(w), suffix(suffixtype::none) {} | 85 | token(const word& w) : w(w), suffix(suffixtype::none) {} |
73 | 86 | ||
@@ -119,6 +132,7 @@ class rawr { | |||
119 | friend std::ostream& operator<<(std::ostream& os, kgram k); | 132 | friend std::ostream& operator<<(std::ostream& os, kgram k); |
120 | friend std::ostream& operator<<(std::ostream& os, query q); | 133 | friend std::ostream& operator<<(std::ostream& os, query q); |
121 | friend std::ostream& operator<<(std::ostream& os, token t); | 134 | friend std::ostream& operator<<(std::ostream& os, token t); |
135 | friend std::ostream& operator<<(std::ostream& os, terminator t); | ||
122 | 136 | ||
123 | int _maxK; | 137 | int _maxK; |
124 | bool _compiled = false; | 138 | bool _compiled = false; |