about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-29 21:01:07 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-29 21:01:07 -0400
commit accbd7647de118cca7503a1bf0992529a0a76df8 (patch)
tree 0ba1cb8105a21472d0b3beacd0aca699ff9c7ad1
parent 4d217ac6122120d9e86248432594864e114e3a46 (diff)
download rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.tar.gz
rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.tar.bz2
rawr-ebooks-accbd7647de118cca7503a1bf0992529a0a76df8.zip
Newlines, colons, and semicolons are now valid terminators
-rw-r--r-- ebooks.cpp 2
-rw-r--r-- gen.cpp 2
-rw-r--r-- histogram.cpp 2
-rw-r--r-- kgramstats.cpp 77
-rw-r--r-- kgramstats.h 16
5 files changed, 72 insertions, 27 deletions
diff --git a/ebooks.cpp b/ebooks.cpp index 52247da..2e00f25 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -37,7 +37,7 @@ int main(int argc, char** args)
37 line.pop_back(); 37 line.pop_back();
38 } 38 }
39 39
40 corpus += line + "\n "; 40 corpus += line + "\n";
41 } 41 }
42 42
43 // Replace old-style freevars while I can't be bothered to remake the corpus yet 43 // Replace old-style freevars while I can't be bothered to remake the corpus yet
diff --git a/gen.cpp b/gen.cpp index 5e2d9db..d802b14 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -42,7 +42,7 @@ int main(int argc, char** args)
42 line.pop_back(); 42 line.pop_back();
43 } 43 }
44 44
45 corpus += line + "\n "; 45 corpus += line + "\n";
46 } 46 }
47 47
48 // Replace old-style freevars while I can't be bothered to remake the corpus yet 48 // Replace old-style freevars while I can't be bothered to remake the corpus yet
diff --git a/histogram.cpp b/histogram.cpp index 6d31cf4..77c5c3e 100644 --- a/histogram.cpp +++ b/histogram.cpp
@@ -1,4 +1,5 @@
1#include "histogram.h" 1#include "histogram.h"
2#include "kgramstats.h"
2#include <cstdlib> 3#include <cstdlib>
3#include <iostream> 4#include <iostream>
4 5
@@ -42,3 +43,4 @@ void histogram<T>::print() const
42} 43}
43 44
44template class histogram <std::string>; 45template class histogram <std::string>;
46template class histogram <rawr::terminator>;
diff --git a/kgramstats.cpp b/kgramstats.cpp index 47f3bc0..e0c2eac 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -133,14 +133,19 @@ void rawr::compile(int maxK)
133 std::cout << per << "%" << std::flush; 133 std::cout << per << "%" << std::flush;
134 } 134 }
135 135
136 end = _corpora[i].find(" ", start); 136 end = _corpora[i].find_first_of(" \n", start);
137 137
138 bool emoji = false; 138 bool emoji = false;
139 std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start); 139 std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
140 std::string t = ""; 140 std::string t = "";
141 141
142 if (te.compare("") && te.compare(".")) 142 if (te.compare("") && te.compare(".") && te.compare(" "))
143 { 143 {
144 if (te.back() == ' ')
145 {
146 te.pop_back();
147 }
148
144 // Extract strings of emojis into their own tokens even if they're not space delimited 149 // Extract strings of emojis into their own tokens even if they're not space delimited
145 int m = emojis.match(te); 150 int m = emojis.match(te);
146 emoji = m > 0; 151 emoji = m > 0;
@@ -166,7 +171,7 @@ void rawr::compile(int maxK)
166 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); 171 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
167 172
168 int pst = tc.find_first_not_of("\"([*"); 173 int pst = tc.find_first_not_of("\"([*");
169 int dst = tc.find_last_not_of("\")]*.,?!\n"); 174 int dst = tc.find_last_not_of("\")]*.,?!\n;:");
170 std::string canonical(""); 175 std::string canonical("");
171 if ((pst != std::string::npos) && (dst != std::string::npos)) 176 if ((pst != std::string::npos) && (dst != std::string::npos))
172 { 177 {
@@ -270,28 +275,28 @@ void rawr::compile(int maxK)
270 } 275 }
271 } 276 }
272 277
273 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; 278 int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
274 if (backtrack != t.length()) 279 if (backtrack != t.length())
275 { 280 {
276 std::string ending = t.substr(backtrack); 281 std::string ending = t.substr(backtrack);
277 std::string suffix; 282 std::string suffix;
283 bool newline = false;
284 bool terminating = false;
278 285
279 for (char c : ending) 286 for (char c : ending)
280 { 287 {
281 if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) 288 if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
282 { 289 {
283 suffix += c; 290 suffix += c;
291 terminating = true;
284 292
285 continue; 293 continue;
286 } else if (c == '\n') 294 } else if (c == '\n')
287 { 295 {
288 // At least the end is coming 296 newline = true;
289 if (suffix.empty()) 297 terminating = true;
290 { 298
291 suffix = "."; 299 continue;
292 }
293
294 break;
295 } 300 }
296 301
297 parentype pt = ([&] { 302 parentype pt = ([&] {
@@ -313,13 +318,16 @@ void rawr::compile(int maxK)
313 } 318 }
314 } 319 }
315 320
316 if (suffix == ",") 321 if (terminating)
317 { 322 {
318 tk.suffix = suffixtype::comma; 323 if ((suffix == ",") && (!newline))
319 } else if (!suffix.empty()) { 324 {
320 tk.suffix = suffixtype::terminating; 325 tk.suffix = suffixtype::comma;
321 326 } else {
322 w.terms.add(suffix); 327 tk.suffix = suffixtype::terminating;
328
329 w.terms.add({suffix, newline});
330 }
323 } 331 }
324 } 332 }
325 333
@@ -502,6 +510,18 @@ std::ostream& operator<<(std::ostream& os, rawr::token t)
502 } 510 }
503} 511}
504 512
513std::ostream& operator<<(std::ostream& os, rawr::terminator t)
514{
515 os << t.form;
516
517 if (t.newline)
518 {
519 os << "↵";
520 }
521
522 return os;
523}
524
505void rawr::setTransformCallback(transform_callback _arg) 525void rawr::setTransformCallback(transform_callback _arg)
506{ 526{
507 _transform = _arg; 527 _transform = _arg;
@@ -649,10 +669,20 @@ std::string rawr::randomSentence(int maxL)
649 // Terminators 669 // Terminators
650 if (next.tok.suffix == suffixtype::terminating) 670 if (next.tok.suffix == suffixtype::terminating)
651 { 671 {
652 nextToken.append(next.tok.w.terms.next()); 672 auto term = next.tok.w.terms.next();
673 nextToken.append(term.form);
674
675 if (term.newline)
676 {
677 nextToken.append("\n");
678 } else {
679 nextToken.append(" ");
680 }
653 } else if (next.tok.suffix == suffixtype::comma) 681 } else if (next.tok.suffix == suffixtype::comma)
654 { 682 {
655 nextToken.append(","); 683 nextToken.append(", ");
684 } else {
685 nextToken.append(" ");
656 } 686 }
657 687
658 // If this pick was guaranteed, increase cut chance 688 // If this pick was guaranteed, increase cut chance
@@ -665,9 +695,8 @@ std::string rawr::randomSentence(int maxL)
665 std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; 695 std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
666 696
667 cur.push_back(next.tok); 697 cur.push_back(next.tok);
668 698 result.append(nextToken);
669 result.append(nextToken + " "); 699
670
671 if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) 700 if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
672 { 701 {
673 break; 702 break;
diff --git a/kgramstats.h b/kgramstats.h index ee75ada..fc01101 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -19,10 +19,22 @@ class rawr {
19 std::string randomSentence(int maxL); 19 std::string randomSentence(int maxL);
20 20
21 private: 21 private:
22 struct terminator {
23 std::string form;
24 bool newline = false;
25
26 terminator(std::string form, bool newline) : form(form), newline(newline) {}
27
28 bool operator<(const terminator& other) const
29 {
30 return std::tie(form, newline) < std::tie(other.form, other.newline);
31 }
32 };
33
22 struct word { 34 struct word {
23 std::string canon; 35 std::string canon;
24 histogram<std::string> forms; 36 histogram<std::string> forms;
25 histogram<std::string> terms; 37 histogram<terminator> terms;
26 38
27 word(std::string canon) : canon(canon) {} 39 word(std::string canon) : canon(canon) {}
28 40
@@ -68,6 +80,7 @@ class rawr {
68 std::map<delimiter, int> delimiters; 80 std::map<delimiter, int> delimiters;
69 suffixtype suffix; 81 suffixtype suffix;
70 std::string raw; 82 std::string raw;
83 bool newline = false;
71 84
72 token(const word& w) : w(w), suffix(suffixtype::none) {} 85 token(const word& w) : w(w), suffix(suffixtype::none) {}
73 86
@@ -119,6 +132,7 @@ class rawr {
119 friend std::ostream& operator<<(std::ostream& os, kgram k); 132 friend std::ostream& operator<<(std::ostream& os, kgram k);
120 friend std::ostream& operator<<(std::ostream& os, query q); 133 friend std::ostream& operator<<(std::ostream& os, query q);
121 friend std::ostream& operator<<(std::ostream& os, token t); 134 friend std::ostream& operator<<(std::ostream& os, token t);
135 friend std::ostream& operator<<(std::ostream& os, terminator t);
122 136
123 int _maxK; 137 int _maxK;
124 bool _compiled = false; 138 bool _compiled = false;