From accbd7647de118cca7503a1bf0992529a0a76df8 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Sun, 29 May 2016 21:01:07 -0400 Subject: Newlines, colons, and semicolons are now valid terminators --- ebooks.cpp | 2 +- gen.cpp | 2 +- histogram.cpp | 2 ++ kgramstats.cpp | 77 ++++++++++++++++++++++++++++++++++++++++------------------ kgramstats.h | 16 +++++++++++- 5 files changed, 72 insertions(+), 27 deletions(-) diff --git a/ebooks.cpp b/ebooks.cpp index 52247da..2e00f25 100644 --- a/ebooks.cpp +++ b/ebooks.cpp @@ -37,7 +37,7 @@ int main(int argc, char** args) line.pop_back(); } - corpus += line + "\n "; + corpus += line + "\n"; } // Replace old-style freevars while I can't be bothered to remake the corpus yet diff --git a/gen.cpp b/gen.cpp index 5e2d9db..d802b14 100644 --- a/gen.cpp +++ b/gen.cpp @@ -42,7 +42,7 @@ int main(int argc, char** args) line.pop_back(); } - corpus += line + "\n "; + corpus += line + "\n"; } // Replace old-style freevars while I can't be bothered to remake the corpus yet diff --git a/histogram.cpp b/histogram.cpp index 6d31cf4..77c5c3e 100644 --- a/histogram.cpp +++ b/histogram.cpp @@ -1,4 +1,5 @@ #include "histogram.h" +#include "kgramstats.h" #include #include @@ -42,3 +43,4 @@ void histogram::print() const } template class histogram ; +template class histogram ; diff --git a/kgramstats.cpp b/kgramstats.cpp index 47f3bc0..e0c2eac 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -133,14 +133,19 @@ void rawr::compile(int maxK) std::cout << per << "%" << std::flush; } - end = _corpora[i].find(" ", start); + end = _corpora[i].find_first_of(" \n", start); bool emoji = false; - std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start); + std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1); std::string t = ""; - if (te.compare("") && te.compare(".")) + if (te.compare("") && te.compare(".") && te.compare(" ")) { + if (te.back() == ' ') + { + te.pop_back(); + } + // Extract strings of emojis into their own tokens even if they're not space delimited int m = emojis.match(te); emoji = m > 0; @@ -166,7 +171,7 @@ void rawr::compile(int maxK) std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); int pst = tc.find_first_not_of("\"([*"); - int dst = tc.find_last_not_of("\")]*.,?!\n"); + int dst = tc.find_last_not_of("\")]*.,?!\n;:"); std::string canonical(""); if ((pst != std::string::npos) && (dst != std::string::npos)) { @@ -270,28 +275,28 @@ void rawr::compile(int maxK) } } - int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; + int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1; if (backtrack != t.length()) { std::string ending = t.substr(backtrack); std::string suffix; + bool newline = false; + bool terminating = false; for (char c : ending) { - if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) + if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':')) { suffix += c; + terminating = true; continue; } else if (c == '\n') { - // At least the end is coming - if (suffix.empty()) - { - suffix = "."; - } - - break; + newline = true; + terminating = true; + + continue; } parentype pt = ([&] { @@ -313,13 +318,16 @@ void rawr::compile(int maxK) } } - if (suffix == ",") + if (terminating) { - tk.suffix = suffixtype::comma; - } else if (!suffix.empty()) { - tk.suffix = suffixtype::terminating; - - w.terms.add(suffix); + if ((suffix == ",") && (!newline)) + { + tk.suffix = suffixtype::comma; + } else { + tk.suffix = suffixtype::terminating; + + w.terms.add({suffix, newline}); + } } } @@ -502,6 +510,18 @@ std::ostream& operator<<(std::ostream& os, rawr::token t) } } +std::ostream& operator<<(std::ostream& os, rawr::terminator t) +{ + os << t.form; + + if (t.newline) + { + os << "↵"; + } + + return os; +} + void rawr::setTransformCallback(transform_callback _arg) { _transform = _arg; @@ -649,10 +669,20 @@ std::string rawr::randomSentence(int maxL) // Terminators if (next.tok.suffix == suffixtype::terminating) { - nextToken.append(next.tok.w.terms.next()); + auto term = next.tok.w.terms.next(); + nextToken.append(term.form); + + if (term.newline) + { + nextToken.append("\n"); + } else { + nextToken.append(" "); + } } else if (next.tok.suffix == suffixtype::comma) { - nextToken.append(","); + nextToken.append(", "); + } else { + nextToken.append(" "); } // If this pick was guaranteed, increase cut chance @@ -665,9 +695,8 @@ std::string rawr::randomSentence(int maxL) std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; cur.push_back(next.tok); - - result.append(nextToken + " "); - + result.append(nextToken); + if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) { break; diff --git a/kgramstats.h b/kgramstats.h index ee75ada..fc01101 100644 --- a/kgramstats.h +++ b/kgramstats.h @@ -19,10 +19,22 @@ class rawr { std::string randomSentence(int maxL); private: + struct terminator { + std::string form; + bool newline = false; + + terminator(std::string form, bool newline) : form(form), newline(newline) {} + + bool operator<(const terminator& other) const + { + return std::tie(form, newline) < std::tie(other.form, other.newline); + } + }; + struct word { std::string canon; histogram forms; - histogram terms; + histogram terms; word(std::string canon) : canon(canon) {} @@ -68,6 +80,7 @@ class rawr { std::map delimiters; suffixtype suffix; std::string raw; + bool newline = false; token(const word& w) : w(w), suffix(suffixtype::none) {} @@ -119,6 +132,7 @@ class rawr { friend std::ostream& operator<<(std::ostream& os, kgram k); friend std::ostream& operator<<(std::ostream& os, query q); friend std::ostream& operator<<(std::ostream& os, token t); + friend std::ostream& operator<<(std::ostream& os, terminator t); int _maxK; bool _compiled = false; -- cgit 1.4.1