From b316e309559d7176af6cf0bb7dcd6dbaa83c01cd Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Fri, 29 Jan 2016 12:43:00 -0500 Subject: Rewrote how tokens are handled A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags. Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace it. --- CMakeLists.txt | 8 +- ebooks.cpp | 15 +- freevars.cpp | 4 +- gen.cpp | 15 +- histogram.cpp | 34 +++++ histogram.h | 19 +++ kgramstats.cpp | 453 +++++++++++++++++++++++++++++++++++---------------------- kgramstats.h | 124 ++++++++-------- 8 files changed, 406 insertions(+), 266 deletions(-) create mode 100644 histogram.cpp create mode 100644 histogram.h diff --git a/CMakeLists.txt b/CMakeLists.txt index aa63a34..41c4552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,10 +8,14 @@ find_package(curl) if (YamlCpp_FOUND AND CURL_FOUND) add_subdirectory(vendor/twitcurl/libtwitcurl) include_directories(vendor/twitcurl/libtwitcurl) - add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp) + add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp) + set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) + set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) else (YamlCpp_FOUND AND CURL_FOUND) message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") endif (YamlCpp_FOUND AND CURL_FOUND) -add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp) +add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp) +set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) +set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) diff --git a/ebooks.cpp b/ebooks.cpp index 
e38ebab..ed1e080 100644 --- a/ebooks.cpp +++ b/ebooks.cpp @@ -44,20 +44,9 @@ int main(int argc, char** args) std::cout << "Generating..." << std::endl; for (;;) { - std::vector doc = stats->randomSentence(rand() % 45 + 5); - std::string hi; - for (std::vector::iterator it = doc.begin(); it != doc.end(); ++it) - { - hi += vars->parse(*it) + " "; - } - + std::string doc = stats->randomSentence(rand() % 45 + 5); + std::string hi = vars->parse(doc); hi.resize(140); - - size_t lastperiod = hi.find_last_of(".!?,"); - if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) - { - hi = hi.substr(0, lastperiod+1); - } std::string replyMsg; if (twitter.statusUpdate(hi)) diff --git a/freevars.cpp b/freevars.cpp index 8c3eda4..54c5aab 100644 --- a/freevars.cpp +++ b/freevars.cpp @@ -34,8 +34,8 @@ std::string freevars::parse(std::string in) for (std::map* >::iterator it = vars->begin(); it != vars->end(); it++) { std::string tofind = "$" + it->first + "$"; - size_t fpos = res.find(tofind); - if (fpos != std::string::npos) + size_t fpos; + while ((fpos = res.find(tofind)) != std::string::npos) { int r = rand() % it->second->size(); res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); diff --git a/gen.cpp b/gen.cpp index 400c0a5..a0ef8e3 100644 --- a/gen.cpp +++ b/gen.cpp @@ -52,21 +52,10 @@ int main(int argc, char** args) std::cout << "Generating..." 
<< std::endl; for (;;) { - std::vector doc = stats->randomSentence(rand() % 35 + 15); - std::string hi; - for (std::vector::iterator it = doc.begin(); it != doc.end(); ++it) - { - hi += vars->parse(*it) + " "; - } - + std::string doc = stats->randomSentence(rand() % 35 + 15); + std::string hi = vars->parse(doc); hi.resize(140); - size_t lastperiod = hi.find_last_of(".!?,"); - if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) - { - hi = hi.substr(0, lastperiod+1); - } - std::cout << hi << std::endl; getc(stdin); diff --git a/histogram.cpp b/histogram.cpp new file mode 100644 index 0000000..6896146 --- /dev/null +++ b/histogram.cpp @@ -0,0 +1,34 @@ +#include "histogram.h" +#include + +template +void histogram::add(const T& inst) +{ + freqtable[inst]++; +} + +template +void histogram::compile() +{ + distribution.clear(); + + int max = 0; + for (auto& it : freqtable) + { + max += it.second; + distribution.emplace(max, it.first); + } + + freqtable.clear(); +} + +template +const T& histogram::next() const +{ + int max = distribution.rbegin()->first; + int r = rand() % max; + + return distribution.upper_bound(r)->second; +} + +template class histogram ; diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000..5aa2560 --- /dev/null +++ b/histogram.h @@ -0,0 +1,19 @@ +#ifndef HISTOGRAM_H_24094D97 +#define HISTOGRAM_H_24094D97 + +#include +#include + +template +class histogram { + public: + void add(const T& inst); + void compile(); + const T& next() const; + + private: + std::map freqtable; + std::map distribution; +}; + +#endif /* end of include guard: HISTOGRAM_H_24094D97 */ diff --git a/kgramstats.cpp b/kgramstats.cpp index 4bb7f15..0ab0c99 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -37,35 +37,11 @@ #include #include #include -#include "malaprop.h" +#include +#include -query wildcardQuery(querytype_sentence); - -std::string canonize(std::string f); - -token token_from_string(std::string in) -{ - if (in[0] == '#') - { - token 
word(tokentype_hashtag); - - if (in.find_first_of(".?!,") != std::string::npos) - { - word.terminating = true; - } - - return word; - } else { - token word(canonize(in)); - - if (in.find_first_of(".?!,") != std::string::npos) - { - word.terminating = true; - } - - return word; - } -} +query wildcardQuery {querytype::sentence}; +word blank_word {""}; // runs in O(t^2) time where t is the number of tokens in the input corpus // We consider maxK to be fairly constant @@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) { this->maxK = maxK; - std::vector tokens; + std::vector tokens; size_t start = 0; int end = 0; std::set thashtags; @@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK) { end = corpus.find(" ", start); - std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); - if (token[token.length()-1] == '\n') + std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); + if (t.compare("") && t.compare(".")) { - if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) + std::string tc(t), canonical; + std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); + std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { + return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); + }); + + word& w = ([&] () -> word& { + // Hashtag freevar + if (canonical[0] == '#') + { + thashtags.insert(canonical); + canonical = "#hashtag"; + + return hashtags; + } + + // Basically any other word + if (words.count(canonical) == 0) + { + words.emplace(canonical, canonical); + } + + word& tw = words.at(canonical); + tw.forms.add(canonical); + + return tw; + })(); + + token tk(w); + tk.raw = t; + + for (char c : t) { - 
token.insert(token.length()-1, "."); + if (c == '*') + { + tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; + } else if (c == '[') + { + tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; + } else if (c == '(') + { + tk.delimiters[{parentype::paren, doublestatus::opening}]++; + } else if (c == '"') + { + tk.delimiters[{parentype::quote, doublestatus::opening}]++; + } else { + break; + } } - - token.resize(token.length()-1); - } - - if (token.compare("") && token.compare(".")) - { - mstats.addWord(token); - tokens.push_back(token); - if (token[0] == '#') + int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; + if (backtrack != t.length()) { - thashtags.insert(canonize(token)); + std::string ending = t.substr(backtrack); + std::string suffix; + + for (char c : ending) + { + if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) + { + suffix += c; + + continue; + } else if (c == '\n') + { + // At least the end is coming + if (suffix.empty()) + { + suffix = "."; + } + + break; + } + + parentype pt = ([&] { + switch (c) + { + case ']': return parentype::square_bracket; + case ')': return parentype::paren; + case '*': return parentype::asterisk; + case '"': return parentype::quote; + } + })(); + + if (tk.delimiters[{pt, doublestatus::opening}] > 0) + { + tk.delimiters[{pt, doublestatus::opening}]--; + tk.delimiters[{pt, doublestatus::both}]++; + } else { + tk.delimiters[{pt, doublestatus::closing}]++; + } + } + + if (suffix == ",") + { + tk.suffix = suffixtype::comma; + } else if (!suffix.empty()) { + tk.suffix = suffixtype::terminating; + + w.terms.add(suffix); + } } + + tokens.push_back(tk); } start = ((end > (std::string::npos - 1) ) ? 
std::string::npos : end + 1); } - for (std::set::iterator it = thashtags.begin(); it != thashtags.end(); it++) + // Time to condense the distribution stuff for the words + for (auto& it : words) { - hashtags.push_back(*it); + it.second.forms.compile(); + it.second.terms.compile(); } - + + // Hashtag freevar is not frequency distributed + for (auto& it : thashtags) + { + hashtags.forms.add(it); + } + + hashtags.forms.compile(); + hashtags.terms.compile(); + + // kgram distribution std::map > tstats; - std::map > tendings; for (int k=1; k seq(tokens.begin()+i, tokens.begin()+i+k); - kgram prefix; - - for (std::list::iterator it = seq.begin(); it != seq.end(); it++) - { - prefix.push_back(token_from_string(*it)); - } - - std::string f = tokens[i+k]; - std::string canonical = canonize(f); - - token word(token_from_string(canonical)); - if (f.find_first_of(".?!,") != std::string::npos) + kgram prefix(tokens.begin()+i, tokens.begin()+i+k); + token f = tokens[i+k]; + + if (tstats[prefix].count(f) == 0) { - word.terminating = true; - - char terminator = f[f.find_last_of(".?!,")]; - int occurrences = std::count(f.begin(), f.end(), terminator); - - tendings[word][termstats(terminator, occurrences)]++; + tstats[prefix].emplace(f, f); } - token_data& td = tstats[prefix][word]; - td.word = word; + token_data& td = tstats[prefix].at(f); td.all++; - if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) + if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) { td.uppercase++; - } else if (isupper(f[0])) + } else if (isupper(f.raw[0])) { td.titlecase++; } - if (prefix.front().word.terminating) + kgram term_prefix; + bool changed = false; + std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) { + if (q.tok.suffix == suffixtype::terminating) + { + changed = true; + + return wildcardQuery; + } else { + return q; + } + }); + + if (changed) { - prefix.front() = wildcardQuery; + if (tstats[term_prefix].count(f) == 0) + { + 
tstats[term_prefix].emplace(f, f); + } - token_data& td2 = tstats[prefix][word]; - td2.word = word; + token_data& td2 = tstats[term_prefix].at(f); td2.all++; - if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) + if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) { td2.uppercase++; - } else if (isupper(f[0])) + } else if (isupper(f.raw[0])) { td2.titlecase++; } @@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK) } } - for (std::map >::iterator it = tstats.begin(); it != tstats.end(); it++) + // Condense the kgram distribution + for (auto& it : tstats) { - kgram klist = it->first; - std::map& probtable = it->second; - std::map& distribution = stats[klist]; + kgram klist = it.first; + auto& probtable = it.second; + auto& distribution = stats[klist]; int max = 0; - for (std::map::iterator kt = probtable.begin(); kt != probtable.end(); kt++) + for (auto& kt : probtable) { - max += kt->second.all; + max += kt.second.all; - distribution[max] = kt->second; - } - } - - for (std::map >::iterator it = tendings.begin(); it != tendings.end(); it++) - { - token word = it->first; - std::map& probtable = it->second; - std::map& distribution = endings[word]; - int max = 0; - - for (std::map::iterator kt = probtable.begin(); kt != probtable.end(); kt++) - { - max += kt->second; - - distribution[max] = kt->first; + distribution.emplace(max, kt.second); } } } void printKgram(kgram k) { - for (kgram::iterator it = k.begin(); it != k.end(); it++) + for (auto& q : k) { - query& q = *it; - if (q.type == querytype_sentence) + if (q.type == querytype::sentence) { std::cout << "#.# "; - } else if (q.type == querytype_literal) + } else if (q.type == querytype::literal) { - if (q.word.type == tokentype_hashtag) + if (q.tok.suffix == suffixtype::terminating) { - if (q.word.terminating) - { - std::cout << "#hashtag. "; - } else { - std::cout << "#hashtag "; - } - } else if (q.word.type == tokentype_literal) + std::cout << q.tok.w.canon << ". 
"; + } else if (q.tok.suffix == suffixtype::comma) { - if (q.word.terminating) - { - std::cout << q.word.canon << ". "; - } else { - std::cout << q.word.canon << " "; - } + std::cout << q.tok.w.canon << ", "; + } else { + std::cout << q.tok.w.canon << " "; } } } } // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus -std::vector kgramstats::randomSentence(int n) +std::string kgramstats::randomSentence(int n) { - std::vector result; + std::string result; kgram cur(1, wildcardQuery); int cuts = 0; + std::stack open_delimiters; for (int i=0; i kgramstats::randomSentence(int n) cur = kgram(1, wildcardQuery); } - std::map& distribution = stats[cur]; + auto& distribution = stats[cur]; int max = distribution.rbegin()->first; int r = rand() % max; token_data& next = distribution.upper_bound(r)->second; - std::string nextToken; - bool mess = false; - - if (next.word.type == tokentype_literal) + std::string nextToken = next.tok.w.forms.next(); + + // Determine the casing of the next token. We randomly make the token all + // caps based on the markov chain. Otherwise, we check if the previous + // token is the end of a sentence (terminating token or a wildcard query). + int casing = rand() % next.all; + if (casing < next.uppercase) + { + std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); + } else if ((((cur.rbegin()->type == querytype::sentence) + || ((cur.rbegin()->type == querytype::literal) + && (cur.rbegin()->tok.suffix == suffixtype::terminating))) + && (rand() % 2 > 0)) + || (casing - next.uppercase < next.titlecase)) { - nextToken = next.word.canon; + nextToken[0] = toupper(nextToken[0]); + } - mess = (rand() % 100) == 0; - if (mess) + // Delimiters + for (auto& dt : next.tok.delimiters) + { + if (dt.first.status == doublestatus::both) { - nextToken = mstats.alternate(nextToken); - } - - // Determine the casing of the next token. 
We randomly make the token all - // caps based on the markov chain. Otherwise, we check if the previous - // token is the end of a sentence (terminating token or a wildcard query). - int casing = rand() % next.all; - if (casing < next.uppercase) + switch (dt.first.type) + { + case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break; + case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break; + case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break; + case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break; + } + } else if (dt.first.status == doublestatus::opening) { - std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); - } else if ((((cur.rbegin()->type == querytype_sentence) - || ((cur.rbegin()->type == querytype_literal) - && (cur.rbegin()->word.terminating))) - && (rand() % 2 > 0)) - || (casing - next.uppercase < next.titlecase)) + for (int i=0; i& ending = endings[next.word]; - int emax = ending.rbegin()->first; - int er = rand() % emax; - termstats& nextend = ending.upper_bound(er)->second; - - nextToken.append(std::string(nextend.occurrences, nextend.terminator)); + nextToken.append(next.tok.w.terms.next()); + } else if (next.tok.suffix == suffixtype::comma) + { + nextToken.append(","); } /* DEBUG */ printKgram(cur); + std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; + + cur.push_back(next.tok); - std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; + result.append(nextToken + " "); - if (mess) + if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0)) { - std::cout << " mala " << next.word.canon; + break; } - - std::cout << std::endl; - - cur.push_back(next.word); - - result.push_back(nextToken); } - - return result; 
-} - -bool removeIf(char c) -{ - return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/); -} - -std::string canonize(std::string f) -{ - std::string canonical(f); - std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); - std::string result; - std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); + // Remove the trailing space + if (result.back() == ' ') + { + result.pop_back(); + } + + // Close any open delimiters + while (!open_delimiters.empty()) + { + switch (open_delimiters.top()) + { + case parentype::paren: result.append(")"); break; + case parentype::square_bracket: result.append("]"); break; + case parentype::asterisk: result.append("*"); break; + case parentype::quote: result.append("\""); break; + } + + open_delimiters.pop(); + } return result; } diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h @@ -2,61 +2,89 @@ #include #include #include -#include "malaprop.h" +#include "histogram.h" #ifndef KGRAMSTATS_H #define KGRAMSTATS_H -enum tokentype { - tokentype_literal, - tokentype_hashtag +struct word { + std::string canon; + histogram forms; + histogram terms; + + word(std::string canon) : canon(canon) {} + + bool operator<(const word& other) const + { + return canon < other.canon; + } }; -struct token { - tokentype type; - std::string canon; - bool terminating; +extern word blank_word; + +enum class suffixtype { + none, + terminating, + comma +}; + +enum class parentype { + paren, + square_bracket, + asterisk, + quote +}; + +enum class doublestatus { + opening, + closing, + both +}; + +struct delimiter { + parentype type; + doublestatus status; + + delimiter(parentype type, doublestatus status) : type(type), status(status) {} - token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} - token(tokentype type) : type(type), canon(""), 
terminating(false) {} + bool operator<(const delimiter& other) const + { + return std::tie(type, status) < std::tie(other.type, other.status); + } +}; + +struct token { + const word& w; + std::map delimiters; + suffixtype suffix; + std::string raw; + + token(const word& w) : w(w), suffix(suffixtype::none) {} bool operator<(const token& other) const { - if (type != other.type) - { - return type < other.type; - } else if (type == tokentype_literal) - { - if (canon == other.canon) - { - return !terminating && other.terminating; - } else { - return canon < other.canon; - } - } else { - return !terminating && other.terminating; - } + return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); } }; -enum querytype { - querytype_literal, - querytype_sentence +enum class querytype { + literal, + sentence }; struct query { querytype type; - token word; + token tok; - query(token word) : word(word), type(querytype_literal) {} + query(token tok) : tok(tok), type(querytype::literal) {} - query(querytype type) : word(""), type(type) {} + query(querytype type) : tok(blank_word), type(type) {} bool operator<(const query& other) const { if (type == other.type) { - return word < other.word; + return tok < other.tok; } else { return type < other.type; } @@ -65,34 +93,11 @@ struct query { typedef std::list kgram; -struct termstats { - char terminator; - int occurrences; - - termstats() : terminator('.'), occurrences(1) {} - - termstats(char terminator, int occurrences) - { - this->terminator = terminator; - this->occurrences = occurrences; - } - - bool operator<(const termstats& other) const - { - if (terminator == other.terminator) - { - return occurrences < other.occurrences; - } else { - return terminator < other.terminator; - } - } -}; - class kgramstats { public: kgramstats(std::string corpus, int maxK); - std::vector randomSentence(int n); + std::string randomSentence(int n); private: struct token_data @@ -100,16 +105,15 @@ private: int all; int 
titlecase; int uppercase; - token word; + token tok; - token_data() : word(""), all(0), titlecase(0), uppercase(0) {} + token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} }; int maxK; std::map > stats; - malaprop mstats; - std::map > endings; - std::vector hashtags; + word hashtags {"#hashtag"}; + std::map words; }; void printKgram(kgram k); -- cgit 1.4.1