From 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 Mon Sep 17 00:00:00 2001
From: Kelly Rauchenberger
Date: Fri, 20 May 2016 23:14:06 -0400
Subject: Pulled the ebooks functionality out into a library

---
 kgramstats.cpp | 465 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 252 insertions(+), 213 deletions(-)

diff --git a/kgramstats.cpp b/kgramstats.cpp
index a44bf2b..47f3bc0 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -33,32 +33,47 @@
 //
 
 #include "kgramstats.h"
-#include
 #include
-#include
 #include
 #include
 #include
 #include
-#include "freevars.h"
-#include
 #include "prefix_search.h"
 #include
+#include
+
+const rawr::query rawr::wildcardQuery = {querytype::sentence};
+const rawr::word rawr::blank_word = {""};
 
-query wildcardQuery {querytype::sentence};
-word blank_word {""};
+void rawr::addCorpus(std::string corpus)
+{
+  _corpora.push_back(corpus);
+}
 
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
-kgramstats::kgramstats(std::string corpus, int maxK)
+void rawr::compile(int maxK)
 {
-  this->maxK = maxK;
+  _maxK = maxK;
 
   std::vector<token> tokens;
   size_t start = 0;
-  int end = 0;
   std::set<std::string> thashtags;
-  freevar fv_emoticons {emoticons, "emoticons.txt"};
+  std::set<std::string> fv_emoticons;
+
+  std::ifstream fvefile("emoticons.txt");
+  if (fvefile)
+  {
+    std::string line;
+    while (getline(fvefile, line))
+    {
+      fv_emoticons.insert(line);
+      emoticons.forms.add(line);
+    }
+  }
+
+  fvefile.close();
+
   std::map<std::string, std::string> canonical_form;
 
   AspellConfig* spell_config = new_aspell_config();
@@ -92,216 +107,229 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   }
 
   std::cout << "Tokenizing corpus... 0%" << std::flush;
-  int len = corpus.length();
+  int len = 0;
+  for (auto c : _corpora)
+  {
+    len += c.length();
+  }
+
+  int startper = 0;
   int per = 0;
   int perprime = 0;
 
   std::cout.fill(' ');
-  while (end != std::string::npos)
+  for (int i = 0; i < _corpora.size(); i++)
   {
-    perprime = end * 100 / len;
-    if (perprime != per)
+    int end = 0;
+
+    while (end != std::string::npos)
     {
-      per = perprime;
+      perprime = (startper + end) * 100 / len;
+      if (perprime != per)
+      {
+        per = perprime;
 
-      std::cout << "\b\b\b\b" << std::right;
-      std::cout.width(3);
-      std::cout << per << "%" << std::flush;
-    }
+        std::cout << "\b\b\b\b" << std::right;
+        std::cout.width(3);
+        std::cout << per << "%" << std::flush;
+      }
 
-    end = corpus.find(" ", start);
+      end = _corpora[i].find(" ", start);
 
-    bool emoji = false;
-    std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    std::string t = "";
+      bool emoji = false;
+      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+      std::string t = "";
 
-    if (te.compare("") && te.compare("."))
-    {
-      // Extract strings of emojis into their own tokens even if they're not space delimited
-      int m = emojis.match(te);
-      emoji = m > 0;
-      if (m == 0) m = 1;
-      t = te.substr(0,m);
-      te = te.substr(m);
-
-      while (!te.empty())
+      if (te.compare("") && te.compare("."))
       {
-        m = emojis.match(te);
-        if (emoji == (m > 0))
+        // Extract strings of emojis into their own tokens even if they're not space delimited
+        int m = emojis.match(te);
+        emoji = m > 0;
+        if (m == 0) m = 1;
+        t = te.substr(0,m);
+        te = te.substr(m);
+
+        while (!te.empty())
         {
-          if (m == 0) m = 1;
-          t += te.substr(0,m);
-          te = te.substr(m);
-        } else {
-          end = start + t.length() - 1;
-          break;
+          m = emojis.match(te);
+          if (emoji == (m > 0))
+          {
+            if (m == 0) m = 1;
+            t += te.substr(0,m);
+            te = te.substr(m);
+          } else {
+            end = start + t.length() - 1;
+            break;
+          }
         }
-      }
 
-      std::string tc(t);
-      std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
+        std::string tc(t);
+        std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
 
-      int pst = tc.find_first_not_of("\"([*");
-      int dst = tc.find_last_not_of("\")]*.,?!\n");
-      std::string canonical("");
-      if ((pst != std::string::npos) && (dst != std::string::npos))
-      {
-        canonical = std::string(tc, pst, dst - pst + 1);
-      }
-
-      word& w = ([&] () -> word& {
-        // Hashtag freevar
-        if (canonical[0] == '#')
+        int pst = tc.find_first_not_of("\"([*");
+        int dst = tc.find_last_not_of("\")]*.,?!\n");
+        std::string canonical("");
+        if ((pst != std::string::npos) && (dst != std::string::npos))
         {
-          thashtags.insert(canonical);
-
-          return hashtags;
+          canonical = std::string(tc, pst, dst - pst + 1);
         }
-
-        // Emoticon freevar
-        if (emoji)
-        {
-          emoticons.forms.add(canonical);
+
+        word& w = ([&] () -> word& {
+          // Hashtag freevar
+          if (canonical[0] == '#')
+          {
+            thashtags.insert(canonical);
 
-          return emoticons;
-        }
+            return hashtags;
+          }
 
-        if ((pst != std::string::npos) && (dst != std::string::npos))
-        {
-          std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
-          if (fv_emoticons.check(emoticon_canon))
+          // Emoticon freevar
+          if (emoji)
           {
-            emoticons.forms.add(emoticon_canon);
+            emoticons.forms.add(canonical);
 
             return emoticons;
           }
-        }
 
-        // Basically any other word
-        if (canonical_form.count(canonical) == 0)
-        {
-          if (
-            // Legacy freevars should be distinct from tokens containing similar words
-            (canonical.find("$name$") != std::string::npos)
-            // Words with no letters will be mangled by the spell checker
-            || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
-            )
+          if ((pst != std::string::npos) && (dst != std::string::npos))
           {
-            canonical_form[canonical] = canonical;
-            words.emplace(canonical, canonical);
-          } else {
-            int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
-            if (correct)
+            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
+            if (fv_emoticons.count(emoticon_canon) == 1)
+            {
+              emoticons.forms.add(emoticon_canon);
+
+              return emoticons;
+            }
+          }
+
+          // Basically any other word
+          if (canonical_form.count(canonical) == 0)
+          {
+            if (
+              // Legacy freevars should be distinct from tokens containing similar words
+              (canonical.find("$name$") != std::string::npos)
+              // Words with no letters will be mangled by the spell checker
+              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
+              )
            {
-              words.emplace(canonical, canonical);
               canonical_form[canonical] = canonical;
+              words.emplace(canonical, canonical);
            } else {
-              const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
-              AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
-              const char* replacement = aspell_string_enumeration_next(elements);
-              if (replacement != NULL)
+              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
+              if (correct)
              {
-                std::string sugrep(replacement);
-                canonical_form[canonical] = sugrep;
-
-                if (words.count(sugrep) == 0)
-                {
-                  words.emplace(sugrep, sugrep);
-                }
-              } else {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
-              }
+              } else {
+                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
+                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
+                const char* replacement = aspell_string_enumeration_next(elements);
+                if (replacement != NULL)
+                {
+                  std::string sugrep(replacement);
+                  canonical_form[canonical] = sugrep;
 
-              delete_aspell_string_enumeration(elements);
+                  if (words.count(sugrep) == 0)
+                  {
+                    words.emplace(sugrep, sugrep);
+                  }
+                } else {
+                  words.emplace(canonical, canonical);
+                  canonical_form[canonical] = canonical;
+                }
+
+                delete_aspell_string_enumeration(elements);
+              }
            }
          }
-        }
 
-        word& tw = words.at(canonical_form.at(canonical));
-        tw.forms.add(canonical);
+          word& tw = words.at(canonical_form.at(canonical));
+          tw.forms.add(canonical);
 
-        return tw;
-      })();
+          return tw;
+        })();
 
-      token tk(w);
-      tk.raw = t;
+        token tk(w);
+        tk.raw = t;
 
-      for (char c : t)
-      {
-        if (c == '*')
+        for (char c : t)
        {
-          tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
-        } else if (c == '[')
-        {
-          tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
-        } else if (c == '(')
-        {
-          tk.delimiters[{parentype::paren, doublestatus::opening}]++;
-        } else if (c == '"')
-        {
-          tk.delimiters[{parentype::quote, doublestatus::opening}]++;
-        } else {
-          break;
+          if (c == '*')
+          {
+            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
+          } else if (c == '[')
+          {
+            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
+          } else if (c == '(')
+          {
+            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
+          } else if (c == '"')
+          {
+            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+          } else {
+            break;
+          }
        }
-      }
 
-      int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
-      if (backtrack != t.length())
-      {
-        std::string ending = t.substr(backtrack);
-        std::string suffix;
-
-        for (char c : ending)
+        int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
+        if (backtrack != t.length())
        {
-          if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+          std::string ending = t.substr(backtrack);
+          std::string suffix;
+
+          for (char c : ending)
          {
-            suffix += c;
+            if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+            {
+              suffix += c;
 
-            continue;
-          } else if (c == '\n')
-          {
-            // At least the end is coming
-            if (suffix.empty())
+              continue;
+            } else if (c == '\n')
            {
-              suffix = ".";
-            }
+              // At least the end is coming
+              if (suffix.empty())
+              {
+                suffix = ".";
+              }
 
-            break;
-          }
+              break;
+            }
+
+            parentype pt = ([&] {
+              switch (c)
+              {
+                case ']': return parentype::square_bracket;
+                case ')': return parentype::paren;
+                case '*': return parentype::asterisk;
+                case '"': return parentype::quote;
+              }
+            })();
 
-          parentype pt = ([&] {
-            switch (c)
+            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
            {
-              case ']': return parentype::square_bracket;
-              case ')': return parentype::paren;
-              case '*': return parentype::asterisk;
-              case '"': return parentype::quote;
+              tk.delimiters[{pt, doublestatus::opening}]--;
+              tk.delimiters[{pt, doublestatus::both}]++;
+            } else {
+              tk.delimiters[{pt, doublestatus::closing}]++;
            }
-          })();
-
-          if (tk.delimiters[{pt, doublestatus::opening}] > 0)
-          {
-            tk.delimiters[{pt, doublestatus::opening}]--;
-            tk.delimiters[{pt, doublestatus::both}]++;
-          } else {
-            tk.delimiters[{pt, doublestatus::closing}]++;
          }
-        }
 
-        if (suffix == ",")
-        {
-          tk.suffix = suffixtype::comma;
-        } else if (!suffix.empty()) {
-          tk.suffix = suffixtype::terminating;
+          if (suffix == ",")
+          {
+            tk.suffix = suffixtype::comma;
+          } else if (!suffix.empty()) {
+            tk.suffix = suffixtype::terminating;
 
-          w.terms.add(suffix);
+            w.terms.add(suffix);
+          }
        }
-      }
 
-      tokens.push_back(tk);
-    }
+        tokens.push_back(tk);
+      }
 
-    start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
+      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
+    }
+
+    startper += _corpora[i].length();
   }
 
   std::cout << "\b\b\b\b100%" << std::endl;
@@ -420,7 +448,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
     kgram klist = it.first;
     auto& probtable = it.second;
-    auto& distribution = stats[klist];
+    auto& distribution = _stats[klist];
     int max = 0;
 
     for (auto& kt : probtable)
@@ -432,33 +460,61 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   }
 
   std::cout << "\b\b\b\b100%" << std::endl;
+
+  _compiled = true;
 }
 
-void printKgram(kgram k)
+std::ostream& operator<<(std::ostream& os, rawr::kgram k)
 {
   for (auto& q : k)
   {
-    if (q.type == querytype::sentence)
-    {
-      std::cout << "#.# ";
-    } else if (q.type == querytype::literal)
-    {
-      if (q.tok.suffix == suffixtype::terminating)
-      {
-        std::cout << q.tok.w.canon << ". ";
-      } else if (q.tok.suffix == suffixtype::comma)
-      {
-        std::cout << q.tok.w.canon << ", ";
-      } else {
-        std::cout << q.tok.w.canon << " ";
-      }
-    }
+    os << q << " ";
   }
+
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, rawr::query q)
+{
+  if (q.type == rawr::querytype::sentence)
+  {
+    return os << "#.#";
+  } else if (q.type == rawr::querytype::literal)
+  {
+    return os << q.tok;
+  }
+
+  return os;
+}
+
+std::ostream& operator<<(std::ostream& os, rawr::token t)
+{
+  os << t.w.canon;
+
+  if (t.suffix == rawr::suffixtype::terminating)
+  {
+    return os << ".";
+  } else if (t.suffix == rawr::suffixtype::comma)
+  {
+    return os << ",";
+  } else {
+    return os;
+  }
+}
+
+void rawr::setTransformCallback(transform_callback _arg)
+{
+  _transform = _arg;
 }
 
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::string kgramstats::randomSentence(int maxL)
+std::string rawr::randomSentence(int maxL)
 {
+  if (!_compiled)
+  {
+    return "";
+  }
+
   std::string result;
   kgram cur(1, wildcardQuery);
   int cuts = 0;
@@ -466,14 +522,14 @@ std::string kgramstats::randomSentence(int maxL)
 
   for (;;)
   {
-    if (cur.size() == maxK)
+    if (cur.size() == _maxK)
     {
       cur.pop_front();
     }
 
    if (cur.size() > 0)
    {
-      if (rand() % (maxK - cur.size() + 1) == 0)
+      if (rand() % (_maxK - cur.size() + 1) == 0)
      {
        while ((cur.size() > 2) && (cuts > 0))
        {
@@ -490,16 +546,22 @@ std::string kgramstats::randomSentence(int maxL)
 
    // Gotta circumvent the last line of the input corpus
    // https://twitter.com/starla4444/status/684222271339237376
-    if (stats.count(cur) == 0)
+    if (_stats.count(cur) == 0)
    {
      cur = kgram(1, wildcardQuery);
    }
 
-    auto& distribution = stats[cur];
+    auto& distribution = _stats[cur];
    int max = distribution.rbegin()->first;
    int r = rand() % max;
    token_data& next = distribution.upper_bound(r)->second;
    std::string nextToken = next.tok.w.forms.next();
+
+    // Apply user-specified transforms
+    if (_transform)
+    {
+      nextToken = _transform(next.tok.w.canon, nextToken);
+    }
 
    // Determine the casing of the next token. We randomly make the token all
    // caps based on the markov chain. Otherwise, we check if the previous
@@ -600,8 +662,7 @@ std::string kgramstats::randomSentence(int maxL)
    }
 
    /* DEBUG */
-    printKgram(cur);
-    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
+    std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
 
    cur.push_back(next.tok);
 
@@ -633,29 +694,7 @@ std::string kgramstats::randomSentence(int maxL)
    open_delimiters.pop();
  }
 
-  // Replace old-style freevars while I can't be bothered to remake the corpus yet
-  std::vector<std::string> fv_names;
-  std::ifstream namefile("names.txt");
-  if (namefile.is_open())
-  {
-    while (!namefile.eof())
-    {
-      std::string l;
-      getline(namefile, l);
-      if (l.back() == '\r')
-      {
-        l.pop_back();
-      }
-
-      fv_names.push_back(l);
-    }
-
-    int cpos;
-    while ((cpos = result.find("$name$")) != std::string::npos)
-    {
-      result.replace(cpos, 6, fv_names[rand() % fv_names.size()]);
-    }
-  }
+  result.resize(maxL);
 
  return result;
 }
--
cgit 1.4.1