From 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Fri, 20 May 2016 23:14:06 -0400 Subject: Pulled the ebooks functionality out into a library --- CMakeLists.txt | 13 +- ebooks.cpp | 38 ++++- freevars.cpp | 32 ---- freevars.h | 22 --- gen.cpp | 40 ++++- kgramstats.cpp | 465 +++++++++++++++++++++++++++++++-------------------------- kgramstats.h | 201 +++++++++++++------------ rawr.h | 6 + 8 files changed, 443 insertions(+), 374 deletions(-) delete mode 100644 freevars.cpp delete mode 100644 freevars.h create mode 100644 rawr.h diff --git a/CMakeLists.txt b/CMakeLists.txt index ab1979f..a3f51af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,12 +12,17 @@ include_directories(vendor/yaml-cpp/include) find_package(ASPELL REQUIRED) include_directories(${ASPELL_INCLUDE_DIR}) -add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) +add_library(rawr kgramstats.cpp histogram.cpp prefix_search.cpp) +set_property(TARGET rawr PROPERTY CXX_STANDARD 11) +set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON) +target_link_libraries(rawr ${ASPELL_LIBRARIES}) + +add_executable(rawr-ebooks ebooks.cpp) set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) -target_link_libraries(rawr-ebooks yaml-cpp twitter++ curlcpp curl ${ASPELL_LIBRARIES} pthread) +target_link_libraries(rawr-ebooks rawr yaml-cpp twitter++ curlcpp curl pthread) -add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) +add_executable(rawr-gen gen.cpp) set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) -target_link_libraries(rawr-gen ${ASPELL_LIBRARIES}) +target_link_libraries(rawr-gen rawr) diff --git a/ebooks.cpp b/ebooks.cpp index aa690c2..c01cdc9 100644 --- a/ebooks.cpp +++ b/ebooks.cpp @@ -39,9 +39,41 @@ int main(int argc, char** args) corpus += line + "\n "; } + + // Replace old-style freevars while I can't be bothered to remake the corpus yet + std::vector fv_names; + std::ifstream namefile("names.txt"); + if (namefile.is_open()) + { + while (!namefile.eof()) + { + std::string l; + getline(namefile, l); + if (l.back() == '\r') + { + l.pop_back(); + } + + fv_names.push_back(l); + } + } + + namefile.close(); std::cout << "Preprocessing corpus..." << std::endl; - kgramstats* stats = new kgramstats(corpus, 4); + rawr kgramstats; + kgramstats.addCorpus(corpus); + kgramstats.compile(4); + kgramstats.setTransformCallback([&] (std::string canonical, std::string) { + size_t pos = canonical.find("$name$"); + if (pos != std::string::npos) + { + canonical.replace(pos, 6, fv_names[rand() % fv_names.size()]); + } + + return canonical; + }); + std::mutex stats_mutex; client.setUserStreamNotifyCallback([&] (twitter::notification n) { @@ -60,7 +92,7 @@ int main(int argc, char** args) std::string doc = "@" + n.getTweet().getAuthor().getScreenName() + " "; { std::lock_guard stats_lock(stats_mutex); - doc += stats->randomSentence(140 - doc.length()); + doc += kgramstats.randomSentence(140 - doc.length()); doc.resize(140); } @@ -84,7 +116,7 @@ int main(int argc, char** args) std::string doc; { std::lock_guard stats_lock(stats_mutex); - doc = stats->randomSentence(140); + doc = kgramstats.randomSentence(140); } doc.resize(140); diff --git a/freevars.cpp b/freevars.cpp deleted file mode 100644 index 4429d00..0000000 --- a/freevars.cpp +++ /dev/null @@ -1,32 +0,0 @@ -#include "freevars.h" -#include -#include "kgramstats.h" - -freevar::freevar(word& w, std::string file) : w(w) -{ - std::ifstream infile(file); - if (infile) - { - std::string line; - while (getline(infile, line)) - { - instances.insert(line); - w.forms.add(line); - } - } -} - -bool freevar::check(std::string f) const -{ - return (instances.count(f) == 1); -} - -void freevar::add(std::string f) -{ - instances.insert(f); -} - -word& freevar::getWord() -{ - return w; -} diff --git a/freevars.h b/freevars.h deleted file mode 100644 index f800220..0000000 --- a/freevars.h +++ /dev/null @@ -1,22 +0,0 @@ -#include -#include - -#ifndef FREEVARS_H -#define FREEVARS_H - -class word; - -class freevar -{ - public: - freevar(word& w, std::string file); - bool check(std::string f) const; - void add(std::string f); - word& getWord(); - - private: - word& w; - std::set instances; -}; - -#endif \ No newline at end of file diff --git a/gen.cpp b/gen.cpp index 0319283..eba0277 100644 --- a/gen.cpp +++ b/gen.cpp @@ -44,18 +44,48 @@ int main(int argc, char** args) corpus += line + "\n "; } + + // Replace old-style freevars while I can't be bothered to remake the corpus yet + std::vector fv_names; + std::ifstream namefile("names.txt"); + if (namefile.is_open()) + { + while (!namefile.eof()) + { + std::string l; + getline(namefile, l); + if (l.back() == '\r') + { + l.pop_back(); + } + + fv_names.push_back(l); + } + } + + namefile.close(); std::cout << "Preprocessing corpus..." << std::endl; - kgramstats* stats = new kgramstats(corpus, 4); + rawr kgramstats; + kgramstats.addCorpus(corpus); + kgramstats.compile(4); + kgramstats.setTransformCallback([&] (std::string canonical, std::string) { + size_t pos = canonical.find("$name$"); + if (pos != std::string::npos) + { + canonical.replace(pos, 6, fv_names[rand() % fv_names.size()]); + } + + return canonical; + }); std::cout << "Generating..." << std::endl; for (;;) { - std::string doc = stats->randomSentence(140); - std::string hi = doc; - hi.resize(140); + std::string doc = kgramstats.randomSentence(140); + doc.resize(140); - std::cout << hi << std::endl; + std::cout << doc << std::endl; getc(stdin); } diff --git a/kgramstats.cpp b/kgramstats.cpp index a44bf2b..47f3bc0 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -33,32 +33,47 @@ // #include "kgramstats.h" -#include #include -#include #include #include #include #include -#include "freevars.h" -#include #include "prefix_search.h" #include +#include + +const rawr::query rawr::wildcardQuery = {querytype::sentence}; +const rawr::word rawr::blank_word = {""}; -query wildcardQuery {querytype::sentence}; -word blank_word {""}; +void rawr::addCorpus(std::string corpus) +{ + _corpora.push_back(corpus); +} // runs in O(t^2) time where t is the number of tokens in the input corpus // We consider maxK to be fairly constant -kgramstats::kgramstats(std::string corpus, int maxK) +void rawr::compile(int maxK) { - this->maxK = maxK; + _maxK = maxK; std::vector tokens; size_t start = 0; - int end = 0; std::set thashtags; - freevar fv_emoticons {emoticons, "emoticons.txt"}; + std::set fv_emoticons; + + std::ifstream fvefile("emoticons.txt"); + if (fvefile) + { + std::string line; + while (getline(fvefile, line)) + { + fv_emoticons.insert(line); + emoticons.forms.add(line); + } + } + + fvefile.close(); + std::map canonical_form; AspellConfig* spell_config = new_aspell_config(); @@ -92,216 +107,229 @@ kgramstats::kgramstats(std::string corpus, int maxK) } std::cout << "Tokenizing corpus... 0%" << std::flush; - int len = corpus.length(); + int len = 0; + for (auto c : _corpora) + { + len += c.length(); + } + + int startper = 0; int per = 0; int perprime = 0; std::cout.fill(' '); - while (end != std::string::npos) + for (int i = 0; i < _corpora.size(); i++) { - perprime = end * 100 / len; - if (perprime != per) + int end = 0; + + while (end != std::string::npos) { - per = perprime; + perprime = (startper + end) * 100 / len; + if (perprime != per) + { + per = perprime; - std::cout << "\b\b\b\b" << std::right; - std::cout.width(3); - std::cout << per << "%" << std::flush; - } + std::cout << "\b\b\b\b" << std::right; + std::cout.width(3); + std::cout << per << "%" << std::flush; + } - end = corpus.find(" ", start); + end = _corpora[i].find(" ", start); - bool emoji = false; - std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); - std::string t = ""; + bool emoji = false; + std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start); + std::string t = ""; - if (te.compare("") && te.compare(".")) - { - // Extract strings of emojis into their own tokens even if they're not space delimited - int m = emojis.match(te); - emoji = m > 0; - if (m == 0) m = 1; - t = te.substr(0,m); - te = te.substr(m); - - while (!te.empty()) + if (te.compare("") && te.compare(".")) { - m = emojis.match(te); - if (emoji == (m > 0)) + // Extract strings of emojis into their own tokens even if they're not space delimited + int m = emojis.match(te); + emoji = m > 0; + if (m == 0) m = 1; + t = te.substr(0,m); + te = te.substr(m); + + while (!te.empty()) { - if (m == 0) m = 1; - t += te.substr(0,m); - te = te.substr(m); - } else { - end = start + t.length() - 1; - break; + m = emojis.match(te); + if (emoji == (m > 0)) + { + if (m == 0) m = 1; + t += te.substr(0,m); + te = te.substr(m); + } else { + end = start + t.length() - 1; + break; + } } - } - std::string tc(t); - std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); + std::string tc(t); + std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); - int pst = tc.find_first_not_of("\"([*"); - int dst = tc.find_last_not_of("\")]*.,?!\n"); - std::string canonical(""); - if ((pst != std::string::npos) && (dst != std::string::npos)) - { - canonical = std::string(tc, pst, dst - pst + 1); - } - - word& w = ([&] () -> word& { - // Hashtag freevar - if (canonical[0] == '#') + int pst = tc.find_first_not_of("\"([*"); + int dst = tc.find_last_not_of("\")]*.,?!\n"); + std::string canonical(""); + if ((pst != std::string::npos) && (dst != std::string::npos)) { - thashtags.insert(canonical); - - return hashtags; + canonical = std::string(tc, pst, dst - pst + 1); } - - // Emoticon freevar - if (emoji) - { - emoticons.forms.add(canonical); + + word& w = ([&] () -> word& { + // Hashtag freevar + if (canonical[0] == '#') + { + thashtags.insert(canonical); - return emoticons; - } + return hashtags; + } - if ((pst != std::string::npos) && (dst != std::string::npos)) - { - std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1); - if (fv_emoticons.check(emoticon_canon)) + // Emoticon freevar + if (emoji) { - emoticons.forms.add(emoticon_canon); + emoticons.forms.add(canonical); return emoticons; } - } - // Basically any other word - if (canonical_form.count(canonical) == 0) - { - if ( - // Legacy freevars should be distinct from tokens containing similar words - (canonical.find("$name$") != std::string::npos) - // Words with no letters will be mangled by the spell checker - || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) - ) + if ((pst != std::string::npos) && (dst != std::string::npos)) { - canonical_form[canonical] = canonical; - words.emplace(canonical, canonical); - } else { - int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size()); - if (correct) + std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1); + if (fv_emoticons.count(emoticon_canon) == 1) + { + emoticons.forms.add(emoticon_canon); + + return emoticons; + } + } + + // Basically any other word + if (canonical_form.count(canonical) == 0) + { + if ( + // Legacy freevars should be distinct from tokens containing similar words + (canonical.find("$name$") != std::string::npos) + // Words with no letters will be mangled by the spell checker + || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) + ) { - words.emplace(canonical, canonical); canonical_form[canonical] = canonical; + words.emplace(canonical, canonical); } else { - const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size()); - AspellStringEnumeration* elements = aspell_word_list_elements(suggestions); - const char* replacement = aspell_string_enumeration_next(elements); - if (replacement != NULL) + int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size()); + if (correct) { - std::string sugrep(replacement); - canonical_form[canonical] = sugrep; - - if (words.count(sugrep) == 0) - { - words.emplace(sugrep, sugrep); - } - } else { words.emplace(canonical, canonical); canonical_form[canonical] = canonical; - } + } else { + const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size()); + AspellStringEnumeration* elements = aspell_word_list_elements(suggestions); + const char* replacement = aspell_string_enumeration_next(elements); + if (replacement != NULL) + { + std::string sugrep(replacement); + canonical_form[canonical] = sugrep; - delete_aspell_string_enumeration(elements); + if (words.count(sugrep) == 0) + { + words.emplace(sugrep, sugrep); + } + } else { + words.emplace(canonical, canonical); + canonical_form[canonical] = canonical; + } + + delete_aspell_string_enumeration(elements); + } } } - } - word& tw = words.at(canonical_form.at(canonical)); - tw.forms.add(canonical); + word& tw = words.at(canonical_form.at(canonical)); + tw.forms.add(canonical); - return tw; - })(); + return tw; + })(); - token tk(w); - tk.raw = t; + token tk(w); + tk.raw = t; - for (char c : t) - { - if (c == '*') + for (char c : t) { - tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; - } else if (c == '[') - { - tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; - } else if (c == '(') - { - tk.delimiters[{parentype::paren, doublestatus::opening}]++; - } else if (c == '"') - { - tk.delimiters[{parentype::quote, doublestatus::opening}]++; - } else { - break; + if (c == '*') + { + tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; + } else if (c == '[') + { + tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; + } else if (c == '(') + { + tk.delimiters[{parentype::paren, doublestatus::opening}]++; + } else if (c == '"') + { + tk.delimiters[{parentype::quote, doublestatus::opening}]++; + } else { + break; + } } - } - int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; - if (backtrack != t.length()) - { - std::string ending = t.substr(backtrack); - std::string suffix; - - for (char c : ending) + int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; + if (backtrack != t.length()) { - if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) + std::string ending = t.substr(backtrack); + std::string suffix; + + for (char c : ending) { - suffix += c; + if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) + { + suffix += c; - continue; - } else if (c == '\n') - { - // At least the end is coming - if (suffix.empty()) + continue; + } else if (c == '\n') { - suffix = "."; - } + // At least the end is coming + if (suffix.empty()) + { + suffix = "."; + } - break; - } + break; + } + + parentype pt = ([&] { + switch (c) + { + case ']': return parentype::square_bracket; + case ')': return parentype::paren; + case '*': return parentype::asterisk; + case '"': return parentype::quote; + } + })(); - parentype pt = ([&] { - switch (c) + if (tk.delimiters[{pt, doublestatus::opening}] > 0) { - case ']': return parentype::square_bracket; - case ')': return parentype::paren; - case '*': return parentype::asterisk; - case '"': return parentype::quote; + tk.delimiters[{pt, doublestatus::opening}]--; + tk.delimiters[{pt, doublestatus::both}]++; + } else { + tk.delimiters[{pt, doublestatus::closing}]++; } - })(); - - if (tk.delimiters[{pt, doublestatus::opening}] > 0) - { - tk.delimiters[{pt, doublestatus::opening}]--; - tk.delimiters[{pt, doublestatus::both}]++; - } else { - tk.delimiters[{pt, doublestatus::closing}]++; } - } - if (suffix == ",") - { - tk.suffix = suffixtype::comma; - } else if (!suffix.empty()) { - tk.suffix = suffixtype::terminating; + if (suffix == ",") + { + tk.suffix = suffixtype::comma; + } else if (!suffix.empty()) { + tk.suffix = suffixtype::terminating; - w.terms.add(suffix); + w.terms.add(suffix); + } } - } - tokens.push_back(tk); - } + tokens.push_back(tk); + } - start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); + start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); + } + + startper += _corpora[i].length(); } std::cout << "\b\b\b\b100%" << std::endl; @@ -420,7 +448,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) kgram klist = it.first; auto& probtable = it.second; - auto& distribution = stats[klist]; + auto& distribution = _stats[klist]; int max = 0; for (auto& kt : probtable) @@ -432,33 +460,61 @@ kgramstats::kgramstats(std::string corpus, int maxK) } std::cout << "\b\b\b\b100%" << std::endl; + + _compiled = true; } -void printKgram(kgram k) +std::ostream& operator<<(std::ostream& os, rawr::kgram k) { for (auto& q : k) { - if (q.type == querytype::sentence) - { - std::cout << "#.# "; - } else if (q.type == querytype::literal) - { - if (q.tok.suffix == suffixtype::terminating) - { - std::cout << q.tok.w.canon << ". "; - } else if (q.tok.suffix == suffixtype::comma) - { - std::cout << q.tok.w.canon << ", "; - } else { - std::cout << q.tok.w.canon << " "; - } - } + os << q << " "; } + + return os; +} + +std::ostream& operator<<(std::ostream& os, rawr::query q) +{ + if (q.type == rawr::querytype::sentence) + { + return os << "#.#"; + } else if (q.type == rawr::querytype::literal) + { + return os << q.tok; + } + + return os; +} + +std::ostream& operator<<(std::ostream& os, rawr::token t) +{ + os << t.w.canon; + + if (t.suffix == rawr::suffixtype::terminating) + { + return os << "."; + } else if (t.suffix == rawr::suffixtype::comma) + { + return os << ","; + } else { + return os; + } +} + +void rawr::setTransformCallback(transform_callback _arg) +{ + _transform = _arg; } // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus -std::string kgramstats::randomSentence(int maxL) +std::string rawr::randomSentence(int maxL) { + if (!_compiled) + { + return ""; + } + std::string result; kgram cur(1, wildcardQuery); int cuts = 0; @@ -466,14 +522,14 @@ std::string kgramstats::randomSentence(int maxL) for (;;) { - if (cur.size() == maxK) + if (cur.size() == _maxK) { cur.pop_front(); } if (cur.size() > 0) { - if (rand() % (maxK - cur.size() + 1) == 0) + if (rand() % (_maxK - cur.size() + 1) == 0) { while ((cur.size() > 2) && (cuts > 0)) { @@ -490,16 +546,22 @@ std::string kgramstats::randomSentence(int maxL) // Gotta circumvent the last line of the input corpus // https://twitter.com/starla4444/status/684222271339237376 - if (stats.count(cur) == 0) + if (_stats.count(cur) == 0) { cur = kgram(1, wildcardQuery); } - auto& distribution = stats[cur]; + auto& distribution = _stats[cur]; int max = distribution.rbegin()->first; int r = rand() % max; token_data& next = distribution.upper_bound(r)->second; std::string nextToken = next.tok.w.forms.next(); + + // Apply user-specified transforms + if (_transform) + { + nextToken = _transform(next.tok.w.canon, nextToken); + } // Determine the casing of the next token. We randomly make the token all // caps based on the markov chain. Otherwise, we check if the previous @@ -600,8 +662,7 @@ std::string kgramstats::randomSentence(int maxL) } /* DEBUG */ - printKgram(cur); - std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; + std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; cur.push_back(next.tok); @@ -633,29 +694,7 @@ std::string kgramstats::randomSentence(int maxL) open_delimiters.pop(); } - // Replace old-style freevars while I can't be bothered to remake the corpus yet - std::vector fv_names; - std::ifstream namefile("names.txt"); - if (namefile.is_open()) - { - while (!namefile.eof()) - { - std::string l; - getline(namefile, l); - if (l.back() == '\r') - { - l.pop_back(); - } - - fv_names.push_back(l); - } - - int cpos; - while ((cpos = result.find("$name$")) != std::string::npos) - { - result.replace(cpos, 6, fv_names[rand() % fv_names.size()]); - } - } + result.resize(maxL); return result; } diff --git a/kgramstats.h b/kgramstats.h index 5fad37d..ee75ada 100644 --- a/kgramstats.h +++ b/kgramstats.h @@ -1,124 +1,135 @@ +#ifndef KGRAMSTATS_H +#define KGRAMSTATS_H + #include #include #include #include #include "histogram.h" +#include -#ifndef KGRAMSTATS_H -#define KGRAMSTATS_H - -struct word { - std::string canon; - histogram forms; - histogram terms; +class rawr { + public: + typedef std::function transform_callback; + + void addCorpus(std::string corpus); + void compile(int maxK); + + void setTransformCallback(transform_callback _arg); + std::string randomSentence(int maxL); + + private: + struct word { + std::string canon; + histogram forms; + histogram terms; - word(std::string canon) : canon(canon) {} + word(std::string canon) : canon(canon) {} - bool operator<(const word& other) const - { - return canon < other.canon; - } -}; - -extern word blank_word; + bool operator<(const word& other) const + { + return canon < other.canon; + } + }; -enum class suffixtype { - none, - terminating, - comma -}; + enum class suffixtype { + none, + terminating, + comma + }; -enum class parentype { - paren, - square_bracket, - asterisk, - quote -}; + enum class parentype { + paren, + square_bracket, + asterisk, + quote + }; -enum class doublestatus { - opening, - closing, - both -}; + enum class doublestatus { + opening, + closing, + both + }; -struct delimiter { - parentype type; - doublestatus status; + struct delimiter { + parentype type; + doublestatus status; - delimiter(parentype type, doublestatus status) : type(type), status(status) {} + delimiter(parentype type, doublestatus status) : type(type), status(status) {} - bool operator<(const delimiter& other) const - { - return std::tie(type, status) < std::tie(other.type, other.status); - } -}; + bool operator<(const delimiter& other) const + { + return std::tie(type, status) < std::tie(other.type, other.status); + } + }; -struct token { - const word& w; - std::map delimiters; - suffixtype suffix; - std::string raw; + struct token { + const word& w; + std::map delimiters; + suffixtype suffix; + std::string raw; - token(const word& w) : w(w), suffix(suffixtype::none) {} + token(const word& w) : w(w), suffix(suffixtype::none) {} - bool operator<(const token& other) const - { - return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); - } -}; + bool operator<(const token& other) const + { + return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); + } + }; -enum class querytype { - literal, - sentence -}; + enum class querytype { + literal, + sentence + }; -struct query { - querytype type; - token tok; + struct query { + querytype type; + token tok; - query(token tok) : tok(tok), type(querytype::literal) {} + query(token tok) : tok(tok), type(querytype::literal) {} - query(querytype type) : tok(blank_word), type(type) {} + query(querytype type) : tok(blank_word), type(type) {} - bool operator<(const query& other) const - { - if (type == other.type) - { - return tok < other.tok; - } else { - return type < other.type; - } - } -}; - -typedef std::list kgram; + bool operator<(const query& other) const + { + if (type == other.type) + { + return tok < other.tok; + } else { + return type < other.type; + } + } + }; + + static const query wildcardQuery; + static const word blank_word; -class kgramstats -{ -public: - kgramstats(std::string corpus, int maxK); - std::string randomSentence(int maxL); - -private: - struct token_data - { - int all; - int titlecase; - int uppercase; - token tok; + typedef std::list kgram; - token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} - }; + struct token_data + { + int all; + int titlecase; + int uppercase; + token tok; + + token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} + }; + + friend std::ostream& operator<<(std::ostream& os, kgram k); + friend std::ostream& operator<<(std::ostream& os, query q); + friend std::ostream& operator<<(std::ostream& os, token t); - int maxK; - std::map > stats; + int _maxK; + bool _compiled = false; + std::vector _corpora; + std::map> _stats; + transform_callback _transform; - // Words - std::map words; - word hashtags {"#hashtag"}; - word emoticons {"👌"}; + // Words + std::map words; + word hashtags {"#hashtag"}; + word emoticons {"👌"}; }; -void printKgram(kgram k); - #endif \ No newline at end of file diff --git a/rawr.h b/rawr.h new file mode 100644 index 0000000..2b5daf7 --- /dev/null +++ b/rawr.h @@ -0,0 +1,6 @@ +#ifndef RAWR_H_E903544C +#define RAWR_H_E903544C + +#include "kgramstats.h" + +#endif /* end of include guard: RAWR_H_E903544C */ -- cgit 1.4.1