// NOTE(review): this copy of the file looks extraction-damaged. Every `<...>`
// span appears to be missing: the targets of the bare `#include` lines below,
// all template arguments (e.g. `std::vector> tokens`, `static_cast(cuts)`),
// and two longer stretches of code flagged inline in compile() and
// randomSentence(). Runs of whitespace also appear to have been collapsed,
// including inside string literals. Code tokens are kept exactly as found;
// restore the missing text from the upstream file rather than guessing.
#include "kgramstats.h"
#include
#include
#include
#include
#include
#include "prefix_search.h"
#include
#include

const rawr::query rawr::wildcardQuery = {querytype::sentence};
const rawr::word rawr::blank_word = {""};

// Appends `corpus` to the list of corpora (`_corpora`) that compile() will
// tokenize. Nothing is processed at this point.
void rawr::addCorpus(std::string corpus)
{
  _corpora.push_back(corpus);
}

// Builds the generator's tables from every corpus added so far.
//
// Steps, in order:
//   1. Load "emoticons.txt" and "emojis.txt" (relative paths) if they open.
//   2. Split each corpus on ' ' and '\n', map each token to a `word`
//      (hashtag / emoticon free variables, or a spell-checked canonical
//      form) and intern it in `_tokenstore`.
//   3. Compile the per-word form/terminator histograms.
//   4. Fill the intermediate table `tstats`, then turn it into `_stats`,
//      where each entry is keyed by the running sum of `all` counts.
//
// maxK is stored in `_maxK`; randomSentence() trims its history when it
// reaches that length.
//
// Side effects: prints progress to std::cout, and calls exit(1) if the Aspell
// speller cannot be created.
//
// runs in O(t^2) time where t is the number of tokens in the input corpus
// We consider maxK to be fairly constant
void rawr::compile(int maxK)
{
  _maxK = maxK;

  std::vector> tokens;
  std::set thashtags;
  std::set fv_emoticons;

  // Each line of emoticons.txt is both remembered locally (for exact-match
  // lookups below) and added as a form of the `emoticons` word.
  std::ifstream fvefile("emoticons.txt");
  if (fvefile)
  {
    std::string line;
    while (getline(fvefile, line))
    {
      fv_emoticons.insert(line);
      emoticons.forms.add(line);
    }
  }

  fvefile.close();

  // Maps a canonical spelling as seen in the corpus to the key used in
  // `words` (itself, or Aspell's first suggestion).
  std::map canonical_form;

  AspellConfig* spell_config = new_aspell_config();
  AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
  if (aspell_error_number(possible_err) != 0)
  {
    std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl;
    exit(1);
  }

  AspellSpeller* spell_checker = to_aspell_speller(possible_err);

  std::cout << "Reading emojis..." << std::endl;
  prefix_search emojis;
  std::ifstream emoji_file("emojis.txt");
  if (emoji_file)
  {
    // NOTE(review): the loop tests eof() instead of the result of getline(),
    // so a failed final read leaves `rawmojis` empty; back() on an empty
    // string is undefined and the empty string is still added. Consider
    // `while (getline(emoji_file, rawmojis))` plus an empty check.
    while (!emoji_file.eof())
    {
      std::string rawmojis;
      getline(emoji_file, rawmojis);
      // Drop a trailing carriage return (CRLF line endings).
      if (rawmojis.back() == '\r')
      {
        rawmojis.pop_back();
      }

      emojis.add(rawmojis);
    }

    emoji_file.close();
  }

  // NOTE(review): padding inside the "... 0%" literals may have been
  // collapsed in this copy; the updates below erase four characters
  // ("\b\b\b\b") and rewrite a width-3 number plus "%".
  std::cout << "Tokenizing corpus... 0%" << std::flush;

  // Total character count across corpora, used as the progress denominator.
  int len = 0;
  for (auto c : _corpora)
  {
    len += c.length();
  }

  int startper = 0;
  int per = 0;
  int perprime = 0;
  std::cout.fill(' ');
  for (int i = 0; i < _corpora.size(); i++)
  {
    size_t start = 0;
    // NOTE(review): `end` is an int but receives size_t results from
    // find_first_of(); the npos comparisons rely on the implicit conversion
    // of a negative int back to size_t.
    int end = 0;
    std::vector tkcor;

    while (end != std::string::npos)
    {
      // Only redraw the percentage when it actually changes.
      perprime = (startper + end) * 100 / len;
      if (perprime != per)
      {
        per = perprime;

        std::cout << "\b\b\b\b" << std::right;
        std::cout.width(3);
        std::cout << per << "%" << std::flush;
      }

      end = _corpora[i].find_first_of(" \n", start);

      bool emoji = false;
      // `te` is the text from `start` up to and including the delimiter, or
      // to the end of the corpus when no delimiter remains.
      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start + 1);
      std::string t = "";

      // compare() is non-zero when the strings differ, so this skips pieces
      // that are exactly "", "." or " ".
      if (te.compare("") && te.compare(".") && te.compare(" "))
      {
        if (te.back() == ' ')
        {
          te.pop_back();
        }

        // Extract strings of emojis into their own tokens even if they're not space delimited
        // The value returned by emojis.match() is used as a prefix length
        // (0 is treated as "no emoji here, consume one character").
        int m = emojis.match(te);
        emoji = m > 0;
        if (m == 0) m = 1;
        t = te.substr(0,m);
        te = te.substr(m);

        while (!te.empty())
        {
          m = emojis.match(te);
          if (emoji == (m > 0))
          {
            if (m == 0) m = 1;
            t += te.substr(0,m);
            te = te.substr(m);
          } else {
            // Emoji/non-emoji boundary: rewind `end` to the last character of
            // `t` so the remainder is scanned as the next token.
            end = start + t.length() - 1;
            break;
          }
        }

        // Lowercased copy used to derive the canonical form.
        std::string tc(t);
        std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);

        // Strip leading openers and trailing closers/punctuation.
        int pst = tc.find_first_not_of("\"([*");
        int dst = tc.find_last_not_of("\")]*.,?!\n;:");
        std::string canonical("");
        if ((pst != std::string::npos) && (dst != std::string::npos))
        {
          canonical = std::string(tc, pst, dst - pst + 1);
        }

        // Pick the `word` this token belongs to.
        word& w = ([&] () -> word& {
          // Hashtag freevar
          if (canonical[0] == '#')
          {
            thashtags.insert(canonical);

            return hashtags;
          }

          // Emoticon freevar
          if (emoji)
          {
            emoticons.forms.add(canonical);

            return emoticons;
          }

          // Emoticons listed in emoticons.txt are matched against the
          // original-case token with a narrower set of trailing characters
          // stripped.
          if ((pst != std::string::npos) && (dst != std::string::npos))
          {
            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
            if (fv_emoticons.count(emoticon_canon) == 1)
            {
              emoticons.forms.add(emoticon_canon);

              return emoticons;
            }
          }

          // Basically any other word
          if (canonical_form.count(canonical) == 0)
          {
            if (
              // Legacy freevars should be distinct from tokens containing similar words
              (canonical.find("$name$") != std::string::npos)
              // Words with no letters will be mangled by the spell checker
              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
              )
            {
              canonical_form[canonical] = canonical;
              words.emplace(canonical, canonical);
            } else {
              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
              if (correct)
              {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
              } else {
                // Fold a rejected spelling into Aspell's first suggestion,
                // or keep it as its own word when there is none.
                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
                const char* replacement = aspell_string_enumeration_next(elements);
                if (replacement != NULL)
                {
                  std::string sugrep(replacement);
                  canonical_form[canonical] = sugrep;

                  if (words.count(sugrep) == 0)
                  {
                    words.emplace(sugrep, sugrep);
                  }
                } else {
                  words.emplace(canonical, canonical);
                  canonical_form[canonical] = canonical;
                }

                delete_aspell_string_enumeration(elements);
              }
            }
          }

          word& tw = words.at(canonical_form.at(canonical));
          tw.forms.add(canonical);

          return tw;
        })();

        token tk(w);
        tk.raw = t;

        // Count leading delimiter characters as "opening"; stop at the first
        // character that is not one.
        for (char c : t)
        {
          if (c == '*')
          {
            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
          } else if (c == '[')
          {
            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
          } else if (c == '(')
          {
            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
          } else if (c == '"')
          {
            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
          } else {
            break;
          }
        }

        // `backtrack` is the index just past the last character that is not
        // trailing punctuation or a closing delimiter.
        int backtrack = t.find_last_not_of(".,?!])\"*\n;:") + 1;
        if (backtrack != t.length())
        {
          std::string ending = t.substr(backtrack);
          std::string suffix;
          bool newline = false;
          bool terminating = false;

          for (char c : ending)
          {
            if ((c == '.') || (c == ',') || (c == '?') || (c == '!') || (c == ';') || (c == ':'))
            {
              suffix += c;
              terminating = true;

              continue;
            } else if (c == '\n')
            {
              newline = true;
              terminating = true;

              continue;
            }

            // NOTE(review): the lambda has no return after the switch. Given
            // the character set used for `backtrack`, only ']', ')', '*' and
            // '"' reach this point, but a default case would make that
            // explicit.
            parentype pt = ([&] {
              switch (c)
              {
                case ']': return parentype::square_bracket;
                case ')': return parentype::paren;
                case '*': return parentype::asterisk;
                case '"': return parentype::quote;
              }
            })();

            // A closer that matches an opener on the same token turns the
            // pair into a single "both" entry.
            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
            {
              tk.delimiters[{pt, doublestatus::opening}]--;
              tk.delimiters[{pt, doublestatus::both}]++;
            } else {
              tk.delimiters[{pt, doublestatus::closing}]++;
            }
          }

          if (terminating)
          {
            // A lone comma with no newline is a comma suffix; anything else
            // ends the sentence.
            if ((suffix == ",") && (!newline))
            {
              tk.suffix = suffixtype::comma;
            } else {
              tk.suffix = suffixtype::terminating;

              if (!newline)
              {
                w.terms.add({suffix, false});
              } else {
                w.terms.add({".", false});
              }
            }
          }
        }

        tkcor.push_back(_tokenstore.add(tk));
      }

      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
    }

    tokens.push_back(tkcor);
    startper += _corpora[i].length();
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  delete_aspell_speller(spell_checker);
  delete_aspell_config(spell_config);

  std::cout << canonical_form.size() << " distinct forms" << std::endl;
  std::cout << words.size() << " distinct words" << std::endl;

  // Time to condense the distribution stuff for the words
  std::cout << "Compiling token histograms..." << std::endl;
  for (auto& it : words)
  {
    it.second.forms.compile();
    it.second.terms.compile();
  }

  // Hashtag freevar is not frequency distributed
  for (auto& it : thashtags)
  {
    hashtags.forms.add(it);
  }

  hashtags.forms.compile();
  hashtags.terms.compile();

  // Compile other freevars
  emoticons.forms.compile();
  emoticons.terms.compile();

  // Compile the interned tokens.
  _tokenstore.compile();

  // kgram distribution
  std::cout << "Creating markov chain... 0%" << std::flush;
  std::map > tstats;

  len = 0;
  for (auto c : tokens)
  {
    len += (maxK-1) * c.size();
  }

  startper = 0;
  per = 0;
  perprime = 0;
  int corpid = 0;
  for (auto corpus : tokens)
  {
    // NOTE(review): text is missing between `k` and `tok);` on the next line
    // (everything from a `<` through a later `>` appears to have been
    // dropped). The loop headers and the definitions of `prefix`, `fid`, `f`
    // and `startTok` used below are not visible, and the braces no longer
    // balance. Restore this stretch from the upstream file.
    for (int k=0; ktok);
        // When the visible prefix starts right after a sentence end, the
        // observation is also recorded under a copy of the prefix whose first
        // element is replaced by the wildcard query.
        if (startTok.suffix == suffixtype::terminating)
        {
          kgram term_prefix(prefix);
          term_prefix.pop_front();
          term_prefix.push_front(wildcardQuery);

          if (tstats[term_prefix].count(fid) == 0)
          {
            tstats[term_prefix].emplace(fid, fid);
          }

          token_data& td2 = tstats[term_prefix].at(fid);
          td2.all++;
          td2.corpora.insert(corpid);

          // No lowercase letter at all counts as uppercase; otherwise a
          // leading capital counts as titlecase.
          if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
          {
            td2.uppercase++;
          } else if (isupper(f.raw[0]))
          {
            td2.titlecase++;
          }
        }
      }

      startper += corpus.size();
    }

    corpid++;
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  // Condense the kgram distribution
  std::cout << "Compiling kgram distributions... 0%";
  len = tstats.size();
  per = 0;
  perprime = 0;
  int indicator = 0;
  for (auto& it : tstats)
  {
    indicator++;
    perprime = indicator * 100 / len;
    if (per != perprime)
    {
      per = perprime;

      std::cout << "\b\b\b\b" << std::right;
      std::cout.width(3);
      std::cout << per << "%" << std::flush;
    }

    kgram klist = it.first;
    auto& probtable = it.second;
    auto& distribution = _stats[klist];
    int max = 0;

    // Key each entry by the running total of `all`, so that a uniform draw
    // in [0, total) followed by upper_bound() (see randomSentence) selects
    // entries in proportion to their counts.
    for (auto& kt : probtable)
    {
      max += kt.second.all;

      distribution.emplace(max, kt.second);
    }
  }

  std::cout << "\b\b\b\b100%" << std::endl;

  _compiled = true;
}

// Debug printer: writes each query of the kgram followed by a space.
std::ostream& operator<<(std::ostream& os, rawr::kgram k)
{
  for (auto& q : k)
  {
    os << q << " ";
  }

  return os;
}

// Debug printer: "#.#" for a sentence (wildcard) query, the token id for a
// literal query, and nothing for any other query type.
std::ostream& operator<<(std::ostream& os, rawr::query q)
{
  if (q.type == rawr::querytype::sentence)
  {
    return os << "#.#";
  } else if (q.type == rawr::querytype::literal)
  {
    return os << q.tok;
  }

  return os;
}

// Debug printer: the word's canonical form, followed by "." for a terminating
// suffix or "," for a comma suffix.
std::ostream& operator<<(std::ostream& os, rawr::token t)
{
  os << t.w.canon;

  if (t.suffix == rawr::suffixtype::terminating)
  {
    return os << ".";
  } else if (t.suffix == rawr::suffixtype::comma)
  {
    return os << ",";
  } else {
    return os;
  }
}

// Debug printer: the terminator's text, followed by "↵" when its newline flag
// is set.
std::ostream& operator<<(std::ostream& os, rawr::terminator t)
{
  os << t.form;

  if (t.newline)
  {
    os << "↵";
  }

  return os;
}

// Stores the callback that randomSentence() applies to each generated token
// (called with the word's canonical form and the chosen form).
void rawr::setTransformCallback(transform_callback _arg)
{
  _transform = _arg;
}

// Stores the minimum number of distinct corpora a generated sentence must
// draw on; randomSentence() retries when fewer were used.
void rawr::setMinCorpora(int _arg)
{
  _min_corpora = _arg;
}

// Generates one piece of text by walking the compiled k-gram table.
//
// maxL: once `result` is longer than this, generation stops at the next
//       terminating token (or at a prefix with no entry in `_stats`). Before
//       that, each such point stops with probability 1/4.
// rng:  source of all randomness.
//
// Returns "" if compile() has not completed. Also writes a debug line per
// generated token to std::cout.
//
// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
std::string rawr::randomSentence(int maxL, std::mt19937& rng) const
{
  if (!_compiled)
  {
    return "";
  }

  std::string result;
  // Start from a history containing only the sentence-start wildcard.
  kgram cur(1, wildcardQuery);
  int cuts = 0;
  std::stack open_delimiters;
  std::set used_corpora;

  for (;;)
  {
    // Keep the history shorter than _maxK.
    if (cur.size() == _maxK)
    {
      cur.pop_front();
    }

    // Randomly shorten the history; each step is taken with probability
    // 1 - 1/cuts and consumes one cut.
    while (cur.size() > 2 && cuts > 0 && !std::bernoulli_distribution(1.0 / static_cast(cuts))(rng))
    {
      cur.pop_front();
      cuts--;
    }

    // Gotta circumvent the last line of the input corpus
    // https://twitter.com/starla4444/status/684222271339237376
    if (_stats.count(cur) == 0)
    {
      // The end of a corpus should probably be treated like a terminator, so
      // maybe we should just end here.
      if (result.length() > maxL || std::bernoulli_distribution(1.0 / 4.0)(rng))
      {
        break;
      }

      cur = kgram(1, wildcardQuery);
    }

    // NOTE(review): at() throws std::out_of_range if the wildcard-only key is
    // absent from _stats after the reset above — confirm that compile()
    // always creates it.
    auto& distribution = _stats.at(cur);
    // Keys are running totals (see compile), so the last key is the total
    // count and upper_bound(r) picks an entry in proportion to its count.
    int max = distribution.rbegin()->first;
    std::uniform_int_distribution randDist(0, max - 1);
    int r = randDist(rng);
    const token_data& next = distribution.upper_bound(r)->second;
    const token& interned = _tokenstore.get(next.tok);
    std::string nextToken = interned.w.forms.next(rng);

    // Apply user-specified transforms
    if (_transform)
    {
      nextToken = _transform(interned.w.canon, nextToken);
    }

    // Determine the casing of the next token. We randomly make the token all
    // caps based on the markov chain. Otherwise, we check if the previous
    // token is the end of a sentence (terminating token or a wildcard query).
    std::uniform_int_distribution caseDist(0, next.all - 1);
    int casing = caseDist(rng);

    if (casing < next.uppercase)
    {
      std::transform(
        std::begin(nextToken),
        std::end(nextToken),
        std::begin(nextToken),
        ::toupper);
    } else {
      bool capitalize = false;

      if (casing - next.uppercase < next.titlecase)
      {
        capitalize = true;
      } else if (cur.rbegin()->type == querytype::sentence)
      {
        if (std::bernoulli_distribution(1.0 / 2.0)(rng))
        {
          capitalize = true;
        }
      } else {
        const token& lastTok = _tokenstore.get(cur.rbegin()->tok);

        if (lastTok.suffix == suffixtype::terminating && std::bernoulli_distribution(1.0 / 2.0)(rng))
        {
          capitalize = true;
        }
      }

      if (capitalize)
      {
        nextToken[0] = toupper(nextToken[0]);
      }
    }

    // Delimiters
    // NOTE(review): std::string(const char*, count) copies the first `count`
    // characters of the buffer; it does not repeat the character. For a
    // one-character literal any count above 2 reads past the literal. If
    // "dt.second copies of the delimiter" is intended, std::string(count, ch)
    // is the matching constructor — confirm and fix upstream.
    for (auto& dt : interned.delimiters)
    {
      if (dt.first.status == doublestatus::both)
      {
        switch (dt.first.type)
        {
          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break;
          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break;
          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break;
          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break;
        }
      } else if (dt.first.status == doublestatus::opening)
      {
        // NOTE(review): text is missing between `i` and ` 0) {` on the next
        // line (same kind of gap as in compile()). The rest of the delimiter
        // handling, whatever appends terminators/spacing to nextToken, and
        // the first branch of the `cuts` update are not visible, and the
        // braces no longer balance. Restore this stretch from the upstream
        // file.
        for (int i=0; i 0) {
      // Otherwise, decrease cut chance
      cuts /= 2;
    }

    // A token seen in exactly one corpus marks that corpus as used.
    if (next.corpora.size() == 1)
    {
      used_corpora.insert(*next.corpora.begin());
    }

    // NOTE(review): this trace is printed unconditionally on every token.
    /* DEBUG */
    std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << " in corp";
    for (auto cor : next.corpora)
    {
      std::cout << " " << cor;
    }

    std::cout << "; l=" << cur.size() << ",cuts=" << cuts << std::endl;

    cur.push_back(next.tok);
    result.append(nextToken);

    // At a sentence end, stop once the text is long enough, or with
    // probability 1/4 otherwise.
    if (interned.suffix == suffixtype::terminating && (result.length() > maxL || std::bernoulli_distribution(1.0 / 4.0)(rng)))
    {
      break;
    }
  }

  // Ensure that enough corpora are used
  // NOTE(review): the retry is a recursive call with no attempt limit.
  if (used_corpora.size() < _min_corpora)
  {
    return randomSentence(maxL, rng);
  }

  // Remove the trailing space
  // NOTE(review): back() is undefined on an empty string; `result` looks like
  // it can be empty here if the loop breaks before any token is appended —
  // confirm and guard.
  if (result.back() == ' ' || result.back() == '\n')
  {
    result.pop_back();
  }

  // Close any open delimiters
  while (!open_delimiters.empty())
  {
    switch (open_delimiters.top())
    {
      case parentype::paren: result.append(")"); break;
      case parentype::square_bracket: result.append("]"); break;
      case parentype::asterisk: result.append("*"); break;
      case parentype::quote: result.append("\""); break;
    }

    open_delimiters.pop();
  }

  return result;
}