// _____ _ _ _____ _ _ _ _ _ _ // |_ _( ) | | |_ _| (_) | | | (_) | | | | // | | |/ _ __ ___ __ _ _ __ ___ _ _ _ __ __| | ___ _ __ ___ _ __ | | _ _ _ ___| |_ | |_ ___| |_ ___ _ __ ___ __| | // | | | '_ ` _ \ / _` | | '_ ` _ \| | | | '__/ _` |/ _ \ '__/ _ \ '__| | | | | | | / __| __| | | / __| __/ _ \ '_ \ / _ \/ _` | // _| |_ | | | | | | | (_| | | | | | | | |_| | | | (_| | __/ | | __/ |_ _| |_ | | |_| \__ \ |_ | | \__ \ || __/ | | | __/ (_| | // |_____| |_| |_| |_| \__,_| |_| |_| |_|\__,_|_| \__,_|\___|_| \___|_(_) |_____| | |\__,_|___/\__| |_|_|___/\__\___|_| |_|\___|\__,_| // _ _ __ _ _ _ _/ | __ _ _ // | | | |/ _| (_) | | (_) |__/ / _| | | | | // | |_ ___ _ __ ___ _ _ ___ ___| | |_ _ _ _ __ _ _ __ __ _| |_ _ _ __ __ _ | |_ ___ _ __ | |_| |__ _ __ ___ ___ // | __/ _ \ | '_ ` _ \| | | / __|/ _ \ | _| | | | | '__| | '_ \ / _` | __| | '_ \ / _` | | _/ _ \| '__| | __| '_ \| '__/ _ \/ _ \ // | || (_) | | | | | | | |_| \__ \ __/ | | | |_| | | | | | | | (_| | |_| | | | | (_| | | || (_) | | | |_| | | | | | __/ __/ // \__\___/ |_| |_| |_|\__, |___/\___|_|_| \__,_|_| |_|_| |_|\__,_|\__|_|_| |_|\__, | |_| \___/|_| \__|_| |_|_| \___|\___| // _ __/ | ______ _ _ _ _ _ _ __/ | _ _ // | | |___/ | ____| | | | | | | (_) (_| ) |___/ (_) | | // | |__ ___ _ __ _ _ ___ | |__ _ _ ___| | __ | |_| |__ _ ___ _|/ _ __ ___ __ _ ___ _ _ __ __ _ | |_ ___ // | '_ \ / _ \| '__| | | / __| | __| | | |/ __| |/ / | __| '_ \| / __| | | | '_ ` _ \ / _` |/ _ \| | '_ \ / _` | | __/ _ \ // | | | | (_) | | | |_| \__ \_ | | | |_| | (__| < | |_| | | | \__ \_ | | | | | | | | | (_| | (_) | | | | | (_| | | || (_) | // |_| |_|\___/|_| \__,_|___(_) |_| \__,_|\___|_|\_\ \__|_| |_|_|___(_) |_| |_| |_| |_| \__, |\___/|_|_| |_|\__, | \__\___/ // __/ | __/ | // _ _ _ _ |___/ |___/ // | | (_) | | | | | // _ __ ___ _ __ | | __ _ ___ ___ ___ __ _ __ _ __ ___| |_| |__ ___ __ _ __ _ _ __ ___ _ __ | | __ _ ___ ___ _ __ // | '__/ _ \ '_ \| |/ _` |/ __/ _ \ / _ \/ _` |/ _` | \ \ /\ / / | __| '_ \ / _ \/ _` |/ _` | | '__/ _ \ '_ \| |/ _` |/ __/ _ \ '__| // | | | __/ |_) | | (_| | (_| __/ | __/ (_| | (_| | \ V V /| | |_| | | | | __/ (_| | (_| | | | | __/ |_) | | (_| | (_| __/ | // |_| \___| .__/|_|\__,_|\___\___| \___|\__, |\__, | \_/\_/ |_|\__|_| |_| \___|\__, |\__, | |_| \___| .__/|_|\__,_|\___\___|_| // | | _ _ _ _ __/ | __/ | _ _ _ __/ | __/ | | | // |_| | | (_) | | |___/ |___(_) | | | | | |___/ |___/ |_| // __ _ _ __ __| | _ __ ___ _| | | __ __ ___| |_| |__ __ ____ _| |_ // / _` | '_ \ / _` | | '_ ` _ \| | | |/ / \ \ /\ / / | __| '_ \ \ \ /\ / / _` | __| // | (_| | | | | (_| | | | | | | | | | < \ V V /| | |_| | | | \ V V / (_| | |_ // \__,_|_| |_|\__,_| |_| |_| |_|_|_|_|\_\ \_/\_/ |_|\__|_| |_| \_/\_/ \__,_|\__| // // #include "kgramstats.h" #include #include #include #include #include #include #include "freevars.h" #include #include "prefix_search.h" query wildcardQuery {querytype::sentence}; word blank_word {""}; // runs in O(t^2) time where t is the number of tokens in the input corpus // We consider maxK to be fairly constant kgramstats::kgramstats(std::string corpus, int maxK) { this->maxK = maxK; std::vector tokens; size_t start = 0; int end = 0; std::set thashtags; freevar fv_emoticons {emoticons, "emoticons.txt"}; std::cout << "Reading emojis..." << std::endl; prefix_search emojis; std::ifstream emoji_file("emojis.txt"); if (emoji_file) { while (!emoji_file.eof()) { std::string rawmojis; getline(emoji_file, rawmojis); emojis.add(rawmojis); } emoji_file.close(); } std::cout << "Tokenizing corpus..." << std::endl; while (end != std::string::npos) { end = corpus.find(" ", start); bool emoji = false; std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); std::string t = ""; if (te.compare("") && te.compare(".")) { // Extract strings of emojis into their own tokens even if they're not space delimited int m = emojis.match(te); emoji = m > 0; if (m == 0) m = 1; t = te.substr(0,m); te = te.substr(m); while (!te.empty()) { m = emojis.match(te); if (emoji == (m > 0)) { if (m == 0) m = 1; t += te.substr(0,m); te = te.substr(m); } else { end = start + t.length() - 1; break; } } std::string tc(t), canonical; std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); }); word& w = ([&] () -> word& { // Hashtag freevar if (canonical[0] == '#') { thashtags.insert(canonical); return hashtags; } // Emoticon freevar if (emoji) { emoticons.forms.add(canonical); return emoticons; } std::string emoticon_canon; std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) { return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); }); if (fv_emoticons.check(emoticon_canon)) { emoticons.forms.add(emoticon_canon); return emoticons; } // Basically any other word if (words.count(canonical) == 0) { words.emplace(canonical, canonical); } word& tw = words.at(canonical); tw.forms.add(canonical); return tw; })(); token tk(w); tk.raw = t; for (char c : t) { if (c == '*') { tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; } else if (c == '[') { tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; } else if (c == '(') { tk.delimiters[{parentype::paren, doublestatus::opening}]++; } else if (c == '"') { tk.delimiters[{parentype::quote, doublestatus::opening}]++; } else { break; } } int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; if (backtrack != t.length()) { std::string ending = t.substr(backtrack); std::string suffix; for (char c : ending) { if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) { suffix += c; continue; } else if (c == '\n') { // At least the end is coming if (suffix.empty()) { suffix = "."; } break; } parentype pt = ([&] { switch (c) { case ']': return parentype::square_bracket; case ')': return parentype::paren; case '*': return parentype::asterisk; case '"': return parentype::quote; } })(); if (tk.delimiters[{pt, doublestatus::opening}] > 0) { tk.delimiters[{pt, doublestatus::opening}]--; tk.delimiters[{pt, doublestatus::both}]++; } else { tk.delimiters[{pt, doublestatus::closing}]++; } } if (suffix == ",") { tk.suffix = suffixtype::comma; } else if (!suffix.empty()) { tk.suffix = suffixtype::terminating; w.terms.add(suffix); } } tokens.push_back(tk); } start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); } // Time to condense the distribution stuff for the words std::cout << "Compiling token histograms..." << std::endl; for (auto& it : words) { it.second.forms.compile(); it.second.terms.compile(); } // Hashtag freevar is not frequency distributed for (auto& it : thashtags) { hashtags.forms.add(it); } hashtags.forms.compile(); hashtags.terms.compile(); // Compile other freevars emoticons.forms.compile(); emoticons.terms.compile(); // kgram distribution std::cout << "Creating markov chain..." << std::endl; std::map > tstats; for (int k=1; k open_delimiters; for (int i=0; i 0) { if (rand() % (maxK - cur.size() + 1) == 0) { while (cur.size() > 2) { if ((rand() % (n)) < cuts) { cur.pop_front(); cuts--; } else { break; } } } cuts++; } // Gotta circumvent the last line of the input corpus // https://twitter.com/starla4444/status/684222271339237376 if (stats.count(cur) == 0) { cur = kgram(1, wildcardQuery); } auto& distribution = stats[cur]; int max = distribution.rbegin()->first; int r = rand() % max; token_data& next = distribution.upper_bound(r)->second; std::string nextToken = next.tok.w.forms.next(); // Determine the casing of the next token. We randomly make the token all // caps based on the markov chain. Otherwise, we check if the previous // token is the end of a sentence (terminating token or a wildcard query). int casing = rand() % next.all; if (casing < next.uppercase) { std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); } else if ((((cur.rbegin()->type == querytype::sentence) || ((cur.rbegin()->type == querytype::literal) && (cur.rbegin()->tok.suffix == suffixtype::terminating))) && (rand() % 2 > 0)) || (casing - next.uppercase < next.titlecase)) { nextToken[0] = toupper(nextToken[0]); } // Delimiters for (auto& dt : next.tok.delimiters) { if (dt.first.status == doublestatus::both) { switch (dt.first.type) { case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break; case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break; case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break; case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break; } } else if (dt.first.status == doublestatus::opening) { for (int i=0; i \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; cur.push_back(next.tok); result.append(nextToken + " "); if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0)) { break; } } // Remove the trailing space if (result.back() == ' ') { result.pop_back(); } // Close any open delimiters while (!open_delimiters.empty()) { switch (open_delimiters.top()) { case parentype::paren: result.append(")"); break; case parentype::square_bracket: result.append("]"); break; case parentype::asterisk: result.append("*"); break; case parentype::quote: result.append("\""); break; } open_delimiters.pop(); } // Replace old-style freevars while I can't be bothered to remake the corpus yet std::vector fv_names; std::ifstream namefile("names.txt"); while (!namefile.eof()) { std::string l; getline(namefile, l); fv_names.push_back(l); } int cpos; while ((cpos = result.find("$name$")) != std::string::npos) { result.replace(cpos, 6, fv_names[rand() % fv_names.size()]); } std::vector fv_nouns; std::ifstream nounfile("nouns.txt"); while (!nounfile.eof()) { std::string l; getline(nounfile, l); fv_nouns.push_back(l); } while ((cpos = result.find("$noun$")) != std::string::npos) { result.replace(cpos, 6, fv_nouns[rand() % fv_nouns.size()]); } return result; }