diff options
| -rw-r--r-- | CMakeLists.txt | 8 | ||||
| -rw-r--r-- | ebooks.cpp | 15 | ||||
| -rw-r--r-- | freevars.cpp | 4 | ||||
| -rw-r--r-- | gen.cpp | 15 | ||||
| -rw-r--r-- | histogram.cpp | 34 | ||||
| -rw-r--r-- | histogram.h | 19 | ||||
| -rw-r--r-- | kgramstats.cpp | 453 | ||||
| -rw-r--r-- | kgramstats.h | 124 |
8 files changed, 406 insertions, 266 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt index aa63a34..41c4552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt | |||
| @@ -8,10 +8,14 @@ find_package(curl) | |||
| 8 | if (YamlCpp_FOUND AND CURL_FOUND) | 8 | if (YamlCpp_FOUND AND CURL_FOUND) |
| 9 | add_subdirectory(vendor/twitcurl/libtwitcurl) | 9 | add_subdirectory(vendor/twitcurl/libtwitcurl) |
| 10 | include_directories(vendor/twitcurl/libtwitcurl) | 10 | include_directories(vendor/twitcurl/libtwitcurl) |
| 11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp) | 11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp) |
| 12 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) | ||
| 13 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) | ||
| 12 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) | 14 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) |
| 13 | else (YamlCpp_FOUND AND CURL_FOUND) | 15 | else (YamlCpp_FOUND AND CURL_FOUND) |
| 14 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") | 16 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") |
| 15 | endif (YamlCpp_FOUND AND CURL_FOUND) | 17 | endif (YamlCpp_FOUND AND CURL_FOUND) |
| 16 | 18 | ||
| 17 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp) | 19 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp) |
| 20 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) | ||
| 21 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) | ||
| diff --git a/ebooks.cpp b/ebooks.cpp index e38ebab..ed1e080 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
| @@ -44,20 +44,9 @@ int main(int argc, char** args) | |||
| 44 | std::cout << "Generating..." << std::endl; | 44 | std::cout << "Generating..." << std::endl; |
| 45 | for (;;) | 45 | for (;;) |
| 46 | { | 46 | { |
| 47 | std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); | 47 | std::string doc = stats->randomSentence(rand() % 45 + 5); |
| 48 | std::string hi; | 48 | std::string hi = vars->parse(doc); |
| 49 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | ||
| 50 | { | ||
| 51 | hi += vars->parse(*it) + " "; | ||
| 52 | } | ||
| 53 | |||
| 54 | hi.resize(140); | 49 | hi.resize(140); |
| 55 | |||
| 56 | size_t lastperiod = hi.find_last_of(".!?,"); | ||
| 57 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
| 58 | { | ||
| 59 | hi = hi.substr(0, lastperiod+1); | ||
| 60 | } | ||
| 61 | 50 | ||
| 62 | std::string replyMsg; | 51 | std::string replyMsg; |
| 63 | if (twitter.statusUpdate(hi)) | 52 | if (twitter.statusUpdate(hi)) |
| diff --git a/freevars.cpp b/freevars.cpp index 8c3eda4..54c5aab 100644 --- a/freevars.cpp +++ b/freevars.cpp | |||
| @@ -34,8 +34,8 @@ std::string freevars::parse(std::string in) | |||
| 34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) | 34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) |
| 35 | { | 35 | { |
| 36 | std::string tofind = "$" + it->first + "$"; | 36 | std::string tofind = "$" + it->first + "$"; |
| 37 | size_t fpos = res.find(tofind); | 37 | size_t fpos; |
| 38 | if (fpos != std::string::npos) | 38 | while ((fpos = res.find(tofind)) != std::string::npos) |
| 39 | { | 39 | { |
| 40 | int r = rand() % it->second->size(); | 40 | int r = rand() % it->second->size(); |
| 41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); | 41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); |
| diff --git a/gen.cpp b/gen.cpp index 400c0a5..a0ef8e3 100644 --- a/gen.cpp +++ b/gen.cpp | |||
| @@ -52,21 +52,10 @@ int main(int argc, char** args) | |||
| 52 | std::cout << "Generating..." << std::endl; | 52 | std::cout << "Generating..." << std::endl; |
| 53 | for (;;) | 53 | for (;;) |
| 54 | { | 54 | { |
| 55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); | 55 | std::string doc = stats->randomSentence(rand() % 35 + 15); |
| 56 | std::string hi; | 56 | std::string hi = vars->parse(doc); |
| 57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | ||
| 58 | { | ||
| 59 | hi += vars->parse(*it) + " "; | ||
| 60 | } | ||
| 61 | |||
| 62 | hi.resize(140); | 57 | hi.resize(140); |
| 63 | 58 | ||
| 64 | size_t lastperiod = hi.find_last_of(".!?,"); | ||
| 65 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
| 66 | { | ||
| 67 | hi = hi.substr(0, lastperiod+1); | ||
| 68 | } | ||
| 69 | |||
| 70 | std::cout << hi << std::endl; | 59 | std::cout << hi << std::endl; |
| 71 | 60 | ||
| 72 | getc(stdin); | 61 | getc(stdin); |
| diff --git a/histogram.cpp b/histogram.cpp new file mode 100644 index 0000000..6896146 --- /dev/null +++ b/histogram.cpp | |||
| @@ -0,0 +1,34 @@ | |||
| 1 | #include "histogram.h" | ||
| 2 | #include <cstdlib> | ||
| 3 | |||
| 4 | template <class T> | ||
| 5 | void histogram<T>::add(const T& inst) | ||
| 6 | { | ||
| 7 | freqtable[inst]++; | ||
| 8 | } | ||
| 9 | |||
| 10 | template <class T> | ||
| 11 | void histogram<T>::compile() | ||
| 12 | { | ||
| 13 | distribution.clear(); | ||
| 14 | |||
| 15 | int max = 0; | ||
| 16 | for (auto& it : freqtable) | ||
| 17 | { | ||
| 18 | max += it.second; | ||
| 19 | distribution.emplace(max, it.first); | ||
| 20 | } | ||
| 21 | |||
| 22 | freqtable.clear(); | ||
| 23 | } | ||
| 24 | |||
| 25 | template <class T> | ||
| 26 | const T& histogram<T>::next() const | ||
| 27 | { | ||
| 28 | int max = distribution.rbegin()->first; | ||
| 29 | int r = rand() % max; | ||
| 30 | |||
| 31 | return distribution.upper_bound(r)->second; | ||
| 32 | } | ||
| 33 | |||
| 34 | template class histogram <std::string>; | ||
| diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000..5aa2560 --- /dev/null +++ b/histogram.h | |||
| @@ -0,0 +1,19 @@ | |||
| 1 | #ifndef HISTOGRAM_H_24094D97 | ||
| 2 | #define HISTOGRAM_H_24094D97 | ||
| 3 | |||
| 4 | #include <map> | ||
| 5 | #include <string> | ||
| 6 | |||
| 7 | template <class T> | ||
| 8 | class histogram { | ||
| 9 | public: | ||
| 10 | void add(const T& inst); | ||
| 11 | void compile(); | ||
| 12 | const T& next() const; | ||
| 13 | |||
| 14 | private: | ||
| 15 | std::map<T, int> freqtable; | ||
| 16 | std::map<int, T> distribution; | ||
| 17 | }; | ||
| 18 | |||
| 19 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ | ||
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 4bb7f15..0ab0c99 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -37,35 +37,11 @@ | |||
| 37 | #include <iostream> | 37 | #include <iostream> |
| 38 | #include <cstdlib> | 38 | #include <cstdlib> |
| 39 | #include <algorithm> | 39 | #include <algorithm> |
| 40 | #include "malaprop.h" | 40 | #include <set> |
| 41 | #include <stack> | ||
| 41 | 42 | ||
| 42 | query wildcardQuery(querytype_sentence); | 43 | query wildcardQuery {querytype::sentence}; |
| 43 | 44 | word blank_word {""}; | |
| 44 | std::string canonize(std::string f); | ||
| 45 | |||
| 46 | token token_from_string(std::string in) | ||
| 47 | { | ||
| 48 | if (in[0] == '#') | ||
| 49 | { | ||
| 50 | token word(tokentype_hashtag); | ||
| 51 | |||
| 52 | if (in.find_first_of(".?!,") != std::string::npos) | ||
| 53 | { | ||
| 54 | word.terminating = true; | ||
| 55 | } | ||
| 56 | |||
| 57 | return word; | ||
| 58 | } else { | ||
| 59 | token word(canonize(in)); | ||
| 60 | |||
| 61 | if (in.find_first_of(".?!,") != std::string::npos) | ||
| 62 | { | ||
| 63 | word.terminating = true; | ||
| 64 | } | ||
| 65 | |||
| 66 | return word; | ||
| 67 | } | ||
| 68 | } | ||
| 69 | 45 | ||
| 70 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 46 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
| 71 | // We consider maxK to be fairly constant | 47 | // We consider maxK to be fairly constant |
| @@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 73 | { | 49 | { |
| 74 | this->maxK = maxK; | 50 | this->maxK = maxK; |
| 75 | 51 | ||
| 76 | std::vector<std::string> tokens; | 52 | std::vector<token> tokens; |
| 77 | size_t start = 0; | 53 | size_t start = 0; |
| 78 | int end = 0; | 54 | int end = 0; |
| 79 | std::set<std::string> thashtags; | 55 | std::set<std::string> thashtags; |
| @@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 82 | { | 58 | { |
| 83 | end = corpus.find(" ", start); | 59 | end = corpus.find(" ", start); |
| 84 | 60 | ||
| 85 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 61 | std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
| 86 | if (token[token.length()-1] == '\n') | 62 | if (t.compare("") && t.compare(".")) |
| 87 | { | 63 | { |
| 88 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) | 64 | std::string tc(t), canonical; |
| 65 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | ||
| 66 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | ||
| 67 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
| 68 | }); | ||
| 69 | |||
| 70 | word& w = ([&] () -> word& { | ||
| 71 | // Hashtag freevar | ||
| 72 | if (canonical[0] == '#') | ||
| 73 | { | ||
| 74 | thashtags.insert(canonical); | ||
| 75 | canonical = "#hashtag"; | ||
| 76 | |||
| 77 | return hashtags; | ||
| 78 | } | ||
| 79 | |||
| 80 | // Basically any other word | ||
| 81 | if (words.count(canonical) == 0) | ||
| 82 | { | ||
| 83 | words.emplace(canonical, canonical); | ||
| 84 | } | ||
| 85 | |||
| 86 | word& tw = words.at(canonical); | ||
| 87 | tw.forms.add(canonical); | ||
| 88 | |||
| 89 | return tw; | ||
| 90 | })(); | ||
| 91 | |||
| 92 | token tk(w); | ||
| 93 | tk.raw = t; | ||
| 94 | |||
| 95 | for (char c : t) | ||
| 89 | { | 96 | { |
| 90 | token.insert(token.length()-1, "."); | 97 | if (c == '*') |
| 98 | { | ||
| 99 | tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; | ||
| 100 | } else if (c == '[') | ||
| 101 | { | ||
| 102 | tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; | ||
| 103 | } else if (c == '(') | ||
| 104 | { | ||
| 105 | tk.delimiters[{parentype::paren, doublestatus::opening}]++; | ||
| 106 | } else if (c == '"') | ||
| 107 | { | ||
| 108 | tk.delimiters[{parentype::quote, doublestatus::opening}]++; | ||
| 109 | } else { | ||
| 110 | break; | ||
| 111 | } | ||
| 91 | } | 112 | } |
| 92 | |||
| 93 | token.resize(token.length()-1); | ||
| 94 | } | ||
| 95 | |||
| 96 | if (token.compare("") && token.compare(".")) | ||
| 97 | { | ||
| 98 | mstats.addWord(token); | ||
| 99 | tokens.push_back(token); | ||
| 100 | 113 | ||
| 101 | if (token[0] == '#') | 114 | int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; |
| 115 | if (backtrack != t.length()) | ||
| 102 | { | 116 | { |
| 103 | thashtags.insert(canonize(token)); | 117 | std::string ending = t.substr(backtrack); |
| 118 | std::string suffix; | ||
| 119 | |||
| 120 | for (char c : ending) | ||
| 121 | { | ||
| 122 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) | ||
| 123 | { | ||
| 124 | suffix += c; | ||
| 125 | |||
| 126 | continue; | ||
| 127 | } else if (c == '\n') | ||
| 128 | { | ||
| 129 | // At least the end is coming | ||
| 130 | if (suffix.empty()) | ||
| 131 | { | ||
| 132 | suffix = "."; | ||
| 133 | } | ||
| 134 | |||
| 135 | break; | ||
| 136 | } | ||
| 137 | |||
| 138 | parentype pt = ([&] { | ||
| 139 | switch (c) | ||
| 140 | { | ||
| 141 | case ']': return parentype::square_bracket; | ||
| 142 | case ')': return parentype::paren; | ||
| 143 | case '*': return parentype::asterisk; | ||
| 144 | case '"': return parentype::quote; | ||
| 145 | } | ||
| 146 | })(); | ||
| 147 | |||
| 148 | if (tk.delimiters[{pt, doublestatus::opening}] > 0) | ||
| 149 | { | ||
| 150 | tk.delimiters[{pt, doublestatus::opening}]--; | ||
| 151 | tk.delimiters[{pt, doublestatus::both}]++; | ||
| 152 | } else { | ||
| 153 | tk.delimiters[{pt, doublestatus::closing}]++; | ||
| 154 | } | ||
| 155 | } | ||
| 156 | |||
| 157 | if (suffix == ",") | ||
| 158 | { | ||
| 159 | tk.suffix = suffixtype::comma; | ||
| 160 | } else if (!suffix.empty()) { | ||
| 161 | tk.suffix = suffixtype::terminating; | ||
| 162 | |||
| 163 | w.terms.add(suffix); | ||
| 164 | } | ||
| 104 | } | 165 | } |
| 166 | |||
| 167 | tokens.push_back(tk); | ||
| 105 | } | 168 | } |
| 106 | 169 | ||
| 107 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 170 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
| 108 | } | 171 | } |
| 109 | 172 | ||
| 110 | for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) | 173 | // Time to condense the distribution stuff for the words |
| 174 | for (auto& it : words) | ||
| 111 | { | 175 | { |
| 112 | hashtags.push_back(*it); | 176 | it.second.forms.compile(); |
| 177 | it.second.terms.compile(); | ||
| 113 | } | 178 | } |
| 114 | 179 | ||
| 180 | // Hashtag freevar is not frequency distributed | ||
| 181 | for (auto& it : thashtags) | ||
| 182 | { | ||
| 183 | hashtags.forms.add(it); | ||
| 184 | } | ||
| 185 | |||
| 186 | hashtags.forms.compile(); | ||
| 187 | hashtags.terms.compile(); | ||
| 188 | |||
| 189 | // kgram distribution | ||
| 115 | std::map<kgram, std::map<token, token_data> > tstats; | 190 | std::map<kgram, std::map<token, token_data> > tstats; |
| 116 | std::map<token, std::map<termstats, int> > tendings; | ||
| 117 | for (int k=1; k<maxK; k++) | 191 | for (int k=1; k<maxK; k++) |
| 118 | { | 192 | { |
| 119 | for (int i=0; i<(tokens.size() - k); i++) | 193 | for (int i=0; i<(tokens.size() - k); i++) |
| 120 | { | 194 | { |
| 121 | std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); | 195 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); |
| 122 | kgram prefix; | 196 | token f = tokens[i+k]; |
| 123 | 197 | ||
| 124 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) | 198 | if (tstats[prefix].count(f) == 0) |
| 125 | { | ||
| 126 | prefix.push_back(token_from_string(*it)); | ||
| 127 | } | ||
| 128 | |||
| 129 | std::string f = tokens[i+k]; | ||
| 130 | std::string canonical = canonize(f); | ||
| 131 | |||
| 132 | token word(token_from_string(canonical)); | ||
| 133 | if (f.find_first_of(".?!,") != std::string::npos) | ||
| 134 | { | 199 | { |
| 135 | word.terminating = true; | 200 | tstats[prefix].emplace(f, f); |
| 136 | |||
| 137 | char terminator = f[f.find_last_of(".?!,")]; | ||
| 138 | int occurrences = std::count(f.begin(), f.end(), terminator); | ||
| 139 | |||
| 140 | tendings[word][termstats(terminator, occurrences)]++; | ||
| 141 | } | 201 | } |
| 142 | 202 | ||
| 143 | token_data& td = tstats[prefix][word]; | 203 | token_data& td = tstats[prefix].at(f); |
| 144 | td.word = word; | ||
| 145 | td.all++; | 204 | td.all++; |
| 146 | 205 | ||
| 147 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 206 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
| 148 | { | 207 | { |
| 149 | td.uppercase++; | 208 | td.uppercase++; |
| 150 | } else if (isupper(f[0])) | 209 | } else if (isupper(f.raw[0])) |
| 151 | { | 210 | { |
| 152 | td.titlecase++; | 211 | td.titlecase++; |
| 153 | } | 212 | } |
| 154 | 213 | ||
| 155 | if (prefix.front().word.terminating) | 214 | kgram term_prefix; |
| 215 | bool changed = false; | ||
| 216 | std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) { | ||
| 217 | if (q.tok.suffix == suffixtype::terminating) | ||
| 218 | { | ||
| 219 | changed = true; | ||
| 220 | |||
| 221 | return wildcardQuery; | ||
| 222 | } else { | ||
| 223 | return q; | ||
| 224 | } | ||
| 225 | }); | ||
| 226 | |||
| 227 | if (changed) | ||
| 156 | { | 228 | { |
| 157 | prefix.front() = wildcardQuery; | 229 | if (tstats[term_prefix].count(f) == 0) |
| 230 | { | ||
| 231 | tstats[term_prefix].emplace(f, f); | ||
| 232 | } | ||
| 158 | 233 | ||
| 159 | token_data& td2 = tstats[prefix][word]; | 234 | token_data& td2 = tstats[term_prefix].at(f); |
| 160 | td2.word = word; | ||
| 161 | td2.all++; | 235 | td2.all++; |
| 162 | 236 | ||
| 163 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 237 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
| 164 | { | 238 | { |
| 165 | td2.uppercase++; | 239 | td2.uppercase++; |
| 166 | } else if (isupper(f[0])) | 240 | } else if (isupper(f.raw[0])) |
| 167 | { | 241 | { |
| 168 | td2.titlecase++; | 242 | td2.titlecase++; |
| 169 | } | 243 | } |
| @@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 171 | } | 245 | } |
| 172 | } | 246 | } |
| 173 | 247 | ||
| 174 | for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) | 248 | // Condense the kgram distribution |
| 249 | for (auto& it : tstats) | ||
| 175 | { | 250 | { |
| 176 | kgram klist = it->first; | 251 | kgram klist = it.first; |
| 177 | std::map<token, token_data>& probtable = it->second; | 252 | auto& probtable = it.second; |
| 178 | std::map<int, token_data>& distribution = stats[klist]; | 253 | auto& distribution = stats[klist]; |
| 179 | int max = 0; | 254 | int max = 0; |
| 180 | 255 | ||
| 181 | for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | 256 | for (auto& kt : probtable) |
| 182 | { | 257 | { |
| 183 | max += kt->second.all; | 258 | max += kt.second.all; |
| 184 | 259 | ||
| 185 | distribution[max] = kt->second; | 260 | distribution.emplace(max, kt.second); |
| 186 | } | ||
| 187 | } | ||
| 188 | |||
| 189 | for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++) | ||
| 190 | { | ||
| 191 | token word = it->first; | ||
| 192 | std::map<termstats, int>& probtable = it->second; | ||
| 193 | std::map<int, termstats>& distribution = endings[word]; | ||
| 194 | int max = 0; | ||
| 195 | |||
| 196 | for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | ||
| 197 | { | ||
| 198 | max += kt->second; | ||
| 199 | |||
| 200 | distribution[max] = kt->first; | ||
| 201 | } | 261 | } |
| 202 | } | 262 | } |
| 203 | } | 263 | } |
| 204 | 264 | ||
| 205 | void printKgram(kgram k) | 265 | void printKgram(kgram k) |
| 206 | { | 266 | { |
| 207 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 267 | for (auto& q : k) |
| 208 | { | 268 | { |
| 209 | query& q = *it; | 269 | if (q.type == querytype::sentence) |
| 210 | if (q.type == querytype_sentence) | ||
| 211 | { | 270 | { |
| 212 | std::cout << "#.# "; | 271 | std::cout << "#.# "; |
| 213 | } else if (q.type == querytype_literal) | 272 | } else if (q.type == querytype::literal) |
| 214 | { | 273 | { |
| 215 | if (q.word.type == tokentype_hashtag) | 274 | if (q.tok.suffix == suffixtype::terminating) |
| 216 | { | 275 | { |
| 217 | if (q.word.terminating) | 276 | std::cout << q.tok.w.canon << ". "; |
| 218 | { | 277 | } else if (q.tok.suffix == suffixtype::comma) |
| 219 | std::cout << "#hashtag. "; | ||
| 220 | } else { | ||
| 221 | std::cout << "#hashtag "; | ||
| 222 | } | ||
| 223 | } else if (q.word.type == tokentype_literal) | ||
| 224 | { | 278 | { |
| 225 | if (q.word.terminating) | 279 | std::cout << q.tok.w.canon << ", "; |
| 226 | { | 280 | } else { |
| 227 | std::cout << q.word.canon << ". "; | 281 | std::cout << q.tok.w.canon << " "; |
| 228 | } else { | ||
| 229 | std::cout << q.word.canon << " "; | ||
| 230 | } | ||
| 231 | } | 282 | } |
| 232 | } | 283 | } |
| 233 | } | 284 | } |
| 234 | } | 285 | } |
| 235 | 286 | ||
| 236 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 287 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
| 237 | std::vector<std::string> kgramstats::randomSentence(int n) | 288 | std::string kgramstats::randomSentence(int n) |
| 238 | { | 289 | { |
| 239 | std::vector<std::string> result; | 290 | std::string result; |
| 240 | kgram cur(1, wildcardQuery); | 291 | kgram cur(1, wildcardQuery); |
| 241 | int cuts = 0; | 292 | int cuts = 0; |
| 293 | std::stack<parentype> open_delimiters; | ||
| 242 | 294 | ||
| 243 | for (int i=0; i<n; i++) | 295 | for (int i=0; i<n; i++) |
| 244 | { | 296 | { |
| @@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 273 | cur = kgram(1, wildcardQuery); | 325 | cur = kgram(1, wildcardQuery); |
| 274 | } | 326 | } |
| 275 | 327 | ||
| 276 | std::map<int, token_data>& distribution = stats[cur]; | 328 | auto& distribution = stats[cur]; |
| 277 | int max = distribution.rbegin()->first; | 329 | int max = distribution.rbegin()->first; |
| 278 | int r = rand() % max; | 330 | int r = rand() % max; |
| 279 | token_data& next = distribution.upper_bound(r)->second; | 331 | token_data& next = distribution.upper_bound(r)->second; |
| 280 | std::string nextToken; | 332 | std::string nextToken = next.tok.w.forms.next(); |
| 281 | bool mess = false; | 333 | |
| 282 | 334 | // Determine the casing of the next token. We randomly make the token all | |
| 283 | if (next.word.type == tokentype_literal) | 335 | // caps based on the markov chain. Otherwise, we check if the previous |
| 336 | // token is the end of a sentence (terminating token or a wildcard query). | ||
| 337 | int casing = rand() % next.all; | ||
| 338 | if (casing < next.uppercase) | ||
| 339 | { | ||
| 340 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
| 341 | } else if ((((cur.rbegin()->type == querytype::sentence) | ||
| 342 | || ((cur.rbegin()->type == querytype::literal) | ||
| 343 | && (cur.rbegin()->tok.suffix == suffixtype::terminating))) | ||
| 344 | && (rand() % 2 > 0)) | ||
| 345 | || (casing - next.uppercase < next.titlecase)) | ||
| 284 | { | 346 | { |
| 285 | nextToken = next.word.canon; | 347 | nextToken[0] = toupper(nextToken[0]); |
| 348 | } | ||
| 286 | 349 | ||
| 287 | mess = (rand() % 100) == 0; | 350 | // Delimiters |
| 288 | if (mess) | 351 | for (auto& dt : next.tok.delimiters) |
| 352 | { | ||
| 353 | if (dt.first.status == doublestatus::both) | ||
| 289 | { | 354 | { |
| 290 | nextToken = mstats.alternate(nextToken); | 355 | switch (dt.first.type) |
| 291 | } | 356 | { |
| 292 | 357 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break; | |
| 293 | // Determine the casing of the next token. We randomly make the token all | 358 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break; |
| 294 | // caps based on the markov chain. Otherwise, we check if the previous | 359 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break; |
| 295 | // token is the end of a sentence (terminating token or a wildcard query). | 360 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break; |
| 296 | int casing = rand() % next.all; | 361 | } |
| 297 | if (casing < next.uppercase) | 362 | } else if (dt.first.status == doublestatus::opening) |
| 298 | { | 363 | { |
| 299 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 364 | for (int i=0; i<dt.second; i++) |
| 300 | } else if ((((cur.rbegin()->type == querytype_sentence) | 365 | { |
| 301 | || ((cur.rbegin()->type == querytype_literal) | 366 | open_delimiters.push(dt.first.type); |
| 302 | && (cur.rbegin()->word.terminating))) | 367 | } |
| 303 | && (rand() % 2 > 0)) | 368 | |
| 304 | || (casing - next.uppercase < next.titlecase)) | 369 | switch (dt.first.type) |
| 370 | { | ||
| 371 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break; | ||
| 372 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break; | ||
| 373 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break; | ||
| 374 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break; | ||
| 375 | } | ||
| 376 | } else if (dt.first.status == doublestatus::closing) | ||
| 305 | { | 377 | { |
| 306 | nextToken[0] = toupper(nextToken[0]); | 378 | for (int i=0; i<dt.second; i++) |
| 379 | { | ||
| 380 | while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type)) | ||
| 381 | { | ||
| 382 | switch (open_delimiters.top()) | ||
| 383 | { | ||
| 384 | case parentype::paren: nextToken.append(")"); break; | ||
| 385 | case parentype::square_bracket: nextToken.append("]"); break; | ||
| 386 | case parentype::asterisk: nextToken.append("*"); break; | ||
| 387 | case parentype::quote: nextToken.append("\""); break; | ||
| 388 | } | ||
| 389 | |||
| 390 | open_delimiters.pop(); | ||
| 391 | } | ||
| 392 | |||
| 393 | if (open_delimiters.empty()) | ||
| 394 | { | ||
| 395 | switch (dt.first.type) | ||
| 396 | { | ||
| 397 | case parentype::paren: result = "(" + result; break; | ||
| 398 | case parentype::square_bracket: result = "[" + result; break; | ||
| 399 | case parentype::asterisk: result = "*" + result; break; | ||
| 400 | case parentype::quote: result = "\"" + result; break; | ||
| 401 | } | ||
| 402 | } | ||
| 403 | |||
| 404 | switch (dt.first.type) | ||
| 405 | { | ||
| 406 | case parentype::paren: nextToken.append(")"); break; | ||
| 407 | case parentype::square_bracket: nextToken.append("]"); break; | ||
| 408 | case parentype::asterisk: nextToken.append("*"); break; | ||
| 409 | case parentype::quote: nextToken.append("\""); break; | ||
| 410 | } | ||
| 411 | } | ||
| 307 | } | 412 | } |
| 308 | } else if (next.word.type == tokentype_hashtag) | ||
| 309 | { | ||
| 310 | int rhash = rand() % hashtags.size(); | ||
| 311 | nextToken = hashtags[rhash]; | ||
| 312 | } | 413 | } |
| 313 | 414 | ||
| 314 | if (next.word.terminating) | 415 | // Terminators |
| 416 | if (next.tok.suffix == suffixtype::terminating) | ||
| 315 | { | 417 | { |
| 316 | std::map<int, termstats>& ending = endings[next.word]; | 418 | nextToken.append(next.tok.w.terms.next()); |
| 317 | int emax = ending.rbegin()->first; | 419 | } else if (next.tok.suffix == suffixtype::comma) |
| 318 | int er = rand() % emax; | 420 | { |
| 319 | termstats& nextend = ending.upper_bound(er)->second; | 421 | nextToken.append(","); |
| 320 | |||
| 321 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); | ||
| 322 | } | 422 | } |
| 323 | 423 | ||
| 324 | /* DEBUG */ | 424 | /* DEBUG */ |
| 325 | printKgram(cur); | 425 | printKgram(cur); |
| 426 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | ||
| 427 | |||
| 428 | cur.push_back(next.tok); | ||
| 326 | 429 | ||
| 327 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; | 430 | result.append(nextToken + " "); |
| 328 | 431 | ||
| 329 | if (mess) | 432 | if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0)) |
| 330 | { | 433 | { |
| 331 | std::cout << " mala " << next.word.canon; | 434 | break; |
| 332 | } | 435 | } |
| 333 | |||
| 334 | std::cout << std::endl; | ||
| 335 | |||
| 336 | cur.push_back(next.word); | ||
| 337 | |||
| 338 | result.push_back(nextToken); | ||
| 339 | } | 436 | } |
| 340 | |||
| 341 | return result; | ||
| 342 | } | ||
| 343 | |||
| 344 | bool removeIf(char c) | ||
| 345 | { | ||
| 346 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/); | ||
| 347 | } | ||
| 348 | |||
| 349 | std::string canonize(std::string f) | ||
| 350 | { | ||
| 351 | std::string canonical(f); | ||
| 352 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | ||
| 353 | 437 | ||
| 354 | std::string result; | 438 | // Remove the trailing space |
| 355 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 439 | if (result.back() == ' ') |
| 440 | { | ||
| 441 | result.pop_back(); | ||
| 442 | } | ||
| 443 | |||
| 444 | // Close any open delimiters | ||
| 445 | while (!open_delimiters.empty()) | ||
| 446 | { | ||
| 447 | switch (open_delimiters.top()) | ||
| 448 | { | ||
| 449 | case parentype::paren: result.append(")"); break; | ||
| 450 | case parentype::square_bracket: result.append("]"); break; | ||
| 451 | case parentype::asterisk: result.append("*"); break; | ||
| 452 | case parentype::quote: result.append("\""); break; | ||
| 453 | } | ||
| 454 | |||
| 455 | open_delimiters.pop(); | ||
| 456 | } | ||
| 356 | 457 | ||
| 357 | return result; | 458 | return result; |
| 358 | } | 459 | } |
| diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -2,61 +2,89 @@ | |||
| 2 | #include <map> | 2 | #include <map> |
| 3 | #include <list> | 3 | #include <list> |
| 4 | #include <vector> | 4 | #include <vector> |
| 5 | #include "malaprop.h" | 5 | #include "histogram.h" |
| 6 | 6 | ||
| 7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
| 8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
| 9 | 9 | ||
| 10 | enum tokentype { | 10 | struct word { |
| 11 | tokentype_literal, | 11 | std::string canon; |
| 12 | tokentype_hashtag | 12 | histogram<std::string> forms; |
| 13 | histogram<std::string> terms; | ||
| 14 | |||
| 15 | word(std::string canon) : canon(canon) {} | ||
| 16 | |||
| 17 | bool operator<(const word& other) const | ||
| 18 | { | ||
| 19 | return canon < other.canon; | ||
| 20 | } | ||
| 13 | }; | 21 | }; |
| 14 | 22 | ||
| 15 | struct token { | 23 | extern word blank_word; |
| 16 | tokentype type; | 24 | |
| 17 | std::string canon; | 25 | enum class suffixtype { |
| 18 | bool terminating; | 26 | none, |
| 27 | terminating, | ||
| 28 | comma | ||
| 29 | }; | ||
| 30 | |||
| 31 | enum class parentype { | ||
| 32 | paren, | ||
| 33 | square_bracket, | ||
| 34 | asterisk, | ||
| 35 | quote | ||
| 36 | }; | ||
| 37 | |||
| 38 | enum class doublestatus { | ||
| 39 | opening, | ||
| 40 | closing, | ||
| 41 | both | ||
| 42 | }; | ||
| 43 | |||
| 44 | struct delimiter { | ||
| 45 | parentype type; | ||
| 46 | doublestatus status; | ||
| 47 | |||
| 48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | ||
| 19 | 49 | ||
| 20 | token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} | 50 | bool operator<(const delimiter& other) const |
| 21 | token(tokentype type) : type(type), canon(""), terminating(false) {} | 51 | { |
| 52 | return std::tie(type, status) < std::tie(other.type, other.status); | ||
| 53 | } | ||
| 54 | }; | ||
| 55 | |||
| 56 | struct token { | ||
| 57 | const word& w; | ||
| 58 | std::map<delimiter, int> delimiters; | ||
| 59 | suffixtype suffix; | ||
| 60 | std::string raw; | ||
| 61 | |||
| 62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | ||
| 22 | 63 | ||
| 23 | bool operator<(const token& other) const | 64 | bool operator<(const token& other) const |
| 24 | { | 65 | { |
| 25 | if (type != other.type) | 66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
| 26 | { | ||
| 27 | return type < other.type; | ||
| 28 | } else if (type == tokentype_literal) | ||
| 29 | { | ||
| 30 | if (canon == other.canon) | ||
| 31 | { | ||
| 32 | return !terminating && other.terminating; | ||
| 33 | } else { | ||
| 34 | return canon < other.canon; | ||
| 35 | } | ||
| 36 | } else { | ||
| 37 | return !terminating && other.terminating; | ||
| 38 | } | ||
| 39 | } | 67 | } |
| 40 | }; | 68 | }; |
| 41 | 69 | ||
| 42 | enum querytype { | 70 | enum class querytype { |
| 43 | querytype_literal, | 71 | literal, |
| 44 | querytype_sentence | 72 | sentence |
| 45 | }; | 73 | }; |
| 46 | 74 | ||
| 47 | struct query { | 75 | struct query { |
| 48 | querytype type; | 76 | querytype type; |
| 49 | token word; | 77 | token tok; |
| 50 | 78 | ||
| 51 | query(token word) : word(word), type(querytype_literal) {} | 79 | query(token tok) : tok(tok), type(querytype::literal) {} |
| 52 | 80 | ||
| 53 | query(querytype type) : word(""), type(type) {} | 81 | query(querytype type) : tok(blank_word), type(type) {} |
| 54 | 82 | ||
| 55 | bool operator<(const query& other) const | 83 | bool operator<(const query& other) const |
| 56 | { | 84 | { |
| 57 | if (type == other.type) | 85 | if (type == other.type) |
| 58 | { | 86 | { |
| 59 | return word < other.word; | 87 | return tok < other.tok; |
| 60 | } else { | 88 | } else { |
| 61 | return type < other.type; | 89 | return type < other.type; |
| 62 | } | 90 | } |
| @@ -65,34 +93,11 @@ struct query { | |||
| 65 | 93 | ||
| 66 | typedef std::list<query> kgram; | 94 | typedef std::list<query> kgram; |
| 67 | 95 | ||
| 68 | struct termstats { | ||
| 69 | char terminator; | ||
| 70 | int occurrences; | ||
| 71 | |||
| 72 | termstats() : terminator('.'), occurrences(1) {} | ||
| 73 | |||
| 74 | termstats(char terminator, int occurrences) | ||
| 75 | { | ||
| 76 | this->terminator = terminator; | ||
| 77 | this->occurrences = occurrences; | ||
| 78 | } | ||
| 79 | |||
| 80 | bool operator<(const termstats& other) const | ||
| 81 | { | ||
| 82 | if (terminator == other.terminator) | ||
| 83 | { | ||
| 84 | return occurrences < other.occurrences; | ||
| 85 | } else { | ||
| 86 | return terminator < other.terminator; | ||
| 87 | } | ||
| 88 | } | ||
| 89 | }; | ||
| 90 | |||
| 91 | class kgramstats | 96 | class kgramstats |
| 92 | { | 97 | { |
| 93 | public: | 98 | public: |
| 94 | kgramstats(std::string corpus, int maxK); | 99 | kgramstats(std::string corpus, int maxK); |
| 95 | std::vector<std::string> randomSentence(int n); | 100 | std::string randomSentence(int n); |
| 96 | 101 | ||
| 97 | private: | 102 | private: |
| 98 | struct token_data | 103 | struct token_data |
| @@ -100,16 +105,15 @@ private: | |||
| 100 | int all; | 105 | int all; |
| 101 | int titlecase; | 106 | int titlecase; |
| 102 | int uppercase; | 107 | int uppercase; |
| 103 | token word; | 108 | token tok; |
| 104 | 109 | ||
| 105 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} | 110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
| 106 | }; | 111 | }; |
| 107 | 112 | ||
| 108 | int maxK; | 113 | int maxK; |
| 109 | std::map<kgram, std::map<int, token_data> > stats; | 114 | std::map<kgram, std::map<int, token_data> > stats; |
| 110 | malaprop mstats; | 115 | word hashtags {"#hashtag"}; |
| 111 | std::map<token, std::map<int, termstats> > endings; | 116 | std::map<std::string, word> words; |
| 112 | std::vector<std::string> hashtags; | ||
| 113 | }; | 117 | }; |
| 114 | 118 | ||
| 115 | void printKgram(kgram k); | 119 | void printKgram(kgram k); |
