diff options
-rw-r--r-- | CMakeLists.txt | 8 | ||||
-rw-r--r-- | ebooks.cpp | 15 | ||||
-rw-r--r-- | freevars.cpp | 4 | ||||
-rw-r--r-- | gen.cpp | 15 | ||||
-rw-r--r-- | histogram.cpp | 34 | ||||
-rw-r--r-- | histogram.h | 19 | ||||
-rw-r--r-- | kgramstats.cpp | 453 | ||||
-rw-r--r-- | kgramstats.h | 124 |
8 files changed, 406 insertions, 266 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index aa63a34..41c4552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt | |||
@@ -8,10 +8,14 @@ find_package(curl) | |||
8 | if (YamlCpp_FOUND AND CURL_FOUND) | 8 | if (YamlCpp_FOUND AND CURL_FOUND) |
9 | add_subdirectory(vendor/twitcurl/libtwitcurl) | 9 | add_subdirectory(vendor/twitcurl/libtwitcurl) |
10 | include_directories(vendor/twitcurl/libtwitcurl) | 10 | include_directories(vendor/twitcurl/libtwitcurl) |
11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp) | 11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp) |
12 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) | ||
13 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) | ||
12 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) | 14 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) |
13 | else (YamlCpp_FOUND AND CURL_FOUND) | 15 | else (YamlCpp_FOUND AND CURL_FOUND) |
14 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") | 16 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") |
15 | endif (YamlCpp_FOUND AND CURL_FOUND) | 17 | endif (YamlCpp_FOUND AND CURL_FOUND) |
16 | 18 | ||
17 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp) | 19 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp) |
20 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) | ||
21 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) | ||
diff --git a/ebooks.cpp b/ebooks.cpp index e38ebab..ed1e080 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
@@ -44,20 +44,9 @@ int main(int argc, char** args) | |||
44 | std::cout << "Generating..." << std::endl; | 44 | std::cout << "Generating..." << std::endl; |
45 | for (;;) | 45 | for (;;) |
46 | { | 46 | { |
47 | std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); | 47 | std::string doc = stats->randomSentence(rand() % 45 + 5); |
48 | std::string hi; | 48 | std::string hi = vars->parse(doc); |
49 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | ||
50 | { | ||
51 | hi += vars->parse(*it) + " "; | ||
52 | } | ||
53 | |||
54 | hi.resize(140); | 49 | hi.resize(140); |
55 | |||
56 | size_t lastperiod = hi.find_last_of(".!?,"); | ||
57 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
58 | { | ||
59 | hi = hi.substr(0, lastperiod+1); | ||
60 | } | ||
61 | 50 | ||
62 | std::string replyMsg; | 51 | std::string replyMsg; |
63 | if (twitter.statusUpdate(hi)) | 52 | if (twitter.statusUpdate(hi)) |
diff --git a/freevars.cpp b/freevars.cpp index 8c3eda4..54c5aab 100644 --- a/freevars.cpp +++ b/freevars.cpp | |||
@@ -34,8 +34,8 @@ std::string freevars::parse(std::string in) | |||
34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) | 34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) |
35 | { | 35 | { |
36 | std::string tofind = "$" + it->first + "$"; | 36 | std::string tofind = "$" + it->first + "$"; |
37 | size_t fpos = res.find(tofind); | 37 | size_t fpos; |
38 | if (fpos != std::string::npos) | 38 | while ((fpos = res.find(tofind)) != std::string::npos) |
39 | { | 39 | { |
40 | int r = rand() % it->second->size(); | 40 | int r = rand() % it->second->size(); |
41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); | 41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); |
diff --git a/gen.cpp b/gen.cpp index 400c0a5..a0ef8e3 100644 --- a/gen.cpp +++ b/gen.cpp | |||
@@ -52,21 +52,10 @@ int main(int argc, char** args) | |||
52 | std::cout << "Generating..." << std::endl; | 52 | std::cout << "Generating..." << std::endl; |
53 | for (;;) | 53 | for (;;) |
54 | { | 54 | { |
55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); | 55 | std::string doc = stats->randomSentence(rand() % 35 + 15); |
56 | std::string hi; | 56 | std::string hi = vars->parse(doc); |
57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | ||
58 | { | ||
59 | hi += vars->parse(*it) + " "; | ||
60 | } | ||
61 | |||
62 | hi.resize(140); | 57 | hi.resize(140); |
63 | 58 | ||
64 | size_t lastperiod = hi.find_last_of(".!?,"); | ||
65 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
66 | { | ||
67 | hi = hi.substr(0, lastperiod+1); | ||
68 | } | ||
69 | |||
70 | std::cout << hi << std::endl; | 59 | std::cout << hi << std::endl; |
71 | 60 | ||
72 | getc(stdin); | 61 | getc(stdin); |
diff --git a/histogram.cpp b/histogram.cpp new file mode 100644 index 0000000..6896146 --- /dev/null +++ b/histogram.cpp | |||
@@ -0,0 +1,34 @@ | |||
1 | #include "histogram.h" | ||
2 | #include <cstdlib> | ||
3 | |||
4 | template <class T> | ||
5 | void histogram<T>::add(const T& inst) | ||
6 | { | ||
7 | freqtable[inst]++; | ||
8 | } | ||
9 | |||
10 | template <class T> | ||
11 | void histogram<T>::compile() | ||
12 | { | ||
13 | distribution.clear(); | ||
14 | |||
15 | int max = 0; | ||
16 | for (auto& it : freqtable) | ||
17 | { | ||
18 | max += it.second; | ||
19 | distribution.emplace(max, it.first); | ||
20 | } | ||
21 | |||
22 | freqtable.clear(); | ||
23 | } | ||
24 | |||
25 | template <class T> | ||
26 | const T& histogram<T>::next() const | ||
27 | { | ||
28 | int max = distribution.rbegin()->first; | ||
29 | int r = rand() % max; | ||
30 | |||
31 | return distribution.upper_bound(r)->second; | ||
32 | } | ||
33 | |||
34 | template class histogram <std::string>; | ||
diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000..5aa2560 --- /dev/null +++ b/histogram.h | |||
@@ -0,0 +1,19 @@ | |||
1 | #ifndef HISTOGRAM_H_24094D97 | ||
2 | #define HISTOGRAM_H_24094D97 | ||
3 | |||
4 | #include <map> | ||
5 | #include <string> | ||
6 | |||
7 | template <class T> | ||
8 | class histogram { | ||
9 | public: | ||
10 | void add(const T& inst); | ||
11 | void compile(); | ||
12 | const T& next() const; | ||
13 | |||
14 | private: | ||
15 | std::map<T, int> freqtable; | ||
16 | std::map<int, T> distribution; | ||
17 | }; | ||
18 | |||
19 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ | ||
diff --git a/kgramstats.cpp b/kgramstats.cpp index 4bb7f15..0ab0c99 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -37,35 +37,11 @@ | |||
37 | #include <iostream> | 37 | #include <iostream> |
38 | #include <cstdlib> | 38 | #include <cstdlib> |
39 | #include <algorithm> | 39 | #include <algorithm> |
40 | #include "malaprop.h" | 40 | #include <set> |
41 | #include <stack> | ||
41 | 42 | ||
42 | query wildcardQuery(querytype_sentence); | 43 | query wildcardQuery {querytype::sentence}; |
43 | 44 | word blank_word {""}; | |
44 | std::string canonize(std::string f); | ||
45 | |||
46 | token token_from_string(std::string in) | ||
47 | { | ||
48 | if (in[0] == '#') | ||
49 | { | ||
50 | token word(tokentype_hashtag); | ||
51 | |||
52 | if (in.find_first_of(".?!,") != std::string::npos) | ||
53 | { | ||
54 | word.terminating = true; | ||
55 | } | ||
56 | |||
57 | return word; | ||
58 | } else { | ||
59 | token word(canonize(in)); | ||
60 | |||
61 | if (in.find_first_of(".?!,") != std::string::npos) | ||
62 | { | ||
63 | word.terminating = true; | ||
64 | } | ||
65 | |||
66 | return word; | ||
67 | } | ||
68 | } | ||
69 | 45 | ||
70 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 46 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
71 | // We consider maxK to be fairly constant | 47 | // We consider maxK to be fairly constant |
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
73 | { | 49 | { |
74 | this->maxK = maxK; | 50 | this->maxK = maxK; |
75 | 51 | ||
76 | std::vector<std::string> tokens; | 52 | std::vector<token> tokens; |
77 | size_t start = 0; | 53 | size_t start = 0; |
78 | int end = 0; | 54 | int end = 0; |
79 | std::set<std::string> thashtags; | 55 | std::set<std::string> thashtags; |
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
82 | { | 58 | { |
83 | end = corpus.find(" ", start); | 59 | end = corpus.find(" ", start); |
84 | 60 | ||
85 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 61 | std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
86 | if (token[token.length()-1] == '\n') | 62 | if (t.compare("") && t.compare(".")) |
87 | { | 63 | { |
88 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) | 64 | std::string tc(t), canonical; |
65 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | ||
66 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | ||
67 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
68 | }); | ||
69 | |||
70 | word& w = ([&] () -> word& { | ||
71 | // Hashtag freevar | ||
72 | if (canonical[0] == '#') | ||
73 | { | ||
74 | thashtags.insert(canonical); | ||
75 | canonical = "#hashtag"; | ||
76 | |||
77 | return hashtags; | ||
78 | } | ||
79 | |||
80 | // Basically any other word | ||
81 | if (words.count(canonical) == 0) | ||
82 | { | ||
83 | words.emplace(canonical, canonical); | ||
84 | } | ||
85 | |||
86 | word& tw = words.at(canonical); | ||
87 | tw.forms.add(canonical); | ||
88 | |||
89 | return tw; | ||
90 | })(); | ||
91 | |||
92 | token tk(w); | ||
93 | tk.raw = t; | ||
94 | |||
95 | for (char c : t) | ||
89 | { | 96 | { |
90 | token.insert(token.length()-1, "."); | 97 | if (c == '*') |
98 | { | ||
99 | tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; | ||
100 | } else if (c == '[') | ||
101 | { | ||
102 | tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; | ||
103 | } else if (c == '(') | ||
104 | { | ||
105 | tk.delimiters[{parentype::paren, doublestatus::opening}]++; | ||
106 | } else if (c == '"') | ||
107 | { | ||
108 | tk.delimiters[{parentype::quote, doublestatus::opening}]++; | ||
109 | } else { | ||
110 | break; | ||
111 | } | ||
91 | } | 112 | } |
92 | |||
93 | token.resize(token.length()-1); | ||
94 | } | ||
95 | |||
96 | if (token.compare("") && token.compare(".")) | ||
97 | { | ||
98 | mstats.addWord(token); | ||
99 | tokens.push_back(token); | ||
100 | 113 | ||
101 | if (token[0] == '#') | 114 | int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; |
115 | if (backtrack != t.length()) | ||
102 | { | 116 | { |
103 | thashtags.insert(canonize(token)); | 117 | std::string ending = t.substr(backtrack); |
118 | std::string suffix; | ||
119 | |||
120 | for (char c : ending) | ||
121 | { | ||
122 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) | ||
123 | { | ||
124 | suffix += c; | ||
125 | |||
126 | continue; | ||
127 | } else if (c == '\n') | ||
128 | { | ||
129 | // At least the end is coming | ||
130 | if (suffix.empty()) | ||
131 | { | ||
132 | suffix = "."; | ||
133 | } | ||
134 | |||
135 | break; | ||
136 | } | ||
137 | |||
138 | parentype pt = ([&] { | ||
139 | switch (c) | ||
140 | { | ||
141 | case ']': return parentype::square_bracket; | ||
142 | case ')': return parentype::paren; | ||
143 | case '*': return parentype::asterisk; | ||
144 | case '"': return parentype::quote; | ||
145 | } | ||
146 | })(); | ||
147 | |||
148 | if (tk.delimiters[{pt, doublestatus::opening}] > 0) | ||
149 | { | ||
150 | tk.delimiters[{pt, doublestatus::opening}]--; | ||
151 | tk.delimiters[{pt, doublestatus::both}]++; | ||
152 | } else { | ||
153 | tk.delimiters[{pt, doublestatus::closing}]++; | ||
154 | } | ||
155 | } | ||
156 | |||
157 | if (suffix == ",") | ||
158 | { | ||
159 | tk.suffix = suffixtype::comma; | ||
160 | } else if (!suffix.empty()) { | ||
161 | tk.suffix = suffixtype::terminating; | ||
162 | |||
163 | w.terms.add(suffix); | ||
164 | } | ||
104 | } | 165 | } |
166 | |||
167 | tokens.push_back(tk); | ||
105 | } | 168 | } |
106 | 169 | ||
107 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 170 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
108 | } | 171 | } |
109 | 172 | ||
110 | for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) | 173 | // Time to condense the distribution stuff for the words |
174 | for (auto& it : words) | ||
111 | { | 175 | { |
112 | hashtags.push_back(*it); | 176 | it.second.forms.compile(); |
177 | it.second.terms.compile(); | ||
113 | } | 178 | } |
114 | 179 | ||
180 | // Hashtag freevar is not frequency distributed | ||
181 | for (auto& it : thashtags) | ||
182 | { | ||
183 | hashtags.forms.add(it); | ||
184 | } | ||
185 | |||
186 | hashtags.forms.compile(); | ||
187 | hashtags.terms.compile(); | ||
188 | |||
189 | // kgram distribution | ||
115 | std::map<kgram, std::map<token, token_data> > tstats; | 190 | std::map<kgram, std::map<token, token_data> > tstats; |
116 | std::map<token, std::map<termstats, int> > tendings; | ||
117 | for (int k=1; k<maxK; k++) | 191 | for (int k=1; k<maxK; k++) |
118 | { | 192 | { |
119 | for (int i=0; i<(tokens.size() - k); i++) | 193 | for (int i=0; i<(tokens.size() - k); i++) |
120 | { | 194 | { |
121 | std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); | 195 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); |
122 | kgram prefix; | 196 | token f = tokens[i+k]; |
123 | 197 | ||
124 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) | 198 | if (tstats[prefix].count(f) == 0) |
125 | { | ||
126 | prefix.push_back(token_from_string(*it)); | ||
127 | } | ||
128 | |||
129 | std::string f = tokens[i+k]; | ||
130 | std::string canonical = canonize(f); | ||
131 | |||
132 | token word(token_from_string(canonical)); | ||
133 | if (f.find_first_of(".?!,") != std::string::npos) | ||
134 | { | 199 | { |
135 | word.terminating = true; | 200 | tstats[prefix].emplace(f, f); |
136 | |||
137 | char terminator = f[f.find_last_of(".?!,")]; | ||
138 | int occurrences = std::count(f.begin(), f.end(), terminator); | ||
139 | |||
140 | tendings[word][termstats(terminator, occurrences)]++; | ||
141 | } | 201 | } |
142 | 202 | ||
143 | token_data& td = tstats[prefix][word]; | 203 | token_data& td = tstats[prefix].at(f); |
144 | td.word = word; | ||
145 | td.all++; | 204 | td.all++; |
146 | 205 | ||
147 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 206 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
148 | { | 207 | { |
149 | td.uppercase++; | 208 | td.uppercase++; |
150 | } else if (isupper(f[0])) | 209 | } else if (isupper(f.raw[0])) |
151 | { | 210 | { |
152 | td.titlecase++; | 211 | td.titlecase++; |
153 | } | 212 | } |
154 | 213 | ||
155 | if (prefix.front().word.terminating) | 214 | kgram term_prefix; |
215 | bool changed = false; | ||
216 | std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) { | ||
217 | if (q.tok.suffix == suffixtype::terminating) | ||
218 | { | ||
219 | changed = true; | ||
220 | |||
221 | return wildcardQuery; | ||
222 | } else { | ||
223 | return q; | ||
224 | } | ||
225 | }); | ||
226 | |||
227 | if (changed) | ||
156 | { | 228 | { |
157 | prefix.front() = wildcardQuery; | 229 | if (tstats[term_prefix].count(f) == 0) |
230 | { | ||
231 | tstats[term_prefix].emplace(f, f); | ||
232 | } | ||
158 | 233 | ||
159 | token_data& td2 = tstats[prefix][word]; | 234 | token_data& td2 = tstats[term_prefix].at(f); |
160 | td2.word = word; | ||
161 | td2.all++; | 235 | td2.all++; |
162 | 236 | ||
163 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 237 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
164 | { | 238 | { |
165 | td2.uppercase++; | 239 | td2.uppercase++; |
166 | } else if (isupper(f[0])) | 240 | } else if (isupper(f.raw[0])) |
167 | { | 241 | { |
168 | td2.titlecase++; | 242 | td2.titlecase++; |
169 | } | 243 | } |
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
171 | } | 245 | } |
172 | } | 246 | } |
173 | 247 | ||
174 | for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) | 248 | // Condense the kgram distribution |
249 | for (auto& it : tstats) | ||
175 | { | 250 | { |
176 | kgram klist = it->first; | 251 | kgram klist = it.first; |
177 | std::map<token, token_data>& probtable = it->second; | 252 | auto& probtable = it.second; |
178 | std::map<int, token_data>& distribution = stats[klist]; | 253 | auto& distribution = stats[klist]; |
179 | int max = 0; | 254 | int max = 0; |
180 | 255 | ||
181 | for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | 256 | for (auto& kt : probtable) |
182 | { | 257 | { |
183 | max += kt->second.all; | 258 | max += kt.second.all; |
184 | 259 | ||
185 | distribution[max] = kt->second; | 260 | distribution.emplace(max, kt.second); |
186 | } | ||
187 | } | ||
188 | |||
189 | for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++) | ||
190 | { | ||
191 | token word = it->first; | ||
192 | std::map<termstats, int>& probtable = it->second; | ||
193 | std::map<int, termstats>& distribution = endings[word]; | ||
194 | int max = 0; | ||
195 | |||
196 | for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | ||
197 | { | ||
198 | max += kt->second; | ||
199 | |||
200 | distribution[max] = kt->first; | ||
201 | } | 261 | } |
202 | } | 262 | } |
203 | } | 263 | } |
204 | 264 | ||
205 | void printKgram(kgram k) | 265 | void printKgram(kgram k) |
206 | { | 266 | { |
207 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 267 | for (auto& q : k) |
208 | { | 268 | { |
209 | query& q = *it; | 269 | if (q.type == querytype::sentence) |
210 | if (q.type == querytype_sentence) | ||
211 | { | 270 | { |
212 | std::cout << "#.# "; | 271 | std::cout << "#.# "; |
213 | } else if (q.type == querytype_literal) | 272 | } else if (q.type == querytype::literal) |
214 | { | 273 | { |
215 | if (q.word.type == tokentype_hashtag) | 274 | if (q.tok.suffix == suffixtype::terminating) |
216 | { | 275 | { |
217 | if (q.word.terminating) | 276 | std::cout << q.tok.w.canon << ". "; |
218 | { | 277 | } else if (q.tok.suffix == suffixtype::comma) |
219 | std::cout << "#hashtag. "; | ||
220 | } else { | ||
221 | std::cout << "#hashtag "; | ||
222 | } | ||
223 | } else if (q.word.type == tokentype_literal) | ||
224 | { | 278 | { |
225 | if (q.word.terminating) | 279 | std::cout << q.tok.w.canon << ", "; |
226 | { | 280 | } else { |
227 | std::cout << q.word.canon << ". "; | 281 | std::cout << q.tok.w.canon << " "; |
228 | } else { | ||
229 | std::cout << q.word.canon << " "; | ||
230 | } | ||
231 | } | 282 | } |
232 | } | 283 | } |
233 | } | 284 | } |
234 | } | 285 | } |
235 | 286 | ||
236 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 287 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
237 | std::vector<std::string> kgramstats::randomSentence(int n) | 288 | std::string kgramstats::randomSentence(int n) |
238 | { | 289 | { |
239 | std::vector<std::string> result; | 290 | std::string result; |
240 | kgram cur(1, wildcardQuery); | 291 | kgram cur(1, wildcardQuery); |
241 | int cuts = 0; | 292 | int cuts = 0; |
293 | std::stack<parentype> open_delimiters; | ||
242 | 294 | ||
243 | for (int i=0; i<n; i++) | 295 | for (int i=0; i<n; i++) |
244 | { | 296 | { |
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
273 | cur = kgram(1, wildcardQuery); | 325 | cur = kgram(1, wildcardQuery); |
274 | } | 326 | } |
275 | 327 | ||
276 | std::map<int, token_data>& distribution = stats[cur]; | 328 | auto& distribution = stats[cur]; |
277 | int max = distribution.rbegin()->first; | 329 | int max = distribution.rbegin()->first; |
278 | int r = rand() % max; | 330 | int r = rand() % max; |
279 | token_data& next = distribution.upper_bound(r)->second; | 331 | token_data& next = distribution.upper_bound(r)->second; |
280 | std::string nextToken; | 332 | std::string nextToken = next.tok.w.forms.next(); |
281 | bool mess = false; | 333 | |
282 | 334 | // Determine the casing of the next token. We randomly make the token all | |
283 | if (next.word.type == tokentype_literal) | 335 | // caps based on the markov chain. Otherwise, we check if the previous |
336 | // token is the end of a sentence (terminating token or a wildcard query). | ||
337 | int casing = rand() % next.all; | ||
338 | if (casing < next.uppercase) | ||
339 | { | ||
340 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
341 | } else if ((((cur.rbegin()->type == querytype::sentence) | ||
342 | || ((cur.rbegin()->type == querytype::literal) | ||
343 | && (cur.rbegin()->tok.suffix == suffixtype::terminating))) | ||
344 | && (rand() % 2 > 0)) | ||
345 | || (casing - next.uppercase < next.titlecase)) | ||
284 | { | 346 | { |
285 | nextToken = next.word.canon; | 347 | nextToken[0] = toupper(nextToken[0]); |
348 | } | ||
286 | 349 | ||
287 | mess = (rand() % 100) == 0; | 350 | // Delimiters |
288 | if (mess) | 351 | for (auto& dt : next.tok.delimiters) |
352 | { | ||
353 | if (dt.first.status == doublestatus::both) | ||
289 | { | 354 | { |
290 | nextToken = mstats.alternate(nextToken); | 355 | switch (dt.first.type) |
291 | } | 356 | { |
292 | 357 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break; | |
293 | // Determine the casing of the next token. We randomly make the token all | 358 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break; |
294 | // caps based on the markov chain. Otherwise, we check if the previous | 359 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break; |
295 | // token is the end of a sentence (terminating token or a wildcard query). | 360 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break; |
296 | int casing = rand() % next.all; | 361 | } |
297 | if (casing < next.uppercase) | 362 | } else if (dt.first.status == doublestatus::opening) |
298 | { | 363 | { |
299 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 364 | for (int i=0; i<dt.second; i++) |
300 | } else if ((((cur.rbegin()->type == querytype_sentence) | 365 | { |
301 | || ((cur.rbegin()->type == querytype_literal) | 366 | open_delimiters.push(dt.first.type); |
302 | && (cur.rbegin()->word.terminating))) | 367 | } |
303 | && (rand() % 2 > 0)) | 368 | |
304 | || (casing - next.uppercase < next.titlecase)) | 369 | switch (dt.first.type) |
370 | { | ||
371 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break; | ||
372 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break; | ||
373 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break; | ||
374 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break; | ||
375 | } | ||
376 | } else if (dt.first.status == doublestatus::closing) | ||
305 | { | 377 | { |
306 | nextToken[0] = toupper(nextToken[0]); | 378 | for (int i=0; i<dt.second; i++) |
379 | { | ||
380 | while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type)) | ||
381 | { | ||
382 | switch (open_delimiters.top()) | ||
383 | { | ||
384 | case parentype::paren: nextToken.append(")"); break; | ||
385 | case parentype::square_bracket: nextToken.append("]"); break; | ||
386 | case parentype::asterisk: nextToken.append("*"); break; | ||
387 | case parentype::quote: nextToken.append("\""); break; | ||
388 | } | ||
389 | |||
390 | open_delimiters.pop(); | ||
391 | } | ||
392 | |||
393 | if (open_delimiters.empty()) | ||
394 | { | ||
395 | switch (dt.first.type) | ||
396 | { | ||
397 | case parentype::paren: result = "(" + result; break; | ||
398 | case parentype::square_bracket: result = "[" + result; break; | ||
399 | case parentype::asterisk: result = "*" + result; break; | ||
400 | case parentype::quote: result = "\"" + result; break; | ||
401 | } | ||
402 | } | ||
403 | |||
404 | switch (dt.first.type) | ||
405 | { | ||
406 | case parentype::paren: nextToken.append(")"); break; | ||
407 | case parentype::square_bracket: nextToken.append("]"); break; | ||
408 | case parentype::asterisk: nextToken.append("*"); break; | ||
409 | case parentype::quote: nextToken.append("\""); break; | ||
410 | } | ||
411 | } | ||
307 | } | 412 | } |
308 | } else if (next.word.type == tokentype_hashtag) | ||
309 | { | ||
310 | int rhash = rand() % hashtags.size(); | ||
311 | nextToken = hashtags[rhash]; | ||
312 | } | 413 | } |
313 | 414 | ||
314 | if (next.word.terminating) | 415 | // Terminators |
416 | if (next.tok.suffix == suffixtype::terminating) | ||
315 | { | 417 | { |
316 | std::map<int, termstats>& ending = endings[next.word]; | 418 | nextToken.append(next.tok.w.terms.next()); |
317 | int emax = ending.rbegin()->first; | 419 | } else if (next.tok.suffix == suffixtype::comma) |
318 | int er = rand() % emax; | 420 | { |
319 | termstats& nextend = ending.upper_bound(er)->second; | 421 | nextToken.append(","); |
320 | |||
321 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); | ||
322 | } | 422 | } |
323 | 423 | ||
324 | /* DEBUG */ | 424 | /* DEBUG */ |
325 | printKgram(cur); | 425 | printKgram(cur); |
426 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | ||
427 | |||
428 | cur.push_back(next.tok); | ||
326 | 429 | ||
327 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; | 430 | result.append(nextToken + " "); |
328 | 431 | ||
329 | if (mess) | 432 | if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0)) |
330 | { | 433 | { |
331 | std::cout << " mala " << next.word.canon; | 434 | break; |
332 | } | 435 | } |
333 | |||
334 | std::cout << std::endl; | ||
335 | |||
336 | cur.push_back(next.word); | ||
337 | |||
338 | result.push_back(nextToken); | ||
339 | } | 436 | } |
340 | |||
341 | return result; | ||
342 | } | ||
343 | |||
344 | bool removeIf(char c) | ||
345 | { | ||
346 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/); | ||
347 | } | ||
348 | |||
349 | std::string canonize(std::string f) | ||
350 | { | ||
351 | std::string canonical(f); | ||
352 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | ||
353 | 437 | ||
354 | std::string result; | 438 | // Remove the trailing space |
355 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 439 | if (result.back() == ' ') |
440 | { | ||
441 | result.pop_back(); | ||
442 | } | ||
443 | |||
444 | // Close any open delimiters | ||
445 | while (!open_delimiters.empty()) | ||
446 | { | ||
447 | switch (open_delimiters.top()) | ||
448 | { | ||
449 | case parentype::paren: result.append(")"); break; | ||
450 | case parentype::square_bracket: result.append("]"); break; | ||
451 | case parentype::asterisk: result.append("*"); break; | ||
452 | case parentype::quote: result.append("\""); break; | ||
453 | } | ||
454 | |||
455 | open_delimiters.pop(); | ||
456 | } | ||
356 | 457 | ||
357 | return result; | 458 | return result; |
358 | } | 459 | } |
diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -2,61 +2,89 @@ | |||
2 | #include <map> | 2 | #include <map> |
3 | #include <list> | 3 | #include <list> |
4 | #include <vector> | 4 | #include <vector> |
5 | #include "malaprop.h" | 5 | #include "histogram.h" |
6 | 6 | ||
7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
9 | 9 | ||
10 | enum tokentype { | 10 | struct word { |
11 | tokentype_literal, | 11 | std::string canon; |
12 | tokentype_hashtag | 12 | histogram<std::string> forms; |
13 | histogram<std::string> terms; | ||
14 | |||
15 | word(std::string canon) : canon(canon) {} | ||
16 | |||
17 | bool operator<(const word& other) const | ||
18 | { | ||
19 | return canon < other.canon; | ||
20 | } | ||
13 | }; | 21 | }; |
14 | 22 | ||
15 | struct token { | 23 | extern word blank_word; |
16 | tokentype type; | 24 | |
17 | std::string canon; | 25 | enum class suffixtype { |
18 | bool terminating; | 26 | none, |
27 | terminating, | ||
28 | comma | ||
29 | }; | ||
30 | |||
31 | enum class parentype { | ||
32 | paren, | ||
33 | square_bracket, | ||
34 | asterisk, | ||
35 | quote | ||
36 | }; | ||
37 | |||
38 | enum class doublestatus { | ||
39 | opening, | ||
40 | closing, | ||
41 | both | ||
42 | }; | ||
43 | |||
44 | struct delimiter { | ||
45 | parentype type; | ||
46 | doublestatus status; | ||
47 | |||
48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | ||
19 | 49 | ||
20 | token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} | 50 | bool operator<(const delimiter& other) const |
21 | token(tokentype type) : type(type), canon(""), terminating(false) {} | 51 | { |
52 | return std::tie(type, status) < std::tie(other.type, other.status); | ||
53 | } | ||
54 | }; | ||
55 | |||
56 | struct token { | ||
57 | const word& w; | ||
58 | std::map<delimiter, int> delimiters; | ||
59 | suffixtype suffix; | ||
60 | std::string raw; | ||
61 | |||
62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | ||
22 | 63 | ||
23 | bool operator<(const token& other) const | 64 | bool operator<(const token& other) const |
24 | { | 65 | { |
25 | if (type != other.type) | 66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
26 | { | ||
27 | return type < other.type; | ||
28 | } else if (type == tokentype_literal) | ||
29 | { | ||
30 | if (canon == other.canon) | ||
31 | { | ||
32 | return !terminating && other.terminating; | ||
33 | } else { | ||
34 | return canon < other.canon; | ||
35 | } | ||
36 | } else { | ||
37 | return !terminating && other.terminating; | ||
38 | } | ||
39 | } | 67 | } |
40 | }; | 68 | }; |
41 | 69 | ||
42 | enum querytype { | 70 | enum class querytype { |
43 | querytype_literal, | 71 | literal, |
44 | querytype_sentence | 72 | sentence |
45 | }; | 73 | }; |
46 | 74 | ||
47 | struct query { | 75 | struct query { |
48 | querytype type; | 76 | querytype type; |
49 | token word; | 77 | token tok; |
50 | 78 | ||
51 | query(token word) : word(word), type(querytype_literal) {} | 79 | query(token tok) : tok(tok), type(querytype::literal) {} |
52 | 80 | ||
53 | query(querytype type) : word(""), type(type) {} | 81 | query(querytype type) : tok(blank_word), type(type) {} |
54 | 82 | ||
55 | bool operator<(const query& other) const | 83 | bool operator<(const query& other) const |
56 | { | 84 | { |
57 | if (type == other.type) | 85 | if (type == other.type) |
58 | { | 86 | { |
59 | return word < other.word; | 87 | return tok < other.tok; |
60 | } else { | 88 | } else { |
61 | return type < other.type; | 89 | return type < other.type; |
62 | } | 90 | } |
@@ -65,34 +93,11 @@ struct query { | |||
65 | 93 | ||
66 | typedef std::list<query> kgram; | 94 | typedef std::list<query> kgram; |
67 | 95 | ||
68 | struct termstats { | ||
69 | char terminator; | ||
70 | int occurrences; | ||
71 | |||
72 | termstats() : terminator('.'), occurrences(1) {} | ||
73 | |||
74 | termstats(char terminator, int occurrences) | ||
75 | { | ||
76 | this->terminator = terminator; | ||
77 | this->occurrences = occurrences; | ||
78 | } | ||
79 | |||
80 | bool operator<(const termstats& other) const | ||
81 | { | ||
82 | if (terminator == other.terminator) | ||
83 | { | ||
84 | return occurrences < other.occurrences; | ||
85 | } else { | ||
86 | return terminator < other.terminator; | ||
87 | } | ||
88 | } | ||
89 | }; | ||
90 | |||
91 | class kgramstats | 96 | class kgramstats |
92 | { | 97 | { |
93 | public: | 98 | public: |
94 | kgramstats(std::string corpus, int maxK); | 99 | kgramstats(std::string corpus, int maxK); |
95 | std::vector<std::string> randomSentence(int n); | 100 | std::string randomSentence(int n); |
96 | 101 | ||
97 | private: | 102 | private: |
98 | struct token_data | 103 | struct token_data |
@@ -100,16 +105,15 @@ private: | |||
100 | int all; | 105 | int all; |
101 | int titlecase; | 106 | int titlecase; |
102 | int uppercase; | 107 | int uppercase; |
103 | token word; | 108 | token tok; |
104 | 109 | ||
105 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} | 110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
106 | }; | 111 | }; |
107 | 112 | ||
108 | int maxK; | 113 | int maxK; |
109 | std::map<kgram, std::map<int, token_data> > stats; | 114 | std::map<kgram, std::map<int, token_data> > stats; |
110 | malaprop mstats; | 115 | word hashtags {"#hashtag"}; |
111 | std::map<token, std::map<int, termstats> > endings; | 116 | std::map<std::string, word> words; |
112 | std::vector<std::string> hashtags; | ||
113 | }; | 117 | }; |
114 | 118 | ||
115 | void printKgram(kgram k); | 119 | void printKgram(kgram k); |