Diffstat (limited to 'kgramstats.cpp')
-rw-r--r--  kgramstats.cpp  453
1 file changed, 277 insertions, 176 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 4bb7f15..0ab0c99 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -37,35 +37,11 @@
 #include <iostream>
 #include <cstdlib>
 #include <algorithm>
-#include "malaprop.h"
+#include <set>
+#include <stack>
 
-query wildcardQuery(querytype_sentence);
-
-std::string canonize(std::string f);
-
-token token_from_string(std::string in)
-{
-  if (in[0] == '#')
-  {
-    token word(tokentype_hashtag);
-
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-
-    return word;
-  } else {
-    token word(canonize(in));
-
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-
-    return word;
-  }
-}
+query wildcardQuery {querytype::sentence};
+word blank_word {""};
 
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 {
   this->maxK = maxK;
 
-  std::vector<std::string> tokens;
+  std::vector<token> tokens;
   size_t start = 0;
   int end = 0;
   std::set<std::string> thashtags;
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   {
     end = corpus.find(" ", start);
 
-    std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    if (token[token.length()-1] == '\n')
+    std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+    if (t.compare("") && t.compare("."))
     {
-      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
+      std::string tc(t), canonical;
+      std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
+      std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
+        return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
+      });
+
+      word& w = ([&] () -> word& {
+        // Hashtag freevar
+        if (canonical[0] == '#')
+        {
+          thashtags.insert(canonical);
+          canonical = "#hashtag";
+
+          return hashtags;
+        }
+
+        // Basically any other word
+        if (words.count(canonical) == 0)
+        {
+          words.emplace(canonical, canonical);
+        }
+
+        word& tw = words.at(canonical);
+        tw.forms.add(canonical);
+
+        return tw;
+      })();
+
+      token tk(w);
+      tk.raw = t;
+
+      for (char c : t)
       {
-        token.insert(token.length()-1, ".");
+        if (c == '*')
+        {
+          tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
+        } else if (c == '[')
+        {
+          tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
+        } else if (c == '(')
+        {
+          tk.delimiters[{parentype::paren, doublestatus::opening}]++;
+        } else if (c == '"')
+        {
+          tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+        } else {
+          break;
+        }
       }
-
-      token.resize(token.length()-1);
-    }
-
-    if (token.compare("") && token.compare("."))
-    {
-      mstats.addWord(token);
-      tokens.push_back(token);
 
-      if (token[0] == '#')
+      int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
+      if (backtrack != t.length())
       {
-        thashtags.insert(canonize(token));
+        std::string ending = t.substr(backtrack);
+        std::string suffix;
+
+        for (char c : ending)
+        {
+          if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+          {
+            suffix += c;
+
+            continue;
+          } else if (c == '\n')
+          {
+            // At least the end is coming
+            if (suffix.empty())
+            {
+              suffix = ".";
+            }
+
+            break;
+          }
+
+          parentype pt = ([&] {
+            switch (c)
+            {
+              case ']': return parentype::square_bracket;
+              case ')': return parentype::paren;
+              case '*': return parentype::asterisk;
+              case '"': return parentype::quote;
+            }
+          })();
+
+          if (tk.delimiters[{pt, doublestatus::opening}] > 0)
+          {
+            tk.delimiters[{pt, doublestatus::opening}]--;
+            tk.delimiters[{pt, doublestatus::both}]++;
+          } else {
+            tk.delimiters[{pt, doublestatus::closing}]++;
+          }
+        }
+
+        if (suffix == ",")
+        {
+          tk.suffix = suffixtype::comma;
+        } else if (!suffix.empty()) {
+          tk.suffix = suffixtype::terminating;
+
+          w.terms.add(suffix);
+        }
       }
+
+      tokens.push_back(tk);
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
 
-  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  // Time to condense the distribution stuff for the words
+  for (auto& it : words)
   {
-    hashtags.push_back(*it);
+    it.second.forms.compile();
+    it.second.terms.compile();
   }
 
+  // Hashtag freevar is not frequency distributed
+  for (auto& it : thashtags)
+  {
+    hashtags.forms.add(it);
+  }
+
+  hashtags.forms.compile();
+  hashtags.terms.compile();
+
+  // kgram distribution
   std::map<kgram, std::map<token, token_data> > tstats;
-  std::map<token, std::map<termstats, int> > tendings;
   for (int k=1; k<maxK; k++)
   {
     for (int i=0; i<(tokens.size() - k); i++)
     {
-      std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
-      kgram prefix;
+      kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
+      token f = tokens[i+k];
 
-      for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
-      {
-        prefix.push_back(token_from_string(*it));
-      }
-
-      std::string f = tokens[i+k];
-      std::string canonical = canonize(f);
-
-      token word(token_from_string(canonical));
-      if (f.find_first_of(".?!,") != std::string::npos)
+      if (tstats[prefix].count(f) == 0)
       {
-        word.terminating = true;
-
-        char terminator = f[f.find_last_of(".?!,")];
-        int occurrences = std::count(f.begin(), f.end(), terminator);
-
-        tendings[word][termstats(terminator, occurrences)]++;
+        tstats[prefix].emplace(f, f);
       }
 
-      token_data& td = tstats[prefix][word];
-      td.word = word;
+      token_data& td = tstats[prefix].at(f);
       td.all++;
 
-      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
       {
         td.uppercase++;
-      } else if (isupper(f[0]))
+      } else if (isupper(f.raw[0]))
       {
         td.titlecase++;
       }
 
-      if (prefix.front().word.terminating)
+      kgram term_prefix;
+      bool changed = false;
+      std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) {
+        if (q.tok.suffix == suffixtype::terminating)
+        {
+          changed = true;
+
+          return wildcardQuery;
+        } else {
+          return q;
+        }
+      });
+
+      if (changed)
       {
-        prefix.front() = wildcardQuery;
+        if (tstats[term_prefix].count(f) == 0)
+        {
+          tstats[term_prefix].emplace(f, f);
+        }
 
-        token_data& td2 = tstats[prefix][word];
-        td2.word = word;
+        token_data& td2 = tstats[term_prefix].at(f);
         td2.all++;
 
-        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
         {
           td2.uppercase++;
-        } else if (isupper(f[0]))
+        } else if (isupper(f.raw[0]))
         {
           td2.titlecase++;
         }
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
     }
   }
 
-  for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
+  // Condense the kgram distribution
+  for (auto& it : tstats)
   {
-    kgram klist = it->first;
-    std::map<token, token_data>& probtable = it->second;
-    std::map<int, token_data>& distribution = stats[klist];
+    kgram klist = it.first;
+    auto& probtable = it.second;
+    auto& distribution = stats[klist];
     int max = 0;
 
-    for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
+    for (auto& kt : probtable)
     {
-      max += kt->second.all;
+      max += kt.second.all;
 
-      distribution[max] = kt->second;
-    }
-  }
-
-  for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
-  {
-    token word = it->first;
-    std::map<termstats, int>& probtable = it->second;
-    std::map<int, termstats>& distribution = endings[word];
-    int max = 0;
-
-    for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
-    {
-      max += kt->second;
-
-      distribution[max] = kt->first;
+      distribution.emplace(max, kt.second);
     }
   }
 }
 
 void printKgram(kgram k)
 {
-  for (kgram::iterator it = k.begin(); it != k.end(); it++)
+  for (auto& q : k)
   {
-    query& q = *it;
-    if (q.type == querytype_sentence)
+    if (q.type == querytype::sentence)
     {
       std::cout << "#.# ";
-    } else if (q.type == querytype_literal)
+    } else if (q.type == querytype::literal)
     {
-      if (q.word.type == tokentype_hashtag)
+      if (q.tok.suffix == suffixtype::terminating)
       {
-        if (q.word.terminating)
-        {
-          std::cout << "#hashtag. ";
-        } else {
-          std::cout << "#hashtag ";
-        }
-      } else if (q.word.type == tokentype_literal)
+        std::cout << q.tok.w.canon << ". ";
+      } else if (q.tok.suffix == suffixtype::comma)
       {
-        if (q.word.terminating)
-        {
-          std::cout << q.word.canon << ". ";
-        } else {
-          std::cout << q.word.canon << " ";
-        }
+        std::cout << q.tok.w.canon << ", ";
+      } else {
+        std::cout << q.tok.w.canon << " ";
       }
     }
   }
 }
 
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::vector<std::string> kgramstats::randomSentence(int n)
+std::string kgramstats::randomSentence(int n)
 {
-  std::vector<std::string> result;
+  std::string result;
   kgram cur(1, wildcardQuery);
   int cuts = 0;
+  std::stack<parentype> open_delimiters;
 
   for (int i=0; i<n; i++)
   {
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
       cur = kgram(1, wildcardQuery);
     }
 
-    std::map<int, token_data>& distribution = stats[cur];
+    auto& distribution = stats[cur];
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken;
-    bool mess = false;
+    std::string nextToken = next.tok.w.forms.next();
 
-    if (next.word.type == tokentype_literal)
+    // Determine the casing of the next token. We randomly make the token all
+    // caps based on the markov chain. Otherwise, we check if the previous
+    // token is the end of a sentence (terminating token or a wildcard query).
+    int casing = rand() % next.all;
+    if (casing < next.uppercase)
+    {
+      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    } else if ((((cur.rbegin()->type == querytype::sentence)
+        || ((cur.rbegin()->type == querytype::literal)
+          && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
+        && (rand() % 2 > 0))
+      || (casing - next.uppercase < next.titlecase))
     {
-      nextToken = next.word.canon;
+      nextToken[0] = toupper(nextToken[0]);
+    }
 
-      mess = (rand() % 100) == 0;
-      if (mess)
+    // Delimiters
+    for (auto& dt : next.tok.delimiters)
+    {
+      if (dt.first.status == doublestatus::both)
       {
-        nextToken = mstats.alternate(nextToken);
-      }
-
-      // Determine the casing of the next token. We randomly make the token all
-      // caps based on the markov chain. Otherwise, we check if the previous
-      // token is the end of a sentence (terminating token or a wildcard query).
-      int casing = rand() % next.all;
-      if (casing < next.uppercase)
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break;
+          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break;
+          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break;
+          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break;
+        }
+      } else if (dt.first.status == doublestatus::opening)
       {
-        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-      } else if ((((cur.rbegin()->type == querytype_sentence)
-          || ((cur.rbegin()->type == querytype_literal)
-            && (cur.rbegin()->word.terminating)))
-          && (rand() % 2 > 0))
-        || (casing - next.uppercase < next.titlecase))
+        for (int i=0; i<dt.second; i++)
+        {
+          open_delimiters.push(dt.first.type);
+        }
+
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break;
+          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break;
+          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break;
+          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break;
+        }
+      } else if (dt.first.status == doublestatus::closing)
       {
-        nextToken[0] = toupper(nextToken[0]);
+        for (int i=0; i<dt.second; i++)
+        {
+          while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type))
+          {
+            switch (open_delimiters.top())
+            {
+              case parentype::paren: nextToken.append(")"); break;
+              case parentype::square_bracket: nextToken.append("]"); break;
+              case parentype::asterisk: nextToken.append("*"); break;
+              case parentype::quote: nextToken.append("\""); break;
+            }
+
+            open_delimiters.pop();
+          }
+
+          if (open_delimiters.empty())
+          {
+            switch (dt.first.type)
+            {
+              case parentype::paren: result = "(" + result; break;
+              case parentype::square_bracket: result = "[" + result; break;
+              case parentype::asterisk: result = "*" + result; break;
+              case parentype::quote: result = "\"" + result; break;
+            }
+          }
+
+          switch (dt.first.type)
+          {
+            case parentype::paren: nextToken.append(")"); break;
+            case parentype::square_bracket: nextToken.append("]"); break;
+            case parentype::asterisk: nextToken.append("*"); break;
+            case parentype::quote: nextToken.append("\""); break;
+          }
+        }
       }
-    } else if (next.word.type == tokentype_hashtag)
-    {
-      int rhash = rand() % hashtags.size();
-      nextToken = hashtags[rhash];
     }
 
-    if (next.word.terminating)
+    // Terminators
+    if (next.tok.suffix == suffixtype::terminating)
     {
-      std::map<int, termstats>& ending = endings[next.word];
-      int emax = ending.rbegin()->first;
-      int er = rand() % emax;
-      termstats& nextend = ending.upper_bound(er)->second;
-
-      nextToken.append(std::string(nextend.occurrences, nextend.terminator));
+      nextToken.append(next.tok.w.terms.next());
+    } else if (next.tok.suffix == suffixtype::comma)
+    {
+      nextToken.append(",");
     }
 
     /* DEBUG */
     printKgram(cur);
+    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
+
+    cur.push_back(next.tok);
 
-    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
+    result.append(nextToken + " ");
 
-    if (mess)
+    if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0))
     {
-      std::cout << " mala " << next.word.canon;
+      break;
    }
-
-    std::cout << std::endl;
-
-    cur.push_back(next.word);
-
-    result.push_back(nextToken);
   }
-
-  return result;
-}
-
-bool removeIf(char c)
-{
-  return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
-}
-
-std::string canonize(std::string f)
-{
-  std::string canonical(f);
-  std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
 
-  std::string result;
-  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
+  // Remove the trailing space
+  if (result.back() == ' ')
+  {
+    result.pop_back();
+  }
+
+  // Close any open delimiters
+  while (!open_delimiters.empty())
+  {
+    switch (open_delimiters.top())
+    {
+      case parentype::paren: result.append(")"); break;
+      case parentype::square_bracket: result.append("]"); break;
+      case parentype::asterisk: result.append("*"); break;
+      case parentype::quote: result.append("\""); break;
+    }
+
+    open_delimiters.pop();
+  }
 
   return result;
 }
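
The condensation loops in this patch fold each per-kgram frequency table into a std::map keyed by the running total of the counts, so that a weighted random draw in randomSentence is a single upper_bound lookup; that lookup is where the O(n log t) bound in the comment comes from. A minimal standalone sketch of the same technique, assuming nothing from the patch (compile_distribution and sample are hypothetical names):

#include <cstdlib>
#include <iostream>
#include <map>
#include <string>

// Fold item weights into a cumulative map: key = running total of the
// weights seen so far, value = the item that pushed the total there.
std::map<int, std::string> compile_distribution(const std::map<std::string, int>& weights)
{
  std::map<int, std::string> cumulative;
  int max = 0;

  for (const auto& entry : weights)
  {
    max += entry.second;
    cumulative.emplace(max, entry.first);
  }

  return cumulative;
}

// Draw r uniformly from [0, total); the first key strictly greater than r
// identifies the chosen item, making each draw O(log n).
const std::string& sample(const std::map<int, std::string>& cumulative)
{
  int r = rand() % cumulative.rbegin()->first;

  return cumulative.upper_bound(r)->second;
}

int main()
{
  auto dist = compile_distribution({{"the", 6}, {"a", 3}, {"an", 1}});

  // "the" should come out roughly 60% of the time.
  for (int i = 0; i < 5; i++)
  {
    std::cout << sample(dist) << std::endl;
  }
}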
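randomSentence tracks delimiters with a std::stack<parentype>: each opening delimiter on an emitted token is pushed, a closing one pops back to its match, and the final while loop closes whatever is still open once the sentence ends, innermost first. A toy sketch of that final unwinding, with plain chars standing in for parentype (the sample string is invented for illustration):

#include <iostream>
#include <stack>
#include <string>

int main()
{
  std::stack<char> open_delimiters;
  std::string result = "(so [it \"goes";

  // Suppose the generated tokens opened '(', '[', and '"' without closing
  // them; push the matching closers in opening order.
  open_delimiters.push(')');
  open_delimiters.push(']');
  open_delimiters.push('"');

  // Unwind the stack so the most recently opened delimiter closes first.
  while (!open_delimiters.empty())
  {
    result += open_delimiters.top();
    open_delimiters.pop();
  }

  std::cout << result << std::endl; // prints: (so [it "goes"])
}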
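The tokenizer canonicalizes each raw token by lowercasing it and stripping delimiter characters with a double-negated remove_copy_if predicate. That predicate amounts to a membership test against the delimiter set; a sketch of the more direct reading (canonize_sketch is a hypothetical name, not part of the patch):

#include <algorithm>
#include <cctype>
#include <cstring>
#include <iostream>
#include <iterator>
#include <string>

std::string canonize_sketch(std::string in)
{
  // Lowercase in place, as the patch does with std::transform.
  std::transform(in.begin(), in.end(), in.begin(), ::tolower);

  // Copy everything except the delimiter characters the patch's lambda rejects.
  std::string out;
  std::remove_copy_if(in.begin(), in.end(), std::back_inserter(out),
    [] (char c) {
      return std::strchr(".?!,\"()[]*\n", c) != nullptr;
    });

  return out;
}

int main()
{
  std::cout << canonize_sketch("\"Hello,\"") << std::endl; // prints: hello
}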