path: root/kgramstats.cpp
author    Kelly Rauchenberger <fefferburbia@gmail.com>  2016-01-04 23:16:17 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com>  2016-01-04 23:16:17 -0500
commit    9e89002477d1358de9be9cabdc1edba26bd32836 (patch)
tree      9afb52740fe4f618105d014a816df26b36ed83f6 /kgramstats.cpp
parent    0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff)
download  rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz
          rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2
          rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip
Rewrote quite a bit of kgramstats
The algorithm still treats most tokens literally, but it now groups together tokens that terminate a clause (that is, tokens containing any of . ? ! ,) without distinguishing between the different terminating characters. For each word that can terminate a clause, the algorithm builds a histogram of the terminating characters and of how many times each one occurs for that word (tracking the occurrence count is what lets things like um???? and um,,,,, fold down into plain um). Then, when the terminating version of that token is generated, a random terminating string is appended to it based on that word's histogram (again, so that things like the desu-ly use of multiple commas to end clauses survive).
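
As a rough illustration of that folding-and-histogram idea (this is only a sketch; the names ending, ending_hist, recordToken, and sampleEnding are made up for the example and do not appear in kgramstats.cpp):

// Illustrative sketch only -- the names below are hypothetical and are not
// the ones used in kgramstats.cpp.
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <map>
#include <string>
#include <utility>

typedef std::pair<char, int> ending;                   // e.g. ('?', 4) for "????"
typedef std::map<std::string, std::map<ending, int> > ending_hist;

// Fold a terminating token like "um,,,,," down to its base word "um" and
// record which terminator it used and how many times it was repeated.
void recordToken(ending_hist& hist, const std::string& raw)
{
  size_t cut = raw.find_first_of(".?!,");
  if (cut == std::string::npos)
  {
    return; // not a terminating token
  }

  char terminator = raw[raw.find_last_of(".?!,")];
  int occurrences = std::count(raw.begin(), raw.end(), terminator);

  hist[raw.substr(0, cut)][std::make_pair(terminator, occurrences)]++;
}

// Pick a terminating string for a word, weighted by how often each ending
// was seen in the corpus.
std::string sampleEnding(const std::map<ending, int>& table)
{
  int total = 0;
  for (std::map<ending, int>::const_iterator it = table.begin(); it != table.end(); it++)
  {
    total += it->second;
  }

  if (total == 0)
  {
    return ".";
  }

  int r = rand() % total;
  for (std::map<ending, int>::const_iterator it = table.begin(); it != table.end(); it++)
  {
    r -= it->second;
    if (r < 0)
    {
      return std::string(it->first.second, it->first.first);
    }
  }

  return ".";
}

int main()
{
  ending_hist hist;
  recordToken(hist, "um.");
  recordToken(hist, "um????");
  recordToken(hist, "um,,,,,");

  // All three tokens were folded into the single key "um"; print it with a
  // sampled ending, reproducing the original punctuation habits in
  // proportion to how often they were seen.
  std::cout << "um" << sampleEnding(hist["um"]) << std::endl;

  return 0;
}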

The algorithm also has a slightly more advanced kgram structure: a special "sentence wildcard" value, distinct from normal token strings, that matches any terminating token. This value is never printed; it only ever appears in query kgrams and, being a different datatype, cannot appear in the histograms. It is used at the beginning of sentence generation to make sure that the first couple of generated words actually form the start of a sentence rather than picking up somewhere in the middle of one, and it is also used to reset sentence generation on the rare occasion that the end of the corpus is reached.
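
A sketch of how such a wildcard can sit alongside literal tokens (again purely illustrative; the enum, struct, and member names below are hypothetical stand-ins, not the declarations from the diff):

// Illustrative sketch: a kgram is a sequence of queries, where a query is
// either a literal word or a "sentence wildcard" standing in for any
// clause-terminating token. The wildcard is never emitted in output; it only
// appears as a map key.
#include <iostream>
#include <list>
#include <map>
#include <string>

enum querytype { querytype_literal, querytype_sentence };

struct query
{
  querytype type;
  std::string word;   // only meaningful for literals
  bool terminating;   // literal ends a clause (., ?, !, or ,)

  query(const std::string& w, bool term)
    : type(querytype_literal), word(w), terminating(term) {}

  explicit query(querytype t) : type(t), terminating(false) {}

  // Strict weak ordering so a kgram (std::list<query>) can key a std::map.
  bool operator<(const query& other) const
  {
    if (type != other.type) return type < other.type;
    if (word != other.word) return word < other.word;
    return terminating < other.terminating;
  }
};

typedef std::list<query> kgram;

int main()
{
  query wildcard(querytype_sentence);

  std::map<kgram, std::string> stats;

  // Index the successor "The" under both the literal terminating prefix
  // ("night.") and the wildcard prefix, since "night." ends a sentence.
  kgram literalPrefix(1, query("night", true));
  kgram wildcardPrefix(1, wildcard);
  stats[literalPrefix] = "The";
  stats[wildcardPrefix] = "The";

  // Generation can start from the wildcard kgram with an ordinary map lookup.
  std::cout << stats[kgram(1, wildcard)] << std::endl;

  return 0;
}

Indexing a sentence-initial prefix under both its literal terminating token and the wildcard means generation can start from kgram(1, wildcard) and is guaranteed to land on a word that actually began a sentence in the corpus, which is the behaviour described above.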
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r--  kgramstats.cpp | 442
1 file changed, 165 insertions, 277 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index b0ec68a..c88d83c 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -5,237 +5,176 @@
 #include <algorithm>
 #include "malaprop.h"
 
+query wildcardQuery(querytype_sentence);
+
 std::string canonize(std::string f);
 
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
 kgramstats::kgramstats(std::string corpus, int maxK)
 {
   this->maxK = maxK;
 
   std::vector<std::string> tokens;
   size_t start = 0;
   int end = 0;
 
   while (end != std::string::npos)
   {
     end = corpus.find(" ", start);
 
     std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
     if (token[token.length()-1] == '\n')
     {
-      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?'))
+      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
       {
         token.insert(token.length()-1, ".");
       }
 
       token.resize(token.length()-1);
     }
 
     if (token.compare("") && token.compare("."))
     {
       mstats.addWord(token);
       tokens.push_back(token);
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
 
-  std::map<kgram, std::map<std::string, token_data*>* > tstats;
-  bool newSentence = true;
-  bool newClause = false;
-  for (int k=0; k<maxK; k++)
-  {
-    for (int i=0; i<(tokens.size() - k); i++)
-    {
-      kgram seq(tokens.begin()+i, tokens.begin()+i+k);
-      std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
-      std::string f = tokens[i+k];
-
-
-
-      std::string canonical = canonize(f);
-
-      if (tstats[seq] == NULL)
-      {
-        tstats[seq] = new std::map<std::string, token_data*>();
-      }
-
-      if ((*tstats[seq])[canonical] == NULL)
-      {
-        (*tstats[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data));
-      }
-
-      token_data* td = tstats[seq]->at(canonical);
-      td->token = new std::string(canonical);
-      td->all++;
+  std::map<kgram, std::map<token, token_data> > tstats;
+  std::map<token, std::map<termstats, int> > tendings;
+  for (int k=1; k<maxK; k++)
+  {
+    for (int i=0; i<(tokens.size() - k); i++)
+    {
+      std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
+      kgram prefix;
 
-      /*if (newSentence)
+      for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
       {
-        kgram newKgram(1, ".");
-        if (tstats[newKgram] == NULL)
+        token word(canonize(*it));
+
+        if (it->find_first_of(".?!,") != std::string::npos)
         {
-          tstats[newKgram] = new std::map<std::string, token_data*>();
+          word.terminating = true;
         }
 
-        (*tstats[newKgram])[canonical] = td;
-
-        newSentence = false;
+        prefix.push_back(word);
       }
 
-      if (newClause)
+      std::string f = tokens[i+k];
+      std::string canonical = canonize(f);
+
+      token word(canonical);
+      if (f.find_first_of(".?!,") != std::string::npos)
       {
-        kgram commaKgram(1, ",");
-        if (tstats[commaKgram] == NULL)
-        {
-          tstats[commaKgram] = new std::map<std::string, token_data*>();
-        }
+        word.terminating = true;
 
-        (*tstats[commaKgram])[canonical] = td;
+        char terminator = f[f.find_last_of(".?!,")];
+        int occurrences = std::count(f.begin(), f.end(), terminator);
 
-        newClause = false;
-      }
-
-      if ((f.length() > 0) && (f[f.length()-1] == '\n'))
-      {
-        td->period++;
-        newSentence = true;
-        f.resize(f.length()-1);
+        tendings[word][termstats(terminator, occurrences)]++;
       }
 
-      if (f.length() > 0)
+      token_data& td = tstats[prefix][word];
+      td.word = word;
+      td.all++;
+
+      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
       {
-        if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
-        {
-          if (!newSentence)
-          {
-            td->period++;
-            newSentence = true;
-          }
-
-          f.resize(f.length()-1);
-        } else if (f[f.length()-1] == ',')
-        {
-          if (!newSentence)
-          {
-            td->comma++;
-            newClause = true;
-          }
-
-          f.resize(f.length()-1);
-        }
+        td.uppercase++;
+      } else if (isupper(f[0]))
+      {
+        td.titlecase++;
       }
 
-      if (f.length() > 0)
+      if (prefix.front().word.terminating)
       {
-        if (f[0] == '"')
-        {
-          td->startquote++;
-        }
+        prefix.front() = wildcardQuery;
 
-        if (f[0] == '(')
+        token_data& td2 = tstats[prefix][word];
+        td2.word = word;
+        td2.all++;
+
+        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
         {
-          td->startparen++;
-        }
-
-        if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')'))
+          td2.uppercase++;
+        } else if (isupper(f[0]))
         {
-          if (f[f.length()-1] == '"')
-          {
-            td->endquote++;
-          } else if (f[f.length()-1] == ')')
-          {
-            td->endparen++;
-          }
-
-          f.resize(f.length()-1);
-
-          if (f.length() > 0)
-          {
-            if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
-            {
-              if (!newSentence)
-              {
-                td->period++;
-                newSentence = true;
-              }
-            } else if (f[f.length()-1] == ',')
-            {
-              if (!newSentence && !newClause)
-              {
-                td->comma++;
-                newClause = true;
-              }
-            }
-          }
-        }
-      }*/
-
-      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
-      {
-        td->uppercase++;
-      } else if (isupper(f[0]))
-      {
-        td->titlecase++;
-      }
-
-      /*if (k != 0)
-      {
-        if (newSentence)
-        {
-          i += k;
+          td2.titlecase++;
         }
-
-        newSentence = false;
-        newClause = false;
-      }*/
-    }
-  }
+      }
+    }
+  }
 
-  stats = new std::map<kgram, std::map<int, token_data*>* >();
-  for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
-  {
-    kgram klist = it->first;
-    std::map<std::string, token_data*>* probtable = it->second;
-    std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
-    int max = 0;
+  for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
+  {
+    kgram klist = it->first;
+    std::map<token, token_data>& probtable = it->second;
+    std::map<int, token_data>& distribution = stats[klist];
+    int max = 0;
 
-    for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
+    for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
     {
-      max += kt->second->all;
+      max += kt->second.all;
 
-      (*distribution)[max] = kt->second;
+      distribution[max] = kt->second;
     }
-
-    (*stats)[klist] = distribution;
-  }
+  }
+
+  for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
+  {
+    token word = it->first;
+    std::map<termstats, int>& probtable = it->second;
+    std::map<int, termstats>& distribution = endings[word];
+    int max = 0;
+
+    for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
+    {
+      max += kt->second;
+
+      distribution[max] = kt->first;
+    }
+  }
 }
 
 void printKgram(kgram k)
 {
   for (kgram::iterator it = k.begin(); it != k.end(); it++)
   {
-    std::cout << *it << " ";
-  }
+    query& q = *it;
+    if (q.type == querytype_sentence)
+    {
+      std::cout << "#.# ";
+    } else if (q.type == querytype_literal)
+    {
+      if (q.word.terminating)
+      {
+        std::cout << q.word.canon << ". ";
+      } else {
+        std::cout << q.word.canon << " ";
+      }
+    }
+  }
 }
 
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
 std::vector<std::string> kgramstats::randomSentence(int n)
 {
   std::vector<std::string> result;
-  kgram newKgram(1, ".");
-  kgram commaKgram(1, ",");
-  std::list<std::string> cur;
+  kgram cur(1, wildcardQuery);
   int cuts = 0;
 
   for (int i=0; i<n; i++)
   {
     if (cur.size() == maxK)
     {
       cur.pop_front();
     }
 
-    if ((cur.size() > 0) && (cur != newKgram))
+    if (cur.size() > 0)
     {
       if (rand() % (maxK - cur.size() + 1) == 0)
       {
@@ -253,20 +192,19 @@ std::vector<std::string> kgramstats::randomSentence(int n)
 
       cuts++;
     }
+
+    // Gotta circumvent the last line of the input corpus
+    // https://twitter.com/starla4444/status/684222271339237376
+    if (stats.count(cur) == 0)
+    {
+      cur = kgram(1, wildcardQuery);
+    }
 
-    std::map<int, token_data*> distribution = *(*stats)[cur];
+    std::map<int, token_data>& distribution = stats[cur];
     int max = distribution.rbegin()->first;
     int r = rand() % max;
-    token_data* next = distribution.upper_bound(r)->second;
-
-    std::string nextToken(*(next->token));
-    int casing = rand() % next->all;
-    /*int period = rand() % next->all;
-    int startparen = rand() % next->all;
-    int endparen = rand() % next->all;
-    int startquote = rand() % next->all;
-    int endquote = rand() % next->all;
-    int comma = rand() % next->all;*/
+    token_data& next = distribution.upper_bound(r)->second;
+    std::string nextToken(next.word.canon);
 
     bool mess = (rand() % 100) == 0;
     if (mess)
@@ -274,114 +212,64 @@ std::vector<std::string> kgramstats::randomSentence(int n)
       nextToken = mstats.alternate(nextToken);
     }
 
-    if (casing < next->uppercase)
-    {
-      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-    }
-
-    if ((cur == newKgram) && (rand() % 15 > 0))
+    // Determine the casing of the next token. We randomly make the token all
+    // caps based on the markov chain. Otherwise, we check if the previous
+    // token is the end of a sentence (terminating token or a wildcard query).
+    int casing = rand() % next.all;
+    if (casing < next.uppercase)
+    {
+      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    } else if ((((cur.rbegin()->type == querytype_sentence)
+        || ((cur.rbegin()->type == querytype_literal)
+          && (cur.rbegin()->word.terminating)))
+      && (rand() % 2 > 0))
+      || (casing - next.uppercase < next.titlecase))
     {
       nextToken[0] = toupper(nextToken[0]);
     }
 
-    /*if (startquote < next->startquote)
-    {
-      nextToken = "\"" + nextToken;
-    } else if (startparen < next->startparen)
+    if (next.word.terminating)
     {
-      nextToken = "(" + nextToken;
+      std::map<int, termstats>& ending = endings[next.word];
+      int emax = ending.rbegin()->first;
+      int er = rand() % emax;
+      termstats& nextend = ending.upper_bound(er)->second;
+
+      nextToken.append(std::string(nextend.occurrences, nextend.terminator));
     }
-
-    if (period < next->period)
-    {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
-      {
-        nextToken += ")";
-      }
-
-      int type = rand() % 6;
-
-      if (type < 3)
-      {
-        nextToken += ".";
-      } else if (type < 5)
-      {
-        nextToken += "!";
-      } else {
-        nextToken += "?";
-      }
-    } else if (comma < next->comma)
-    {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
-      {
-        nextToken += ")";
-      }
-
-      nextToken += ",";
-    }*/
 
     /* DEBUG */
-    for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
-    {
-      std::cout << *it << " ";
-    }
+    printKgram(cur);
 
-    std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
+    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
 
     if (mess)
     {
-      std::cout << " mala " << *(next->token);
+      std::cout << " mala " << next.word.canon;
     }
 
     std::cout << std::endl;
 
-    /*if ((cur == newKgram) || (cur == commaKgram))
-    {
-      cur.pop_front();
-    }
-
-    if (period < next->period)// && ((rand() % 3) != 0))
-    {
-      cur = newKgram;
-    } else if ((comma < next->comma) && ((rand() % 3) == 0))
-    {
-      cur = commaKgram;
-    } else {*/
-      //if (mess && (rand() % 2 == 0))
-      if (false)
-      {
-        // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
-        cur.clear();
-        cur.push_back(nextToken);
-      } else {
-        cur.push_back(*(next->token));
-      }
-    //}
+    cur.push_back(next.word);
 
     result.push_back(nextToken);
   }
 
   return result;
 }
 
 bool removeIf(char c)
 {
-  return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n'));
+  return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
 }
 
 std::string canonize(std::string f)
 {
   std::string canonical(f);
   std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
 
   std::string result;
   std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
 
-  return canonical;
+  return result;
 }