about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2015-11-22 18:49:58 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2015-11-22 18:49:58 -0500
commit 01746a0e03267b6c082b58436c1370567f7cb7c5 (patch)
tree e3cfeadb97f93858f326f57958bff4675cd8f9ed /kgramstats.cpp
parent 294fe00911c6ee0dd9853df7612dcdbd63425c05 (diff)
download rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.tar.gz
rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.tar.bz2
rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.zip
Added malapropisms
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- kgramstats.cpp 161
1 files changed, 93 insertions, 68 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index b4e68eb..17598de 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -3,31 +3,35 @@
3#include <iostream> 3#include <iostream>
4#include <cstdlib> 4#include <cstdlib>
5#include <algorithm> 5#include <algorithm>
6#include "malaprop.h"
7
8std::string canonize(std::string f);
6 9
7// runs in O(t^2) time where t is the number of tokens in the input corpus 10// runs in O(t^2) time where t is the number of tokens in the input corpus
8// We consider maxK to be fairly constant 11// We consider maxK to be fairly constant
9kgramstats::kgramstats(string corpus, int maxK) 12kgramstats::kgramstats(std::string corpus, int maxK)
10{ 13{
11 this->maxK = maxK; 14 this->maxK = maxK;
12 15
13 vector<string> tokens; 16 std::vector<std::string> tokens;
14 int start = 0; 17 size_t start = 0;
15 int end = 0; 18 int end = 0;
16 19
17 while (end != string::npos) 20 while (end != std::string::npos)
18 { 21 {
19 end = corpus.find(" ", start); 22 end = corpus.find(" ", start);
20 23
21 string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start); 24 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
22 if (token.compare("")) 25 if (token.compare(""))
23 { 26 {
27 mstats.addWord(token);
24 tokens.push_back(token); 28 tokens.push_back(token);
25 } 29 }
26 30
27 start = ((end > (string::npos - 1) ) ? string::npos : end + 1); 31 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
28 } 32 }
29 33
30 map<kgram, map<string, token_data*>* > tstats; 34 std::map<kgram, std::map<std::string, token_data*>* > tstats;
31 bool newSentence = true; 35 bool newSentence = true;
32 bool newClause = false; 36 bool newClause = false;
33 for (int k=0; k<=maxK; k++) 37 for (int k=0; k<=maxK; k++)
@@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK)
35 for (int i=0; i<(tokens.size() - k); i++) 39 for (int i=0; i<(tokens.size() - k); i++)
36 { 40 {
37 kgram seq(tokens.begin()+i, tokens.begin()+i+k); 41 kgram seq(tokens.begin()+i, tokens.begin()+i+k);
38 transform(seq.begin(), seq.end(), seq.begin(), canonize); 42 std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
39 string f = tokens[i+k]; 43 std::string f = tokens[i+k];
40 string canonical = canonize(f); 44 std::string canonical = canonize(f);
41 45
42 if (tstats[seq] == NULL) 46 if (tstats[seq] == NULL)
43 { 47 {
44 tstats[seq] = new map<string, token_data*>(); 48 tstats[seq] = new std::map<std::string, token_data*>();
45 } 49 }
46 50
47 if ((*tstats[seq])[canonical] == NULL) 51 if ((*tstats[seq])[canonical] == NULL)
@@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK)
50 } 54 }
51 55
52 token_data* td = tstats[seq]->at(canonical); 56 token_data* td = tstats[seq]->at(canonical);
53 td->token = new string(canonical); 57 td->token = new std::string(canonical);
54 td->all++; 58 td->all++;
55 59
56 if (newSentence) 60 if (newSentence)
@@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK)
58 kgram newKgram(1, "."); 62 kgram newKgram(1, ".");
59 if (tstats[newKgram] == NULL) 63 if (tstats[newKgram] == NULL)
60 { 64 {
61 tstats[newKgram] = new map<string, token_data*>(); 65 tstats[newKgram] = new std::map<std::string, token_data*>();
62 } 66 }
63 67
64 (*tstats[newKgram])[canonical] = td; 68 (*tstats[newKgram])[canonical] = td;
@@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK)
71 kgram commaKgram(1, ","); 75 kgram commaKgram(1, ",");
72 if (tstats[commaKgram] == NULL) 76 if (tstats[commaKgram] == NULL)
73 { 77 {
74 tstats[commaKgram] = new map<string, token_data*>(); 78 tstats[commaKgram] = new std::map<std::string, token_data*>();
75 } 79 }
76 80
77 (*tstats[commaKgram])[canonical] = td; 81 (*tstats[commaKgram])[canonical] = td;
@@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK)
164 } 168 }
165 } 169 }
166 170
167 stats = new map<kgram, map<int, token_data*>* >(); 171 stats = new std::map<kgram, std::map<int, token_data*>* >();
168 for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) 172 for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
169 { 173 {
170 kgram klist = it->first; 174 kgram klist = it->first;
171 map<string, token_data*>* probtable = it->second; 175 std::map<std::string, token_data*>* probtable = it->second;
172 map<int, token_data*>* distribution = new map<int, token_data*>(); 176 std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
173 int max = 0; 177 int max = 0;
174 178
175 for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) 179 for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
176 { 180 {
177 max += kt->second->all; 181 max += kt->second->all;
178 182
@@ -187,17 +191,17 @@ void printKgram(kgram k)
187{ 191{
188 for (kgram::iterator it = k.begin(); it != k.end(); it++) 192 for (kgram::iterator it = k.begin(); it != k.end(); it++)
189 { 193 {
190 cout << *it << " "; 194 std::cout << *it << " ";
191 } 195 }
192} 196}
193 197
194// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 198// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
195vector<string> kgramstats::randomSentence(int n) 199std::vector<std::string> kgramstats::randomSentence(int n)
196{ 200{
197 vector<string> result; 201 std::vector<std::string> result;
198 kgram newKgram(1, "."); 202 kgram newKgram(1, ".");
199 kgram commaKgram(1, ","); 203 kgram commaKgram(1, ",");
200 list<string> cur = newKgram; 204 std::list<std::string> cur = newKgram;
201 int cuts = 0; 205 int cuts = 0;
202 206
203 for (int i=0; i<n; i++) 207 for (int i=0; i<n; i++)
@@ -221,12 +225,12 @@ vector<string> kgramstats::randomSentence(int n)
221 cuts++; 225 cuts++;
222 } 226 }
223 227
224 map<int, token_data*> distribution = *(*stats)[cur]; 228 std::map<int, token_data*> distribution = *(*stats)[cur];
225 int max = distribution.rbegin()->first; 229 int max = distribution.rbegin()->first;
226 int r = rand() % max; 230 int r = rand() % max;
227 token_data* next = distribution.upper_bound(r)->second; 231 token_data* next = distribution.upper_bound(r)->second;
228 232
229 string nextToken(*(next->token)); 233 std::string nextToken(*(next->token));
230 int casing = rand() % next->all; 234 int casing = rand() % next->all;
231 int period = rand() % next->all; 235 int period = rand() % next->all;
232 int startparen = rand() % next->all; 236 int startparen = rand() % next->all;
@@ -236,7 +240,7 @@ vector<string> kgramstats::randomSentence(int n)
236 int comma = rand() % next->all; 240 int comma = rand() % next->all;
237 if (casing < next->uppercase) 241 if (casing < next->uppercase)
238 { 242 {
239 transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 243 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
240 } else if ((casing - next->uppercase) < next->titlecase) 244 } else if ((casing - next->uppercase) < next->titlecase)
241 { 245 {
242 nextToken[0] = toupper(nextToken[0]); 246 nextToken[0] = toupper(nextToken[0]);
@@ -246,49 +250,55 @@ vector<string> kgramstats::randomSentence(int n)
246 { 250 {
247 nextToken[0] = toupper(nextToken[0]); 251 nextToken[0] = toupper(nextToken[0]);
248 } 252 }
249 /* 253
250 if (startquote < next->startquote) 254 bool mess = (rand() % 100) == 0;
251 { 255 if (mess)
252 nextToken = "\"" + nextToken;
253 } else if (startparen < next->startparen)
254 { 256 {
255 nextToken = "(" + nextToken; 257 nextToken = mstats.alternate(nextToken);
256 } 258
257 259 if (startquote < next->startquote)
258 if (period < next->period)
259 {
260 if (endquote < next->endquote)
261 { 260 {
262 nextToken += "\""; 261 nextToken = "\"" + nextToken;
263 } else if (endparen < next->endparen) 262 } else if (startparen < next->startparen)
264 { 263 {
265 nextToken += ")"; 264 nextToken = "(" + nextToken;
266 } 265 }
266
267 if (period < next->period)
268 {
269 if (endquote < next->endquote)
270 {
271 nextToken += "\"";
272 } else if (endparen < next->endparen)
273 {
274 nextToken += ")";
275 }
267 276
268 int type = rand() % 6; 277 int type = rand() % 6;
269 278
270 if (type < 3) 279 if (type < 3)
271 { 280 {
272 nextToken += "."; 281 nextToken += ".";
273 } else if (type < 5) 282 } else if (type < 5)
274 { 283 {
275 nextToken += "!"; 284 nextToken += "!";
276 } else { 285 } else {
277 nextToken += "?"; 286 nextToken += "?";
278 } 287 }
279 } else if (comma < next->comma) 288 } else if (comma < next->comma)
280 {
281 if (endquote < next->endquote)
282 {
283 nextToken += "\"";
284 } else if (endparen < next->endparen)
285 { 289 {
286 nextToken += ")"; 290 if (endquote < next->endquote)
287 } 291 {
292 nextToken += "\"";
293 } else if (endparen < next->endparen)
294 {
295 nextToken += ")";
296 }
288 297
289 nextToken += ","; 298 nextToken += ",";
299 }
290 } 300 }
291*/ 301
292 if (cur.size() == maxK) 302 if (cur.size() == maxK)
293 { 303 {
294 cur.pop_front(); 304 cur.pop_front();
@@ -297,10 +307,17 @@ vector<string> kgramstats::randomSentence(int n)
297 /* DEBUG */ 307 /* DEBUG */
298 for (kgram::iterator it = cur.begin(); it != cur.end(); it++) 308 for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
299 { 309 {
300 cout << *it << " "; 310 std::cout << *it << " ";
301 } 311 }
302 312
303 cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; 313 std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
314
315 if (mess)
316 {
317 std::cout << " mala " << *(next->token);
318 }
319
320 std::cout << std::endl;
304 321
305 if ((cur == newKgram) || (cur == commaKgram)) 322 if ((cur == newKgram) || (cur == commaKgram))
306 { 323 {
@@ -314,7 +331,15 @@ vector<string> kgramstats::randomSentence(int n)
314 { 331 {
315 cur = commaKgram; 332 cur = commaKgram;
316 } else { 333 } else {
317 cur.push_back(*(next->token)); 334 //if (mess && (rand() % 2 == 0))
335 if (false)
336 {
337 // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
338 cur.clear();
339 cur.push_back(nextToken);
340 } else {
341 cur.push_back(*(next->token));
342 }
318 } 343 }
319 344
320 result.push_back(nextToken); 345 result.push_back(nextToken);
@@ -330,11 +355,11 @@ bool removeIf(char c)
330 355
331std::string canonize(std::string f) 356std::string canonize(std::string f)
332{ 357{
333 string canonical(f); 358 std::string canonical(f);
334 transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); 359 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
335 360
336 string result; 361 std::string result;
337 remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 362 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
338 363
339 return canonical; 364 return canonical;
340} 365}