diff options
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- | kgramstats.cpp | 161 |
1 files changed, 93 insertions, 68 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index b4e68eb..17598de 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -3,31 +3,35 @@ | |||
3 | #include <iostream> | 3 | #include <iostream> |
4 | #include <cstdlib> | 4 | #include <cstdlib> |
5 | #include <algorithm> | 5 | #include <algorithm> |
6 | #include "malaprop.h" | ||
7 | |||
8 | std::string canonize(std::string f); | ||
6 | 9 | ||
7 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 10 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
8 | // We consider maxK to be fairly constant | 11 | // We consider maxK to be fairly constant |
9 | kgramstats::kgramstats(string corpus, int maxK) | 12 | kgramstats::kgramstats(std::string corpus, int maxK) |
10 | { | 13 | { |
11 | this->maxK = maxK; | 14 | this->maxK = maxK; |
12 | 15 | ||
13 | vector<string> tokens; | 16 | std::vector<std::string> tokens; |
14 | int start = 0; | 17 | size_t start = 0; |
15 | int end = 0; | 18 | int end = 0; |
16 | 19 | ||
17 | while (end != string::npos) | 20 | while (end != std::string::npos) |
18 | { | 21 | { |
19 | end = corpus.find(" ", start); | 22 | end = corpus.find(" ", start); |
20 | 23 | ||
21 | string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start); | 24 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
22 | if (token.compare("")) | 25 | if (token.compare("")) |
23 | { | 26 | { |
27 | mstats.addWord(token); | ||
24 | tokens.push_back(token); | 28 | tokens.push_back(token); |
25 | } | 29 | } |
26 | 30 | ||
27 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); | 31 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
28 | } | 32 | } |
29 | 33 | ||
30 | map<kgram, map<string, token_data*>* > tstats; | 34 | std::map<kgram, std::map<std::string, token_data*>* > tstats; |
31 | bool newSentence = true; | 35 | bool newSentence = true; |
32 | bool newClause = false; | 36 | bool newClause = false; |
33 | for (int k=0; k<=maxK; k++) | 37 | for (int k=0; k<=maxK; k++) |
@@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
35 | for (int i=0; i<(tokens.size() - k); i++) | 39 | for (int i=0; i<(tokens.size() - k); i++) |
36 | { | 40 | { |
37 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 41 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); |
38 | transform(seq.begin(), seq.end(), seq.begin(), canonize); | 42 | std::transform(seq.begin(), seq.end(), seq.begin(), canonize); |
39 | string f = tokens[i+k]; | 43 | std::string f = tokens[i+k]; |
40 | string canonical = canonize(f); | 44 | std::string canonical = canonize(f); |
41 | 45 | ||
42 | if (tstats[seq] == NULL) | 46 | if (tstats[seq] == NULL) |
43 | { | 47 | { |
44 | tstats[seq] = new map<string, token_data*>(); | 48 | tstats[seq] = new std::map<std::string, token_data*>(); |
45 | } | 49 | } |
46 | 50 | ||
47 | if ((*tstats[seq])[canonical] == NULL) | 51 | if ((*tstats[seq])[canonical] == NULL) |
@@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
50 | } | 54 | } |
51 | 55 | ||
52 | token_data* td = tstats[seq]->at(canonical); | 56 | token_data* td = tstats[seq]->at(canonical); |
53 | td->token = new string(canonical); | 57 | td->token = new std::string(canonical); |
54 | td->all++; | 58 | td->all++; |
55 | 59 | ||
56 | if (newSentence) | 60 | if (newSentence) |
@@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
58 | kgram newKgram(1, "."); | 62 | kgram newKgram(1, "."); |
59 | if (tstats[newKgram] == NULL) | 63 | if (tstats[newKgram] == NULL) |
60 | { | 64 | { |
61 | tstats[newKgram] = new map<string, token_data*>(); | 65 | tstats[newKgram] = new std::map<std::string, token_data*>(); |
62 | } | 66 | } |
63 | 67 | ||
64 | (*tstats[newKgram])[canonical] = td; | 68 | (*tstats[newKgram])[canonical] = td; |
@@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
71 | kgram commaKgram(1, ","); | 75 | kgram commaKgram(1, ","); |
72 | if (tstats[commaKgram] == NULL) | 76 | if (tstats[commaKgram] == NULL) |
73 | { | 77 | { |
74 | tstats[commaKgram] = new map<string, token_data*>(); | 78 | tstats[commaKgram] = new std::map<std::string, token_data*>(); |
75 | } | 79 | } |
76 | 80 | ||
77 | (*tstats[commaKgram])[canonical] = td; | 81 | (*tstats[commaKgram])[canonical] = td; |
@@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
164 | } | 168 | } |
165 | } | 169 | } |
166 | 170 | ||
167 | stats = new map<kgram, map<int, token_data*>* >(); | 171 | stats = new std::map<kgram, std::map<int, token_data*>* >(); |
168 | for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) | 172 | for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) |
169 | { | 173 | { |
170 | kgram klist = it->first; | 174 | kgram klist = it->first; |
171 | map<string, token_data*>* probtable = it->second; | 175 | std::map<std::string, token_data*>* probtable = it->second; |
172 | map<int, token_data*>* distribution = new map<int, token_data*>(); | 176 | std::map<int, token_data*>* distribution = new std::map<int, token_data*>(); |
173 | int max = 0; | 177 | int max = 0; |
174 | 178 | ||
175 | for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) | 179 | for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) |
176 | { | 180 | { |
177 | max += kt->second->all; | 181 | max += kt->second->all; |
178 | 182 | ||
@@ -187,17 +191,17 @@ void printKgram(kgram k) | |||
187 | { | 191 | { |
188 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 192 | for (kgram::iterator it = k.begin(); it != k.end(); it++) |
189 | { | 193 | { |
190 | cout << *it << " "; | 194 | std::cout << *it << " "; |
191 | } | 195 | } |
192 | } | 196 | } |
193 | 197 | ||
194 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 198 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
195 | vector<string> kgramstats::randomSentence(int n) | 199 | std::vector<std::string> kgramstats::randomSentence(int n) |
196 | { | 200 | { |
197 | vector<string> result; | 201 | std::vector<std::string> result; |
198 | kgram newKgram(1, "."); | 202 | kgram newKgram(1, "."); |
199 | kgram commaKgram(1, ","); | 203 | kgram commaKgram(1, ","); |
200 | list<string> cur = newKgram; | 204 | std::list<std::string> cur = newKgram; |
201 | int cuts = 0; | 205 | int cuts = 0; |
202 | 206 | ||
203 | for (int i=0; i<n; i++) | 207 | for (int i=0; i<n; i++) |
@@ -221,12 +225,12 @@ vector<string> kgramstats::randomSentence(int n) | |||
221 | cuts++; | 225 | cuts++; |
222 | } | 226 | } |
223 | 227 | ||
224 | map<int, token_data*> distribution = *(*stats)[cur]; | 228 | std::map<int, token_data*> distribution = *(*stats)[cur]; |
225 | int max = distribution.rbegin()->first; | 229 | int max = distribution.rbegin()->first; |
226 | int r = rand() % max; | 230 | int r = rand() % max; |
227 | token_data* next = distribution.upper_bound(r)->second; | 231 | token_data* next = distribution.upper_bound(r)->second; |
228 | 232 | ||
229 | string nextToken(*(next->token)); | 233 | std::string nextToken(*(next->token)); |
230 | int casing = rand() % next->all; | 234 | int casing = rand() % next->all; |
231 | int period = rand() % next->all; | 235 | int period = rand() % next->all; |
232 | int startparen = rand() % next->all; | 236 | int startparen = rand() % next->all; |
@@ -236,7 +240,7 @@ vector<string> kgramstats::randomSentence(int n) | |||
236 | int comma = rand() % next->all; | 240 | int comma = rand() % next->all; |
237 | if (casing < next->uppercase) | 241 | if (casing < next->uppercase) |
238 | { | 242 | { |
239 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 243 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); |
240 | } else if ((casing - next->uppercase) < next->titlecase) | 244 | } else if ((casing - next->uppercase) < next->titlecase) |
241 | { | 245 | { |
242 | nextToken[0] = toupper(nextToken[0]); | 246 | nextToken[0] = toupper(nextToken[0]); |
@@ -246,49 +250,55 @@ vector<string> kgramstats::randomSentence(int n) | |||
246 | { | 250 | { |
247 | nextToken[0] = toupper(nextToken[0]); | 251 | nextToken[0] = toupper(nextToken[0]); |
248 | } | 252 | } |
249 | /* | 253 | |
250 | if (startquote < next->startquote) | 254 | bool mess = (rand() % 100) == 0; |
251 | { | 255 | if (mess) |
252 | nextToken = "\"" + nextToken; | ||
253 | } else if (startparen < next->startparen) | ||
254 | { | 256 | { |
255 | nextToken = "(" + nextToken; | 257 | nextToken = mstats.alternate(nextToken); |
256 | } | 258 | |
257 | 259 | if (startquote < next->startquote) | |
258 | if (period < next->period) | ||
259 | { | ||
260 | if (endquote < next->endquote) | ||
261 | { | 260 | { |
262 | nextToken += "\""; | 261 | nextToken = "\"" + nextToken; |
263 | } else if (endparen < next->endparen) | 262 | } else if (startparen < next->startparen) |
264 | { | 263 | { |
265 | nextToken += ")"; | 264 | nextToken = "(" + nextToken; |
266 | } | 265 | } |
266 | |||
267 | if (period < next->period) | ||
268 | { | ||
269 | if (endquote < next->endquote) | ||
270 | { | ||
271 | nextToken += "\""; | ||
272 | } else if (endparen < next->endparen) | ||
273 | { | ||
274 | nextToken += ")"; | ||
275 | } | ||
267 | 276 | ||
268 | int type = rand() % 6; | 277 | int type = rand() % 6; |
269 | 278 | ||
270 | if (type < 3) | 279 | if (type < 3) |
271 | { | 280 | { |
272 | nextToken += "."; | 281 | nextToken += "."; |
273 | } else if (type < 5) | 282 | } else if (type < 5) |
274 | { | 283 | { |
275 | nextToken += "!"; | 284 | nextToken += "!"; |
276 | } else { | 285 | } else { |
277 | nextToken += "?"; | 286 | nextToken += "?"; |
278 | } | 287 | } |
279 | } else if (comma < next->comma) | 288 | } else if (comma < next->comma) |
280 | { | ||
281 | if (endquote < next->endquote) | ||
282 | { | ||
283 | nextToken += "\""; | ||
284 | } else if (endparen < next->endparen) | ||
285 | { | 289 | { |
286 | nextToken += ")"; | 290 | if (endquote < next->endquote) |
287 | } | 291 | { |
292 | nextToken += "\""; | ||
293 | } else if (endparen < next->endparen) | ||
294 | { | ||
295 | nextToken += ")"; | ||
296 | } | ||
288 | 297 | ||
289 | nextToken += ","; | 298 | nextToken += ","; |
299 | } | ||
290 | } | 300 | } |
291 | */ | 301 | |
292 | if (cur.size() == maxK) | 302 | if (cur.size() == maxK) |
293 | { | 303 | { |
294 | cur.pop_front(); | 304 | cur.pop_front(); |
@@ -297,10 +307,17 @@ vector<string> kgramstats::randomSentence(int n) | |||
297 | /* DEBUG */ | 307 | /* DEBUG */ |
298 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) | 308 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) |
299 | { | 309 | { |
300 | cout << *it << " "; | 310 | std::cout << *it << " "; |
301 | } | 311 | } |
302 | 312 | ||
303 | cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; | 313 | std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")"; |
314 | |||
315 | if (mess) | ||
316 | { | ||
317 | std::cout << " mala " << *(next->token); | ||
318 | } | ||
319 | |||
320 | std::cout << std::endl; | ||
304 | 321 | ||
305 | if ((cur == newKgram) || (cur == commaKgram)) | 322 | if ((cur == newKgram) || (cur == commaKgram)) |
306 | { | 323 | { |
@@ -314,7 +331,15 @@ vector<string> kgramstats::randomSentence(int n) | |||
314 | { | 331 | { |
315 | cur = commaKgram; | 332 | cur = commaKgram; |
316 | } else { | 333 | } else { |
317 | cur.push_back(*(next->token)); | 334 | //if (mess && (rand() % 2 == 0)) |
335 | if (false) | ||
336 | { | ||
337 | // This doesn't work because sometimes the alternate token isn't actually present in the original corpus | ||
338 | cur.clear(); | ||
339 | cur.push_back(nextToken); | ||
340 | } else { | ||
341 | cur.push_back(*(next->token)); | ||
342 | } | ||
318 | } | 343 | } |
319 | 344 | ||
320 | result.push_back(nextToken); | 345 | result.push_back(nextToken); |
@@ -330,11 +355,11 @@ bool removeIf(char c) | |||
330 | 355 | ||
// Returns the canonical form of a token: the same characters with every
// letter converted to lowercase (per the current C locale, via ::tolower).
//
// Punctuation is NOT stripped. The previous body also built a
// punctuation-stripped copy with std::remove_copy_if/removeIf and then
// discarded it; that dead work is dropped here, so the returned value is
// unchanged.
// NOTE(review): if stripping punctuation was the intent, the stripped string
// should be returned instead -- confirm against the callers before changing.
//
// f: the token to canonize. Taken by value, so it is already a private copy
//    and can be lowercased in place without a second copy.
std::string canonize(std::string f)
{
  // ::tolower has undefined behaviour for negative arguments other than EOF,
  // so every char is routed through unsigned char first. This matters for any
  // non-ASCII byte on platforms where plain char is signed.
  std::transform(f.begin(), f.end(), f.begin(), [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  });

  return f;
}