about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2015-12-30 22:01:37 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2015-12-30 22:01:37 -0500
commit: 0a5c6bd740aff9be53e7ef117e9e926fde3c289e (patch)
tree: 7a0a5c95433b5505b890c8a3176c863a5e802b8e /kgramstats.cpp
parent: 006c6502872cfc51eafd1df06ccb01c3c140a1ed (diff)
download: rawr-ebooks-0a5c6bd740aff9be53e7ef117e9e926fde3c289e.tar.gz
rawr-ebooks-0a5c6bd740aff9be53e7ef117e9e926fde3c289e.tar.bz2
rawr-ebooks-0a5c6bd740aff9be53e7ef117e9e926fde3c289e.zip
guess what! the algorithm
this time it's a literal algorithm again
not canonizing away punctuation
newlines are actually considered new sentences now
we look for the end of a sentence and then start after that
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- kgramstats.cpp 87
1 files changed, 56 insertions, 31 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 41517ca..b0ec68a 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -22,7 +22,17 @@ kgramstats::kgramstats(std::string corpus, int maxK)
22 end = corpus.find(" ", start); 22 end = corpus.find(" ", start);
23 23
24 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 24 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
25 if (token.compare("")) 25 if (token[token.length()-1] == '\n')
26 {
27 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?'))
28 {
29 token.insert(token.length()-1, ".");
30 }
31
32 token.resize(token.length()-1);
33 }
34
35 if (token.compare("") && token.compare("."))
26 { 36 {
27 mstats.addWord(token); 37 mstats.addWord(token);
28 tokens.push_back(token); 38 tokens.push_back(token);
@@ -34,14 +44,17 @@ kgramstats::kgramstats(std::string corpus, int maxK)
34 std::map<kgram, std::map<std::string, token_data*>* > tstats; 44 std::map<kgram, std::map<std::string, token_data*>* > tstats;
35 bool newSentence = true; 45 bool newSentence = true;
36 bool newClause = false; 46 bool newClause = false;
37 for (int k=0; k<=maxK; k++) 47 for (int k=0; k<maxK; k++)
38 { 48 {
39 for (int i=0; i<(tokens.size() - k); i++) 49 for (int i=0; i<(tokens.size() - k); i++)
40 { 50 {
41 kgram seq(tokens.begin()+i, tokens.begin()+i+k); 51 kgram seq(tokens.begin()+i, tokens.begin()+i+k);
42 std::transform(seq.begin(), seq.end(), seq.begin(), canonize); 52 std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
43 std::string f = tokens[i+k]; 53 std::string f = tokens[i+k];
44 std::string canonical = canonize(f); 54
55
56
57 std::string canonical = canonize(f);
45 58
46 if (tstats[seq] == NULL) 59 if (tstats[seq] == NULL)
47 { 60 {
@@ -57,7 +70,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
57 td->token = new std::string(canonical); 70 td->token = new std::string(canonical);
58 td->all++; 71 td->all++;
59 72
60 if (newSentence) 73 /*if (newSentence)
61 { 74 {
62 kgram newKgram(1, "."); 75 kgram newKgram(1, ".");
63 if (tstats[newKgram] == NULL) 76 if (tstats[newKgram] == NULL)
@@ -70,7 +83,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
70 newSentence = false; 83 newSentence = false;
71 } 84 }
72 85
73 if (newClause || newSentence) 86 if (newClause)
74 { 87 {
75 kgram commaKgram(1, ","); 88 kgram commaKgram(1, ",");
76 if (tstats[commaKgram] == NULL) 89 if (tstats[commaKgram] == NULL)
@@ -156,7 +169,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
156 } 169 }
157 } 170 }
158 } 171 }
159 } 172 }*/
160 173
161 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 174 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
162 { 175 {
@@ -165,6 +178,17 @@ kgramstats::kgramstats(std::string corpus, int maxK)
165 { 178 {
166 td->titlecase++; 179 td->titlecase++;
167 } 180 }
181
182 /*if (k != 0)
183 {
184 if (newSentence)
185 {
186 i += k;
187 }
188
189 newSentence = false;
190 newClause = false;
191 }*/
168 } 192 }
169 } 193 }
170 194
@@ -201,16 +225,21 @@ std::vector<std::string> kgramstats::randomSentence(int n)
201 std::vector<std::string> result; 225 std::vector<std::string> result;
202 kgram newKgram(1, "."); 226 kgram newKgram(1, ".");
203 kgram commaKgram(1, ","); 227 kgram commaKgram(1, ",");
204 std::list<std::string> cur = newKgram; 228 std::list<std::string> cur;
205 int cuts = 0; 229 int cuts = 0;
206 230
207 for (int i=0; i<n; i++) 231 for (int i=0; i<n; i++)
208 { 232 {
209 /*if ((cur.size() > 0) && (cur != newKgram)) 233 if (cur.size() == maxK)
234 {
235 cur.pop_front();
236 }
237
238 if ((cur.size() > 0) && (cur != newKgram))
210 { 239 {
211 if (rand() % (maxK - cur.size() + 1) == 0) 240 if (rand() % (maxK - cur.size() + 1) == 0)
212 { 241 {
213 while (cur.size() > 1) 242 while (cur.size() > 2)
214 { 243 {
215 if ((rand() % (n)) < cuts) 244 if ((rand() % (n)) < cuts)
216 { 245 {
@@ -223,7 +252,7 @@ std::vector<std::string> kgramstats::randomSentence(int n)
223 } 252 }
224 253
225 cuts++; 254 cuts++;
226 }*/ 255 }
227 256
228 std::map<int, token_data*> distribution = *(*stats)[cur]; 257 std::map<int, token_data*> distribution = *(*stats)[cur];
229 int max = distribution.rbegin()->first; 258 int max = distribution.rbegin()->first;
@@ -232,12 +261,19 @@ std::vector<std::string> kgramstats::randomSentence(int n)
232 261
233 std::string nextToken(*(next->token)); 262 std::string nextToken(*(next->token));
234 int casing = rand() % next->all; 263 int casing = rand() % next->all;
235 int period = rand() % next->all; 264 /*int period = rand() % next->all;
236 int startparen = rand() % next->all; 265 int startparen = rand() % next->all;
237 int endparen = rand() % next->all; 266 int endparen = rand() % next->all;
238 int startquote = rand() % next->all; 267 int startquote = rand() % next->all;
239 int endquote = rand() % next->all; 268 int endquote = rand() % next->all;
240 int comma = rand() % next->all; 269 int comma = rand() % next->all;*/
270
271 bool mess = (rand() % 100) == 0;
272 if (mess)
273 {
274 nextToken = mstats.alternate(nextToken);
275 }
276
241 if (casing < next->uppercase) 277 if (casing < next->uppercase)
242 { 278 {
243 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 279 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
@@ -247,14 +283,8 @@ std::vector<std::string> kgramstats::randomSentence(int n)
247 { 283 {
248 nextToken[0] = toupper(nextToken[0]); 284 nextToken[0] = toupper(nextToken[0]);
249 } 285 }
250
251 bool mess = (rand() % 100) == 0;
252 if (mess)
253 {
254 nextToken = mstats.alternate(nextToken);
255 }
256 286
257 if (startquote < next->startquote) 287 /*if (startquote < next->startquote)
258 { 288 {
259 nextToken = "\"" + nextToken; 289 nextToken = "\"" + nextToken;
260 } else if (startparen < next->startparen) 290 } else if (startparen < next->startparen)
@@ -294,12 +324,7 @@ std::vector<std::string> kgramstats::randomSentence(int n)
294 } 324 }
295 325
296 nextToken += ","; 326 nextToken += ",";
297 } 327 }*/
298
299 if (cur.size() == maxK)
300 {
301 cur.pop_front();
302 }
303 328
304 /* DEBUG */ 329 /* DEBUG */
305 for (kgram::iterator it = cur.begin(); it != cur.end(); it++) 330 for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
@@ -316,18 +341,18 @@ std::vector<std::string> kgramstats::randomSentence(int n)
316 341
317 std::cout << std::endl; 342 std::cout << std::endl;
318 343
319 if ((cur == newKgram) || (cur == commaKgram)) 344 /*if ((cur == newKgram) || (cur == commaKgram))
320 { 345 {
321 cur.pop_front(); 346 cur.pop_front();
322 } 347 }
323 348
324 if ((period < next->period) && ((rand() % 3) == 0)) 349 if (period < next->period)// && ((rand() % 3) != 0))
325 { 350 {
326 cur = newKgram; 351 cur = newKgram;
327 } else if ((comma < next->comma) && ((rand() % 3) == 0)) 352 } else if ((comma < next->comma) && ((rand() % 3) == 0))
328 { 353 {
329 cur = commaKgram; 354 cur = commaKgram;
330 } else { 355 } else {*/
331 //if (mess && (rand() % 2 == 0)) 356 //if (mess && (rand() % 2 == 0))
332 if (false) 357 if (false)
333 { 358 {
@@ -337,7 +362,7 @@ std::vector<std::string> kgramstats::randomSentence(int n)
337 } else { 362 } else {
338 cur.push_back(*(next->token)); 363 cur.push_back(*(next->token));
339 } 364 }
340 } 365 //}
341 366
342 result.push_back(nextToken); 367 result.push_back(nextToken);
343 } 368 }
@@ -347,7 +372,7 @@ std::vector<std::string> kgramstats::randomSentence(int n)
347 372
348 bool removeIf(char c) 373 bool removeIf(char c)
349 { 374 {
350 return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',')); 375 return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n'));
351 } 376 }
352 377
353 std::string canonize(std::string f) 378 std::string canonize(std::string f)
@@ -358,5 +383,5 @@ std::string canonize(std::string f)
358 std::string result; 383 std::string result;
359 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 384 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
360 385
361 return result; 386 return canonical;
362 } 387 }