about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2015-12-30 22:01:37 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2015-12-30 22:01:37 -0500
commit 0a5c6bd740aff9be53e7ef117e9e926fde3c289e (patch)
tree 7a0a5c95433b5505b890c8a3176c863a5e802b8e
parent 006c6502872cfc51eafd1df06ccb01c3c140a1ed (diff)
download rawr-ebooks-0a5c6bd740aff9be53e7ef117e9e926fde3c289e.tar.gz
rawr-ebooks-0a5c6bd740aff9be53e7ef117e9e926fde3c289e.tar.bz2
rawr-ebooks-0a5c6bd740aff9be53e7ef117e9e926fde3c289e.zip
guess what! the algorithm
this time it's a literal algorithm again
not canonizing away punctuation
newlines are actually considered new sentences now
we look for the end of a sentence and then start after that
-rw-r--r-- ebooks.cpp 26
-rw-r--r-- gen.cpp 16
-rw-r--r-- kgramstats.cpp 87
3 files changed, 84 insertions, 45 deletions
diff --git a/ebooks.cpp b/ebooks.cpp index 6bbe25e..27591f4 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -24,11 +24,11 @@ int main(int argc, char** args)
24 std::string line; 24 std::string line;
25 while (getline(infile, line)) 25 while (getline(infile, line))
26 { 26 {
27 corpus += " " + line; 27 corpus += line + "\n ";
28 } 28 }
29 29
30 std::cout << "Preprocessing corpus..." << std::endl; 30 std::cout << "Preprocessing corpus..." << std::endl;
31 kgramstats* stats = new kgramstats(corpus, 3); 31 kgramstats* stats = new kgramstats(corpus, 4);
32 32
33 std::cout << "Preprocessing freevars..." << std::endl; 33 std::cout << "Preprocessing freevars..." << std::endl;
34 freevars* vars = new freevars(); 34 freevars* vars = new freevars();
@@ -38,20 +38,26 @@ int main(int argc, char** args)
38 std::cout << "Generating..." << std::endl; 38 std::cout << "Generating..." << std::endl;
39 for (;;) 39 for (;;)
40 { 40 {
41 std::vector<std::string> doc = stats->randomSentence(rand() % 25 + 5); 41 std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
42 std::string hi; 42 std::string hi;
43 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) 43 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
44 { 44 {
45 hi += vars->parse(*it) + " "; 45 hi += vars->parse(*it) + " ";
46 } 46 }
47 47
48 size_t lastperiod = hi.find_last_of("."); 48 size_t firstperiod = hi.find_first_of(".!?");
49 if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) 49 if (firstperiod != std::string::npos)
50 { 50 {
51 hi = hi.substr(0, lastperiod+1); 51 hi = hi.substr(firstperiod+2);
52 } 52 }
53 53
54 hi = hi.substr(0,140); 54 hi.resize(140);
55
56 size_t lastperiod = hi.find_last_of(".!?");
57 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
58 {
59 hi = hi.substr(0, lastperiod+1);
60 }
55 61
56 twitCurl twitterObj; 62 twitCurl twitterObj;
57 std::string tmpStr, tmpStr2; 63 std::string tmpStr, tmpStr2;
diff --git a/gen.cpp b/gen.cpp index 3284ffa..7e47d45 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -38,11 +38,11 @@ int main(int argc, char** args)
38 std::string line; 38 std::string line;
39 while (getline(infile, line)) 39 while (getline(infile, line))
40 { 40 {
41 corpus += " " + line; 41 corpus += line + "\n ";
42 } 42 }
43 43
44 std::cout << "Preprocessing corpus..." << std::endl; 44 std::cout << "Preprocessing corpus..." << std::endl;
45 kgramstats* stats = new kgramstats(corpus, 3); 45 kgramstats* stats = new kgramstats(corpus, 4);
46 46
47 std::cout << "Preprocessing freevars..." << std::endl; 47 std::cout << "Preprocessing freevars..." << std::endl;
48 freevars* vars = new freevars(); 48 freevars* vars = new freevars();
@@ -52,14 +52,22 @@ int main(int argc, char** args)
52 std::cout << "Generating..." << std::endl; 52 std::cout << "Generating..." << std::endl;
53 for (;;) 53 for (;;)
54 { 54 {
55 std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); 55 std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45);
56 std::string hi; 56 std::string hi;
57 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) 57 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
58 { 58 {
59 hi += vars->parse(*it) + " "; 59 hi += vars->parse(*it) + " ";
60 } 60 }
61
62 size_t firstperiod = hi.find_first_of(".!?");
63 if (firstperiod != std::string::npos)
64 {
65 hi = hi.substr(firstperiod+2);
66 }
67
68 hi.resize(140);
61 69
62 size_t lastperiod = hi.find_last_of("."); 70 size_t lastperiod = hi.find_last_of(".!?");
63 if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) 71 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
64 { 72 {
65 hi = hi.substr(0, lastperiod+1); 73 hi = hi.substr(0, lastperiod+1);
diff --git a/kgramstats.cpp b/kgramstats.cpp index 41517ca..b0ec68a 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -22,7 +22,17 @@ kgramstats::kgramstats(std::string corpus, int maxK)
22 end = corpus.find(" ", start); 22 end = corpus.find(" ", start);
23 23
24 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 24 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
25 if (token.compare("")) 25 if (token[token.length()-1] == '\n')
26 {
27 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?'))
28 {
29 token.insert(token.length()-1, ".");
30 }
31
32 token.resize(token.length()-1);
33 }
34
35 if (token.compare("") && token.compare("."))
26 { 36 {
27 mstats.addWord(token); 37 mstats.addWord(token);
28 tokens.push_back(token); 38 tokens.push_back(token);
@@ -34,14 +44,17 @@ kgramstats::kgramstats(std::string corpus, int maxK)
34 std::map<kgram, std::map<std::string, token_data*>* > tstats; 44 std::map<kgram, std::map<std::string, token_data*>* > tstats;
35 bool newSentence = true; 45 bool newSentence = true;
36 bool newClause = false; 46 bool newClause = false;
37 for (int k=0; k<=maxK; k++) 47 for (int k=0; k<maxK; k++)
38 { 48 {
39 for (int i=0; i<(tokens.size() - k); i++) 49 for (int i=0; i<(tokens.size() - k); i++)
40 { 50 {
41 kgram seq(tokens.begin()+i, tokens.begin()+i+k); 51 kgram seq(tokens.begin()+i, tokens.begin()+i+k);
42 std::transform(seq.begin(), seq.end(), seq.begin(), canonize); 52 std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
43 std::string f = tokens[i+k]; 53 std::string f = tokens[i+k];
44 std::string canonical = canonize(f); 54
55
56
57 std::string canonical = canonize(f);
45 58
46 if (tstats[seq] == NULL) 59 if (tstats[seq] == NULL)
47 { 60 {
@@ -57,7 +70,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
57 td->token = new std::string(canonical); 70 td->token = new std::string(canonical);
58 td->all++; 71 td->all++;
59 72
60 if (newSentence) 73 /*if (newSentence)
61 { 74 {
62 kgram newKgram(1, "."); 75 kgram newKgram(1, ".");
63 if (tstats[newKgram] == NULL) 76 if (tstats[newKgram] == NULL)
@@ -70,7 +83,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
70 newSentence = false; 83 newSentence = false;
71 } 84 }
72 85
73 if (newClause || newSentence) 86 if (newClause)
74 { 87 {
75 kgram commaKgram(1, ","); 88 kgram commaKgram(1, ",");
76 if (tstats[commaKgram] == NULL) 89 if (tstats[commaKgram] == NULL)
@@ -156,7 +169,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
156 } 169 }
157 } 170 }
158 } 171 }
159 } 172 }*/
160 173
161 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 174 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
162 { 175 {
@@ -165,6 +178,17 @@ kgramstats::kgramstats(std::string corpus, int maxK)
165 { 178 {
166 td->titlecase++; 179 td->titlecase++;
167 } 180 }
181
182 /*if (k != 0)
183 {
184 if (newSentence)
185 {
186 i += k;
187 }
188
189 newSentence = false;
190 newClause = false;
191 }*/
168 } 192 }
169 } 193 }
170 194
@@ -201,16 +225,21 @@ std::vector<std::string> kgramstats::randomSentence(int n)
201 std::vector<std::string> result; 225 std::vector<std::string> result;
202 kgram newKgram(1, "."); 226 kgram newKgram(1, ".");
203 kgram commaKgram(1, ","); 227 kgram commaKgram(1, ",");
204 std::list<std::string> cur = newKgram; 228 std::list<std::string> cur;
205 int cuts = 0; 229 int cuts = 0;
206 230
207 for (int i=0; i<n; i++) 231 for (int i=0; i<n; i++)
208 { 232 {
209 /*if ((cur.size() > 0) && (cur != newKgram)) 233 if (cur.size() == maxK)
234 {
235 cur.pop_front();
236 }
237
238 if ((cur.size() > 0) && (cur != newKgram))
210 { 239 {
211 if (rand() % (maxK - cur.size() + 1) == 0) 240 if (rand() % (maxK - cur.size() + 1) == 0)
212 { 241 {
213 while (cur.size() > 1) 242 while (cur.size() > 2)
214 { 243 {
215 if ((rand() % (n)) < cuts) 244 if ((rand() % (n)) < cuts)
216 { 245 {
@@ -223,7 +252,7 @@ std::vector<std::string> kgramstats::randomSentence(int n)
223 } 252 }
224 253
225 cuts++; 254 cuts++;
226 }*/ 255 }
227 256
228 std::map<int, token_data*> distribution = *(*stats)[cur]; 257 std::map<int, token_data*> distribution = *(*stats)[cur];
229 int max = distribution.rbegin()->first; 258 int max = distribution.rbegin()->first;
@@ -232,12 +261,19 @@ std::vector<std::string> kgramstats::randomSentence(int n)
232 261
233 std::string nextToken(*(next->token)); 262 std::string nextToken(*(next->token));
234 int casing = rand() % next->all; 263 int casing = rand() % next->all;
235 int period = rand() % next->all; 264 /*int period = rand() % next->all;
236 int startparen = rand() % next->all; 265 int startparen = rand() % next->all;
237 int endparen = rand() % next->all; 266 int endparen = rand() % next->all;
238 int startquote = rand() % next->all; 267 int startquote = rand() % next->all;
239 int endquote = rand() % next->all; 268 int endquote = rand() % next->all;
240 int comma = rand() % next->all; 269 int comma = rand() % next->all;*/
270
271 bool mess = (rand() % 100) == 0;
272 if (mess)
273 {
274 nextToken = mstats.alternate(nextToken);
275 }
276
241 if (casing < next->uppercase) 277 if (casing < next->uppercase)
242 { 278 {
243 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 279 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
@@ -247,14 +283,8 @@ std::vector<std::string> kgramstats::randomSentence(int n)
247 { 283 {
248 nextToken[0] = toupper(nextToken[0]); 284 nextToken[0] = toupper(nextToken[0]);
249 } 285 }
250
251 bool mess = (rand() % 100) == 0;
252 if (mess)
253 {
254 nextToken = mstats.alternate(nextToken);
255 }
256 286
257 if (startquote < next->startquote) 287 /*if (startquote < next->startquote)
258 { 288 {
259 nextToken = "\"" + nextToken; 289 nextToken = "\"" + nextToken;
260 } else if (startparen < next->startparen) 290 } else if (startparen < next->startparen)
@@ -294,12 +324,7 @@ std::vector<std::string> kgramstats::randomSentence(int n)
294 } 324 }
295 325
296 nextToken += ","; 326 nextToken += ",";
297 } 327 }*/
298
299 if (cur.size() == maxK)
300 {
301 cur.pop_front();
302 }
303 328
304 /* DEBUG */ 329 /* DEBUG */
305 for (kgram::iterator it = cur.begin(); it != cur.end(); it++) 330 for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
@@ -316,18 +341,18 @@ std::vector<std::string> kgramstats::randomSentence(int n)
316 341
317 std::cout << std::endl; 342 std::cout << std::endl;
318 343
319 if ((cur == newKgram) || (cur == commaKgram)) 344 /*if ((cur == newKgram) || (cur == commaKgram))
320 { 345 {
321 cur.pop_front(); 346 cur.pop_front();
322 } 347 }
323 348
324 if ((period < next->period) && ((rand() % 3) == 0)) 349 if (period < next->period)// && ((rand() % 3) != 0))
325 { 350 {
326 cur = newKgram; 351 cur = newKgram;
327 } else if ((comma < next->comma) && ((rand() % 3) == 0)) 352 } else if ((comma < next->comma) && ((rand() % 3) == 0))
328 { 353 {
329 cur = commaKgram; 354 cur = commaKgram;
330 } else { 355 } else {*/
331 //if (mess && (rand() % 2 == 0)) 356 //if (mess && (rand() % 2 == 0))
332 if (false) 357 if (false)
333 { 358 {
@@ -337,7 +362,7 @@ std::vector<std::string> kgramstats::randomSentence(int n)
337 } else { 362 } else {
338 cur.push_back(*(next->token)); 363 cur.push_back(*(next->token));
339 } 364 }
340 } 365 //}
341 366
342 result.push_back(nextToken); 367 result.push_back(nextToken);
343 } 368 }
@@ -347,7 +372,7 @@ std::vector<std::string> kgramstats::randomSentence(int n)
347 372
348bool removeIf(char c) 373bool removeIf(char c)
349{ 374{
350 return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',')); 375 return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n'));
351} 376}
352 377
353std::string canonize(std::string f) 378std::string canonize(std::string f)
@@ -358,5 +383,5 @@ std::string canonize(std::string f)
358 std::string result; 383 std::string result;
359 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 384 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
360 385
361 return result; 386 return canonical;
362} 387}