diff options
-rw-r--r-- | ebooks.cpp | 26 | ||||
-rw-r--r-- | gen.cpp | 16 | ||||
-rw-r--r-- | kgramstats.cpp | 87 |
3 files changed, 84 insertions, 45 deletions
diff --git a/ebooks.cpp b/ebooks.cpp index 6bbe25e..27591f4 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
@@ -24,11 +24,11 @@ int main(int argc, char** args) | |||
24 | std::string line; | 24 | std::string line; |
25 | while (getline(infile, line)) | 25 | while (getline(infile, line)) |
26 | { | 26 | { |
27 | corpus += " " + line; | 27 | corpus += line + "\n "; |
28 | } | 28 | } |
29 | 29 | ||
30 | std::cout << "Preprocessing corpus..." << std::endl; | 30 | std::cout << "Preprocessing corpus..." << std::endl; |
31 | kgramstats* stats = new kgramstats(corpus, 3); | 31 | kgramstats* stats = new kgramstats(corpus, 4); |
32 | 32 | ||
33 | std::cout << "Preprocessing freevars..." << std::endl; | 33 | std::cout << "Preprocessing freevars..." << std::endl; |
34 | freevars* vars = new freevars(); | 34 | freevars* vars = new freevars(); |
@@ -38,20 +38,26 @@ int main(int argc, char** args) | |||
38 | std::cout << "Generating..." << std::endl; | 38 | std::cout << "Generating..." << std::endl; |
39 | for (;;) | 39 | for (;;) |
40 | { | 40 | { |
41 | std::vector<std::string> doc = stats->randomSentence(rand() % 25 + 5); | 41 | std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); |
42 | std::string hi; | 42 | std::string hi; |
43 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | 43 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) |
44 | { | 44 | { |
45 | hi += vars->parse(*it) + " "; | 45 | hi += vars->parse(*it) + " "; |
46 | } | 46 | } |
47 | 47 | ||
48 | size_t lastperiod = hi.find_last_of("."); | 48 | size_t firstperiod = hi.find_first_of(".!?"); |
49 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | 49 | if (firstperiod != std::string::npos) |
50 | { | 50 | { |
51 | hi = hi.substr(0, lastperiod+1); | 51 | hi = hi.substr(firstperiod+2); |
52 | } | 52 | } |
53 | 53 | ||
54 | hi = hi.substr(0,140); | 54 | hi.resize(140); |
55 | |||
56 | size_t lastperiod = hi.find_last_of(".!?"); | ||
57 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
58 | { | ||
59 | hi = hi.substr(0, lastperiod+1); | ||
60 | } | ||
55 | 61 | ||
56 | twitCurl twitterObj; | 62 | twitCurl twitterObj; |
57 | std::string tmpStr, tmpStr2; | 63 | std::string tmpStr, tmpStr2; |
diff --git a/gen.cpp b/gen.cpp index 3284ffa..7e47d45 100644 --- a/gen.cpp +++ b/gen.cpp | |||
@@ -38,11 +38,11 @@ int main(int argc, char** args) | |||
38 | std::string line; | 38 | std::string line; |
39 | while (getline(infile, line)) | 39 | while (getline(infile, line)) |
40 | { | 40 | { |
41 | corpus += " " + line; | 41 | corpus += line + "\n "; |
42 | } | 42 | } |
43 | 43 | ||
44 | std::cout << "Preprocessing corpus..." << std::endl; | 44 | std::cout << "Preprocessing corpus..." << std::endl; |
45 | kgramstats* stats = new kgramstats(corpus, 3); | 45 | kgramstats* stats = new kgramstats(corpus, 4); |
46 | 46 | ||
47 | std::cout << "Preprocessing freevars..." << std::endl; | 47 | std::cout << "Preprocessing freevars..." << std::endl; |
48 | freevars* vars = new freevars(); | 48 | freevars* vars = new freevars(); |
@@ -52,14 +52,22 @@ int main(int argc, char** args) | |||
52 | std::cout << "Generating..." << std::endl; | 52 | std::cout << "Generating..." << std::endl; |
53 | for (;;) | 53 | for (;;) |
54 | { | 54 | { |
55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); | 55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45); |
56 | std::string hi; | 56 | std::string hi; |
57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | 57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) |
58 | { | 58 | { |
59 | hi += vars->parse(*it) + " "; | 59 | hi += vars->parse(*it) + " "; |
60 | } | 60 | } |
61 | |||
62 | size_t firstperiod = hi.find_first_of(".!?"); | ||
63 | if (firstperiod != std::string::npos) | ||
64 | { | ||
65 | hi = hi.substr(firstperiod+2); | ||
66 | } | ||
67 | |||
68 | hi.resize(140); | ||
61 | 69 | ||
62 | size_t lastperiod = hi.find_last_of("."); | 70 | size_t lastperiod = hi.find_last_of(".!?"); |
63 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | 71 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) |
64 | { | 72 | { |
65 | hi = hi.substr(0, lastperiod+1); | 73 | hi = hi.substr(0, lastperiod+1); |
diff --git a/kgramstats.cpp b/kgramstats.cpp index 41517ca..b0ec68a 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -22,7 +22,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
22 | end = corpus.find(" ", start); | 22 | end = corpus.find(" ", start); |
23 | 23 | ||
24 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 24 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
25 | if (token.compare("")) | 25 | if (token[token.length()-1] == '\n') |
26 | { | ||
27 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?')) | ||
28 | { | ||
29 | token.insert(token.length()-1, "."); | ||
30 | } | ||
31 | |||
32 | token.resize(token.length()-1); | ||
33 | } | ||
34 | |||
35 | if (token.compare("") && token.compare(".")) | ||
26 | { | 36 | { |
27 | mstats.addWord(token); | 37 | mstats.addWord(token); |
28 | tokens.push_back(token); | 38 | tokens.push_back(token); |
@@ -34,14 +44,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
34 | std::map<kgram, std::map<std::string, token_data*>* > tstats; | 44 | std::map<kgram, std::map<std::string, token_data*>* > tstats; |
35 | bool newSentence = true; | 45 | bool newSentence = true; |
36 | bool newClause = false; | 46 | bool newClause = false; |
37 | for (int k=0; k<=maxK; k++) | 47 | for (int k=0; k<maxK; k++) |
38 | { | 48 | { |
39 | for (int i=0; i<(tokens.size() - k); i++) | 49 | for (int i=0; i<(tokens.size() - k); i++) |
40 | { | 50 | { |
41 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 51 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); |
42 | std::transform(seq.begin(), seq.end(), seq.begin(), canonize); | 52 | std::transform(seq.begin(), seq.end(), seq.begin(), canonize); |
43 | std::string f = tokens[i+k]; | 53 | std::string f = tokens[i+k]; |
44 | std::string canonical = canonize(f); | 54 | |
55 | |||
56 | |||
57 | std::string canonical = canonize(f); | ||
45 | 58 | ||
46 | if (tstats[seq] == NULL) | 59 | if (tstats[seq] == NULL) |
47 | { | 60 | { |
@@ -57,7 +70,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
57 | td->token = new std::string(canonical); | 70 | td->token = new std::string(canonical); |
58 | td->all++; | 71 | td->all++; |
59 | 72 | ||
60 | if (newSentence) | 73 | /*if (newSentence) |
61 | { | 74 | { |
62 | kgram newKgram(1, "."); | 75 | kgram newKgram(1, "."); |
63 | if (tstats[newKgram] == NULL) | 76 | if (tstats[newKgram] == NULL) |
@@ -70,7 +83,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
70 | newSentence = false; | 83 | newSentence = false; |
71 | } | 84 | } |
72 | 85 | ||
73 | if (newClause || newSentence) | 86 | if (newClause) |
74 | { | 87 | { |
75 | kgram commaKgram(1, ","); | 88 | kgram commaKgram(1, ","); |
76 | if (tstats[commaKgram] == NULL) | 89 | if (tstats[commaKgram] == NULL) |
@@ -156,7 +169,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
156 | } | 169 | } |
157 | } | 170 | } |
158 | } | 171 | } |
159 | } | 172 | }*/ |
160 | 173 | ||
161 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 174 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) |
162 | { | 175 | { |
@@ -165,6 +178,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
165 | { | 178 | { |
166 | td->titlecase++; | 179 | td->titlecase++; |
167 | } | 180 | } |
181 | |||
182 | /*if (k != 0) | ||
183 | { | ||
184 | if (newSentence) | ||
185 | { | ||
186 | i += k; | ||
187 | } | ||
188 | |||
189 | newSentence = false; | ||
190 | newClause = false; | ||
191 | }*/ | ||
168 | } | 192 | } |
169 | } | 193 | } |
170 | 194 | ||
@@ -201,16 +225,21 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
201 | std::vector<std::string> result; | 225 | std::vector<std::string> result; |
202 | kgram newKgram(1, "."); | 226 | kgram newKgram(1, "."); |
203 | kgram commaKgram(1, ","); | 227 | kgram commaKgram(1, ","); |
204 | std::list<std::string> cur = newKgram; | 228 | std::list<std::string> cur; |
205 | int cuts = 0; | 229 | int cuts = 0; |
206 | 230 | ||
207 | for (int i=0; i<n; i++) | 231 | for (int i=0; i<n; i++) |
208 | { | 232 | { |
209 | /*if ((cur.size() > 0) && (cur != newKgram)) | 233 | if (cur.size() == maxK) |
234 | { | ||
235 | cur.pop_front(); | ||
236 | } | ||
237 | |||
238 | if ((cur.size() > 0) && (cur != newKgram)) | ||
210 | { | 239 | { |
211 | if (rand() % (maxK - cur.size() + 1) == 0) | 240 | if (rand() % (maxK - cur.size() + 1) == 0) |
212 | { | 241 | { |
213 | while (cur.size() > 1) | 242 | while (cur.size() > 2) |
214 | { | 243 | { |
215 | if ((rand() % (n)) < cuts) | 244 | if ((rand() % (n)) < cuts) |
216 | { | 245 | { |
@@ -223,7 +252,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
223 | } | 252 | } |
224 | 253 | ||
225 | cuts++; | 254 | cuts++; |
226 | }*/ | 255 | } |
227 | 256 | ||
228 | std::map<int, token_data*> distribution = *(*stats)[cur]; | 257 | std::map<int, token_data*> distribution = *(*stats)[cur]; |
229 | int max = distribution.rbegin()->first; | 258 | int max = distribution.rbegin()->first; |
@@ -232,12 +261,19 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
232 | 261 | ||
233 | std::string nextToken(*(next->token)); | 262 | std::string nextToken(*(next->token)); |
234 | int casing = rand() % next->all; | 263 | int casing = rand() % next->all; |
235 | int period = rand() % next->all; | 264 | /*int period = rand() % next->all; |
236 | int startparen = rand() % next->all; | 265 | int startparen = rand() % next->all; |
237 | int endparen = rand() % next->all; | 266 | int endparen = rand() % next->all; |
238 | int startquote = rand() % next->all; | 267 | int startquote = rand() % next->all; |
239 | int endquote = rand() % next->all; | 268 | int endquote = rand() % next->all; |
240 | int comma = rand() % next->all; | 269 | int comma = rand() % next->all;*/ |
270 | |||
271 | bool mess = (rand() % 100) == 0; | ||
272 | if (mess) | ||
273 | { | ||
274 | nextToken = mstats.alternate(nextToken); | ||
275 | } | ||
276 | |||
241 | if (casing < next->uppercase) | 277 | if (casing < next->uppercase) |
242 | { | 278 | { |
243 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 279 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); |
@@ -247,14 +283,8 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
247 | { | 283 | { |
248 | nextToken[0] = toupper(nextToken[0]); | 284 | nextToken[0] = toupper(nextToken[0]); |
249 | } | 285 | } |
250 | |||
251 | bool mess = (rand() % 100) == 0; | ||
252 | if (mess) | ||
253 | { | ||
254 | nextToken = mstats.alternate(nextToken); | ||
255 | } | ||
256 | 286 | ||
257 | if (startquote < next->startquote) | 287 | /*if (startquote < next->startquote) |
258 | { | 288 | { |
259 | nextToken = "\"" + nextToken; | 289 | nextToken = "\"" + nextToken; |
260 | } else if (startparen < next->startparen) | 290 | } else if (startparen < next->startparen) |
@@ -294,12 +324,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
294 | } | 324 | } |
295 | 325 | ||
296 | nextToken += ","; | 326 | nextToken += ","; |
297 | } | 327 | }*/ |
298 | |||
299 | if (cur.size() == maxK) | ||
300 | { | ||
301 | cur.pop_front(); | ||
302 | } | ||
303 | 328 | ||
304 | /* DEBUG */ | 329 | /* DEBUG */ |
305 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) | 330 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) |
@@ -316,18 +341,18 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
316 | 341 | ||
317 | std::cout << std::endl; | 342 | std::cout << std::endl; |
318 | 343 | ||
319 | if ((cur == newKgram) || (cur == commaKgram)) | 344 | /*if ((cur == newKgram) || (cur == commaKgram)) |
320 | { | 345 | { |
321 | cur.pop_front(); | 346 | cur.pop_front(); |
322 | } | 347 | } |
323 | 348 | ||
324 | if ((period < next->period) && ((rand() % 3) == 0)) | 349 | if (period < next->period)// && ((rand() % 3) != 0)) |
325 | { | 350 | { |
326 | cur = newKgram; | 351 | cur = newKgram; |
327 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) | 352 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) |
328 | { | 353 | { |
329 | cur = commaKgram; | 354 | cur = commaKgram; |
330 | } else { | 355 | } else {*/ |
331 | //if (mess && (rand() % 2 == 0)) | 356 | //if (mess && (rand() % 2 == 0)) |
332 | if (false) | 357 | if (false) |
333 | { | 358 | { |
@@ -337,7 +362,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
337 | } else { | 362 | } else { |
338 | cur.push_back(*(next->token)); | 363 | cur.push_back(*(next->token)); |
339 | } | 364 | } |
340 | } | 365 | //} |
341 | 366 | ||
342 | result.push_back(nextToken); | 367 | result.push_back(nextToken); |
343 | } | 368 | } |
@@ -347,7 +372,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
347 | 372 | ||
348 | bool removeIf(char c) | 373 | bool removeIf(char c) |
349 | { | 374 | { |
350 | return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',')); | 375 | return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n')); |
351 | } | 376 | } |
352 | 377 | ||
353 | std::string canonize(std::string f) | 378 | std::string canonize(std::string f) |
@@ -358,5 +383,5 @@ std::string canonize(std::string f) | |||
358 | std::string result; | 383 | std::string result; |
359 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 384 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); |
360 | 385 | ||
361 | return result; | 386 | return canonical; |
362 | } | 387 | } |