diff options
| -rw-r--r-- | ebooks.cpp | 26 | ||||
| -rw-r--r-- | gen.cpp | 16 | ||||
| -rw-r--r-- | kgramstats.cpp | 87 |
3 files changed, 84 insertions, 45 deletions
| diff --git a/ebooks.cpp b/ebooks.cpp index 6bbe25e..27591f4 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
| @@ -24,11 +24,11 @@ int main(int argc, char** args) | |||
| 24 | std::string line; | 24 | std::string line; |
| 25 | while (getline(infile, line)) | 25 | while (getline(infile, line)) |
| 26 | { | 26 | { |
| 27 | corpus += " " + line; | 27 | corpus += line + "\n "; |
| 28 | } | 28 | } |
| 29 | 29 | ||
| 30 | std::cout << "Preprocessing corpus..." << std::endl; | 30 | std::cout << "Preprocessing corpus..." << std::endl; |
| 31 | kgramstats* stats = new kgramstats(corpus, 3); | 31 | kgramstats* stats = new kgramstats(corpus, 4); |
| 32 | 32 | ||
| 33 | std::cout << "Preprocessing freevars..." << std::endl; | 33 | std::cout << "Preprocessing freevars..." << std::endl; |
| 34 | freevars* vars = new freevars(); | 34 | freevars* vars = new freevars(); |
| @@ -38,20 +38,26 @@ int main(int argc, char** args) | |||
| 38 | std::cout << "Generating..." << std::endl; | 38 | std::cout << "Generating..." << std::endl; |
| 39 | for (;;) | 39 | for (;;) |
| 40 | { | 40 | { |
| 41 | std::vector<std::string> doc = stats->randomSentence(rand() % 25 + 5); | 41 | std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); |
| 42 | std::string hi; | 42 | std::string hi; |
| 43 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | 43 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) |
| 44 | { | 44 | { |
| 45 | hi += vars->parse(*it) + " "; | 45 | hi += vars->parse(*it) + " "; |
| 46 | } | 46 | } |
| 47 | 47 | ||
| 48 | size_t lastperiod = hi.find_last_of("."); | 48 | size_t firstperiod = hi.find_first_of(".!?"); |
| 49 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | 49 | if (firstperiod != std::string::npos) |
| 50 | { | 50 | { |
| 51 | hi = hi.substr(0, lastperiod+1); | 51 | hi = hi.substr(firstperiod+2); |
| 52 | } | 52 | } |
| 53 | 53 | ||
| 54 | hi = hi.substr(0,140); | 54 | hi.resize(140); |
| 55 | |||
| 56 | size_t lastperiod = hi.find_last_of(".!?"); | ||
| 57 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
| 58 | { | ||
| 59 | hi = hi.substr(0, lastperiod+1); | ||
| 60 | } | ||
| 55 | 61 | ||
| 56 | twitCurl twitterObj; | 62 | twitCurl twitterObj; |
| 57 | std::string tmpStr, tmpStr2; | 63 | std::string tmpStr, tmpStr2; |
| diff --git a/gen.cpp b/gen.cpp index 3284ffa..7e47d45 100644 --- a/gen.cpp +++ b/gen.cpp | |||
| @@ -38,11 +38,11 @@ int main(int argc, char** args) | |||
| 38 | std::string line; | 38 | std::string line; |
| 39 | while (getline(infile, line)) | 39 | while (getline(infile, line)) |
| 40 | { | 40 | { |
| 41 | corpus += " " + line; | 41 | corpus += line + "\n "; |
| 42 | } | 42 | } |
| 43 | 43 | ||
| 44 | std::cout << "Preprocessing corpus..." << std::endl; | 44 | std::cout << "Preprocessing corpus..." << std::endl; |
| 45 | kgramstats* stats = new kgramstats(corpus, 3); | 45 | kgramstats* stats = new kgramstats(corpus, 4); |
| 46 | 46 | ||
| 47 | std::cout << "Preprocessing freevars..." << std::endl; | 47 | std::cout << "Preprocessing freevars..." << std::endl; |
| 48 | freevars* vars = new freevars(); | 48 | freevars* vars = new freevars(); |
| @@ -52,14 +52,22 @@ int main(int argc, char** args) | |||
| 52 | std::cout << "Generating..." << std::endl; | 52 | std::cout << "Generating..." << std::endl; |
| 53 | for (;;) | 53 | for (;;) |
| 54 | { | 54 | { |
| 55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); | 55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45); |
| 56 | std::string hi; | 56 | std::string hi; |
| 57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | 57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) |
| 58 | { | 58 | { |
| 59 | hi += vars->parse(*it) + " "; | 59 | hi += vars->parse(*it) + " "; |
| 60 | } | 60 | } |
| 61 | |||
| 62 | size_t firstperiod = hi.find_first_of(".!?"); | ||
| 63 | if (firstperiod != std::string::npos) | ||
| 64 | { | ||
| 65 | hi = hi.substr(firstperiod+2); | ||
| 66 | } | ||
| 67 | |||
| 68 | hi.resize(140); | ||
| 61 | 69 | ||
| 62 | size_t lastperiod = hi.find_last_of("."); | 70 | size_t lastperiod = hi.find_last_of(".!?"); |
| 63 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | 71 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) |
| 64 | { | 72 | { |
| 65 | hi = hi.substr(0, lastperiod+1); | 73 | hi = hi.substr(0, lastperiod+1); |
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 41517ca..b0ec68a 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -22,7 +22,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 22 | end = corpus.find(" ", start); | 22 | end = corpus.find(" ", start); |
| 23 | 23 | ||
| 24 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 24 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
| 25 | if (token.compare("")) | 25 | if (token[token.length()-1] == '\n') |
| 26 | { | ||
| 27 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?')) | ||
| 28 | { | ||
| 29 | token.insert(token.length()-1, "."); | ||
| 30 | } | ||
| 31 | |||
| 32 | token.resize(token.length()-1); | ||
| 33 | } | ||
| 34 | |||
| 35 | if (token.compare("") && token.compare(".")) | ||
| 26 | { | 36 | { |
| 27 | mstats.addWord(token); | 37 | mstats.addWord(token); |
| 28 | tokens.push_back(token); | 38 | tokens.push_back(token); |
| @@ -34,14 +44,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 34 | std::map<kgram, std::map<std::string, token_data*>* > tstats; | 44 | std::map<kgram, std::map<std::string, token_data*>* > tstats; |
| 35 | bool newSentence = true; | 45 | bool newSentence = true; |
| 36 | bool newClause = false; | 46 | bool newClause = false; |
| 37 | for (int k=0; k<=maxK; k++) | 47 | for (int k=0; k<maxK; k++) |
| 38 | { | 48 | { |
| 39 | for (int i=0; i<(tokens.size() - k); i++) | 49 | for (int i=0; i<(tokens.size() - k); i++) |
| 40 | { | 50 | { |
| 41 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 51 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); |
| 42 | std::transform(seq.begin(), seq.end(), seq.begin(), canonize); | 52 | std::transform(seq.begin(), seq.end(), seq.begin(), canonize); |
| 43 | std::string f = tokens[i+k]; | 53 | std::string f = tokens[i+k]; |
| 44 | std::string canonical = canonize(f); | 54 | |
| 55 | |||
| 56 | |||
| 57 | std::string canonical = canonize(f); | ||
| 45 | 58 | ||
| 46 | if (tstats[seq] == NULL) | 59 | if (tstats[seq] == NULL) |
| 47 | { | 60 | { |
| @@ -57,7 +70,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 57 | td->token = new std::string(canonical); | 70 | td->token = new std::string(canonical); |
| 58 | td->all++; | 71 | td->all++; |
| 59 | 72 | ||
| 60 | if (newSentence) | 73 | /*if (newSentence) |
| 61 | { | 74 | { |
| 62 | kgram newKgram(1, "."); | 75 | kgram newKgram(1, "."); |
| 63 | if (tstats[newKgram] == NULL) | 76 | if (tstats[newKgram] == NULL) |
| @@ -70,7 +83,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 70 | newSentence = false; | 83 | newSentence = false; |
| 71 | } | 84 | } |
| 72 | 85 | ||
| 73 | if (newClause || newSentence) | 86 | if (newClause) |
| 74 | { | 87 | { |
| 75 | kgram commaKgram(1, ","); | 88 | kgram commaKgram(1, ","); |
| 76 | if (tstats[commaKgram] == NULL) | 89 | if (tstats[commaKgram] == NULL) |
| @@ -156,7 +169,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 156 | } | 169 | } |
| 157 | } | 170 | } |
| 158 | } | 171 | } |
| 159 | } | 172 | }*/ |
| 160 | 173 | ||
| 161 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 174 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) |
| 162 | { | 175 | { |
| @@ -165,6 +178,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 165 | { | 178 | { |
| 166 | td->titlecase++; | 179 | td->titlecase++; |
| 167 | } | 180 | } |
| 181 | |||
| 182 | /*if (k != 0) | ||
| 183 | { | ||
| 184 | if (newSentence) | ||
| 185 | { | ||
| 186 | i += k; | ||
| 187 | } | ||
| 188 | |||
| 189 | newSentence = false; | ||
| 190 | newClause = false; | ||
| 191 | }*/ | ||
| 168 | } | 192 | } |
| 169 | } | 193 | } |
| 170 | 194 | ||
| @@ -201,16 +225,21 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 201 | std::vector<std::string> result; | 225 | std::vector<std::string> result; |
| 202 | kgram newKgram(1, "."); | 226 | kgram newKgram(1, "."); |
| 203 | kgram commaKgram(1, ","); | 227 | kgram commaKgram(1, ","); |
| 204 | std::list<std::string> cur = newKgram; | 228 | std::list<std::string> cur; |
| 205 | int cuts = 0; | 229 | int cuts = 0; |
| 206 | 230 | ||
| 207 | for (int i=0; i<n; i++) | 231 | for (int i=0; i<n; i++) |
| 208 | { | 232 | { |
| 209 | /*if ((cur.size() > 0) && (cur != newKgram)) | 233 | if (cur.size() == maxK) |
| 234 | { | ||
| 235 | cur.pop_front(); | ||
| 236 | } | ||
| 237 | |||
| 238 | if ((cur.size() > 0) && (cur != newKgram)) | ||
| 210 | { | 239 | { |
| 211 | if (rand() % (maxK - cur.size() + 1) == 0) | 240 | if (rand() % (maxK - cur.size() + 1) == 0) |
| 212 | { | 241 | { |
| 213 | while (cur.size() > 1) | 242 | while (cur.size() > 2) |
| 214 | { | 243 | { |
| 215 | if ((rand() % (n)) < cuts) | 244 | if ((rand() % (n)) < cuts) |
| 216 | { | 245 | { |
| @@ -223,7 +252,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 223 | } | 252 | } |
| 224 | 253 | ||
| 225 | cuts++; | 254 | cuts++; |
| 226 | }*/ | 255 | } |
| 227 | 256 | ||
| 228 | std::map<int, token_data*> distribution = *(*stats)[cur]; | 257 | std::map<int, token_data*> distribution = *(*stats)[cur]; |
| 229 | int max = distribution.rbegin()->first; | 258 | int max = distribution.rbegin()->first; |
| @@ -232,12 +261,19 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 232 | 261 | ||
| 233 | std::string nextToken(*(next->token)); | 262 | std::string nextToken(*(next->token)); |
| 234 | int casing = rand() % next->all; | 263 | int casing = rand() % next->all; |
| 235 | int period = rand() % next->all; | 264 | /*int period = rand() % next->all; |
| 236 | int startparen = rand() % next->all; | 265 | int startparen = rand() % next->all; |
| 237 | int endparen = rand() % next->all; | 266 | int endparen = rand() % next->all; |
| 238 | int startquote = rand() % next->all; | 267 | int startquote = rand() % next->all; |
| 239 | int endquote = rand() % next->all; | 268 | int endquote = rand() % next->all; |
| 240 | int comma = rand() % next->all; | 269 | int comma = rand() % next->all;*/ |
| 270 | |||
| 271 | bool mess = (rand() % 100) == 0; | ||
| 272 | if (mess) | ||
| 273 | { | ||
| 274 | nextToken = mstats.alternate(nextToken); | ||
| 275 | } | ||
| 276 | |||
| 241 | if (casing < next->uppercase) | 277 | if (casing < next->uppercase) |
| 242 | { | 278 | { |
| 243 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 279 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); |
| @@ -247,14 +283,8 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 247 | { | 283 | { |
| 248 | nextToken[0] = toupper(nextToken[0]); | 284 | nextToken[0] = toupper(nextToken[0]); |
| 249 | } | 285 | } |
| 250 | |||
| 251 | bool mess = (rand() % 100) == 0; | ||
| 252 | if (mess) | ||
| 253 | { | ||
| 254 | nextToken = mstats.alternate(nextToken); | ||
| 255 | } | ||
| 256 | 286 | ||
| 257 | if (startquote < next->startquote) | 287 | /*if (startquote < next->startquote) |
| 258 | { | 288 | { |
| 259 | nextToken = "\"" + nextToken; | 289 | nextToken = "\"" + nextToken; |
| 260 | } else if (startparen < next->startparen) | 290 | } else if (startparen < next->startparen) |
| @@ -294,12 +324,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 294 | } | 324 | } |
| 295 | 325 | ||
| 296 | nextToken += ","; | 326 | nextToken += ","; |
| 297 | } | 327 | }*/ |
| 298 | |||
| 299 | if (cur.size() == maxK) | ||
| 300 | { | ||
| 301 | cur.pop_front(); | ||
| 302 | } | ||
| 303 | 328 | ||
| 304 | /* DEBUG */ | 329 | /* DEBUG */ |
| 305 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) | 330 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) |
| @@ -316,18 +341,18 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 316 | 341 | ||
| 317 | std::cout << std::endl; | 342 | std::cout << std::endl; |
| 318 | 343 | ||
| 319 | if ((cur == newKgram) || (cur == commaKgram)) | 344 | /*if ((cur == newKgram) || (cur == commaKgram)) |
| 320 | { | 345 | { |
| 321 | cur.pop_front(); | 346 | cur.pop_front(); |
| 322 | } | 347 | } |
| 323 | 348 | ||
| 324 | if ((period < next->period) && ((rand() % 3) == 0)) | 349 | if (period < next->period)// && ((rand() % 3) != 0)) |
| 325 | { | 350 | { |
| 326 | cur = newKgram; | 351 | cur = newKgram; |
| 327 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) | 352 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) |
| 328 | { | 353 | { |
| 329 | cur = commaKgram; | 354 | cur = commaKgram; |
| 330 | } else { | 355 | } else {*/ |
| 331 | //if (mess && (rand() % 2 == 0)) | 356 | //if (mess && (rand() % 2 == 0)) |
| 332 | if (false) | 357 | if (false) |
| 333 | { | 358 | { |
| @@ -337,7 +362,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 337 | } else { | 362 | } else { |
| 338 | cur.push_back(*(next->token)); | 363 | cur.push_back(*(next->token)); |
| 339 | } | 364 | } |
| 340 | } | 365 | //} |
| 341 | 366 | ||
| 342 | result.push_back(nextToken); | 367 | result.push_back(nextToken); |
| 343 | } | 368 | } |
| @@ -347,7 +372,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 347 | 372 | ||
| 348 | bool removeIf(char c) | 373 | bool removeIf(char c) |
| 349 | { | 374 | { |
| 350 | return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',')); | 375 | return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n')); |
| 351 | } | 376 | } |
| 352 | 377 | ||
| 353 | std::string canonize(std::string f) | 378 | std::string canonize(std::string f) |
| @@ -358,5 +383,5 @@ std::string canonize(std::string f) | |||
| 358 | std::string result; | 383 | std::string result; |
| 359 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 384 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); |
| 360 | 385 | ||
| 361 | return result; | 386 | return canonical; |
| 362 | } | 387 | } |
