diff options
| -rw-r--r-- | kgramstats.cpp | 56 |
1 file changed, 52 insertions, 4 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 8a21d60..07f41f6 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -91,9 +91,23 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 91 | emoji_file.close(); | 91 | emoji_file.close(); |
| 92 | } | 92 | } |
| 93 | 93 | ||
| 94 | std::cout << "Tokenizing corpus..." << std::endl; | 94 | std::cout << "Tokenizing corpus... 0%" << std::flush; |
| 95 | int len = corpus.length(); | ||
| 96 | int per = 0; | ||
| 97 | int perprime = 0; | ||
| 98 | std::cout.fill(' '); | ||
| 95 | while (end != std::string::npos) | 99 | while (end != std::string::npos) |
| 96 | { | 100 | { |
| 101 | perprime = end * 100 / len; | ||
| 102 | if (perprime != per) | ||
| 103 | { | ||
| 104 | per = perprime; | ||
| 105 | |||
| 106 | std::cout << "\b\b\b\b" << std::right; | ||
| 107 | std::cout.width(3); | ||
| 108 | std::cout << per << "%" << std::flush; | ||
| 109 | } | ||
| 110 | |||
| 97 | end = corpus.find(" ", start); | 111 | end = corpus.find(" ", start); |
| 98 | 112 | ||
| 99 | bool emoji = false; | 113 | bool emoji = false; |
| @@ -294,6 +308,8 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 294 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 308 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
| 295 | } | 309 | } |
| 296 | 310 | ||
| 311 | std::cout << "\b\b\b\b100%" << std::endl; | ||
| 312 | |||
| 297 | delete_aspell_speller(spell_checker); | 313 | delete_aspell_speller(spell_checker); |
| 298 | delete_aspell_config(spell_config); | 314 | delete_aspell_config(spell_config); |
| 299 | 315 | ||
| @@ -322,12 +338,25 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 322 | emoticons.terms.compile(); | 338 | emoticons.terms.compile(); |
| 323 | 339 | ||
| 324 | // kgram distribution | 340 | // kgram distribution |
| 325 | std::cout << "Creating markov chain..." << std::endl; | 341 | std::cout << "Creating markov chain... 0%" << std::flush; |
| 326 | std::map<kgram, std::map<token, token_data> > tstats; | 342 | std::map<kgram, std::map<token, token_data> > tstats; |
| 343 | len = (maxK-1) * tokens.size(); | ||
| 344 | per = 0; | ||
| 345 | perprime = 0; | ||
| 327 | for (int k=1; k<maxK; k++) | 346 | for (int k=1; k<maxK; k++) |
| 328 | { | 347 | { |
| 329 | for (int i=0; i<(tokens.size() - k); i++) | 348 | for (int i=0; i<(tokens.size() - k); i++) |
| 330 | { | 349 | { |
| 350 | perprime = (((k-1)*tokens.size())+i) * 100 / len; | ||
| 351 | if (perprime != per) | ||
| 352 | { | ||
| 353 | per = perprime; | ||
| 354 | |||
| 355 | std::cout << "\b\b\b\b" << std::right; | ||
| 356 | std::cout.width(3); | ||
| 357 | std::cout << per << "%" << std::flush; | ||
| 358 | } | ||
| 359 | |||
| 331 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); | 360 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); |
| 332 | token f = tokens[i+k]; | 361 | token f = tokens[i+k]; |
| 333 | 362 | ||
| @@ -371,11 +400,28 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 371 | } | 400 | } |
| 372 | } | 401 | } |
| 373 | } | 402 | } |
| 374 | 403 | ||
| 404 | std::cout << "\b\b\b\b100%" << std::endl; | ||
| 405 | |||
| 375 | // Condense the kgram distribution | 406 | // Condense the kgram distribution |
| 376 | std::cout << "Compiling kgram distributions..." << std::endl; | 407 | std::cout << "Compiling kgram distributions... 0%"; |
| 408 | len = tstats.size(); | ||
| 409 | per = 0; | ||
| 410 | perprime = 0; | ||
| 411 | int indicator = 0; | ||
| 377 | for (auto& it : tstats) | 412 | for (auto& it : tstats) |
| 378 | { | 413 | { |
| 414 | indicator++; | ||
| 415 | perprime = indicator * 100 / len; | ||
| 416 | if (per != perprime) | ||
| 417 | { | ||
| 418 | per = perprime; | ||
| 419 | |||
| 420 | std::cout << "\b\b\b\b" << std::right; | ||
| 421 | std::cout.width(3); | ||
| 422 | std::cout << per << "%" << std::flush; | ||
| 423 | } | ||
| 424 | |||
| 379 | kgram klist = it.first; | 425 | kgram klist = it.first; |
| 380 | auto& probtable = it.second; | 426 | auto& probtable = it.second; |
| 381 | auto& distribution = stats[klist]; | 427 | auto& distribution = stats[klist]; |
| @@ -388,6 +434,8 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 388 | distribution.emplace(max, kt.second); | 434 | distribution.emplace(max, kt.second); |
| 389 | } | 435 | } |
| 390 | } | 436 | } |
| 437 | |||
| 438 | std::cout << "\b\b\b\b100%" << std::endl; | ||
| 391 | } | 439 | } |
| 392 | 440 | ||
| 393 | void printKgram(kgram k) | 441 | void printKgram(kgram k) |
