diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-20 00:56:37 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-20 00:56:37 -0500 |
commit | 1ed82c3071218d759cae6966fa6e9dbf47d38003 (patch) | |
tree | ee989fbc01cc4b9b69e7112ef44c6921a83f4f76 | |
parent | b090f6db27534d0bd0bbfaf068efada7b30aa5ac (diff) | |
download | rawr-ebooks-1ed82c3071218d759cae6966fa6e9dbf47d38003.tar.gz rawr-ebooks-1ed82c3071218d759cae6966fa6e9dbf47d38003.tar.bz2 rawr-ebooks-1ed82c3071218d759cae6966fa6e9dbf47d38003.zip |
Added percentage display to preprocessing stage
-rw-r--r-- | kgramstats.cpp | 56 |
1 file changed, 52 insertions, 4 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 8a21d60..07f41f6 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -91,9 +91,23 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
91 | emoji_file.close(); | 91 | emoji_file.close(); |
92 | } | 92 | } |
93 | 93 | ||
94 | std::cout << "Tokenizing corpus..." << std::endl; | 94 | std::cout << "Tokenizing corpus... 0%" << std::flush; |
95 | int len = corpus.length(); | ||
96 | int per = 0; | ||
97 | int perprime = 0; | ||
98 | std::cout.fill(' '); | ||
95 | while (end != std::string::npos) | 99 | while (end != std::string::npos) |
96 | { | 100 | { |
101 | perprime = end * 100 / len; | ||
102 | if (perprime != per) | ||
103 | { | ||
104 | per = perprime; | ||
105 | |||
106 | std::cout << "\b\b\b\b" << std::right; | ||
107 | std::cout.width(3); | ||
108 | std::cout << per << "%" << std::flush; | ||
109 | } | ||
110 | |||
97 | end = corpus.find(" ", start); | 111 | end = corpus.find(" ", start); |
98 | 112 | ||
99 | bool emoji = false; | 113 | bool emoji = false; |
@@ -294,6 +308,8 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
294 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 308 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
295 | } | 309 | } |
296 | 310 | ||
311 | std::cout << "\b\b\b\b100%" << std::endl; | ||
312 | |||
297 | delete_aspell_speller(spell_checker); | 313 | delete_aspell_speller(spell_checker); |
298 | delete_aspell_config(spell_config); | 314 | delete_aspell_config(spell_config); |
299 | 315 | ||
@@ -322,12 +338,25 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
322 | emoticons.terms.compile(); | 338 | emoticons.terms.compile(); |
323 | 339 | ||
324 | // kgram distribution | 340 | // kgram distribution |
325 | std::cout << "Creating markov chain..." << std::endl; | 341 | std::cout << "Creating markov chain... 0%" << std::flush; |
326 | std::map<kgram, std::map<token, token_data> > tstats; | 342 | std::map<kgram, std::map<token, token_data> > tstats; |
343 | len = (maxK-1) * tokens.size(); | ||
344 | per = 0; | ||
345 | perprime = 0; | ||
327 | for (int k=1; k<maxK; k++) | 346 | for (int k=1; k<maxK; k++) |
328 | { | 347 | { |
329 | for (int i=0; i<(tokens.size() - k); i++) | 348 | for (int i=0; i<(tokens.size() - k); i++) |
330 | { | 349 | { |
350 | perprime = (((k-1)*tokens.size())+i) * 100 / len; | ||
351 | if (perprime != per) | ||
352 | { | ||
353 | per = perprime; | ||
354 | |||
355 | std::cout << "\b\b\b\b" << std::right; | ||
356 | std::cout.width(3); | ||
357 | std::cout << per << "%" << std::flush; | ||
358 | } | ||
359 | |||
331 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); | 360 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); |
332 | token f = tokens[i+k]; | 361 | token f = tokens[i+k]; |
333 | 362 | ||
@@ -371,11 +400,28 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
371 | } | 400 | } |
372 | } | 401 | } |
373 | } | 402 | } |
374 | 403 | ||
404 | std::cout << "\b\b\b\b100%" << std::endl; | ||
405 | |||
375 | // Condense the kgram distribution | 406 | // Condense the kgram distribution |
376 | std::cout << "Compiling kgram distributions..." << std::endl; | 407 | std::cout << "Compiling kgram distributions... 0%"; |
408 | len = tstats.size(); | ||
409 | per = 0; | ||
410 | perprime = 0; | ||
411 | int indicator = 0; | ||
377 | for (auto& it : tstats) | 412 | for (auto& it : tstats) |
378 | { | 413 | { |
414 | indicator++; | ||
415 | perprime = indicator * 100 / len; | ||
416 | if (per != perprime) | ||
417 | { | ||
418 | per = perprime; | ||
419 | |||
420 | std::cout << "\b\b\b\b" << std::right; | ||
421 | std::cout.width(3); | ||
422 | std::cout << per << "%" << std::flush; | ||
423 | } | ||
424 | |||
379 | kgram klist = it.first; | 425 | kgram klist = it.first; |
380 | auto& probtable = it.second; | 426 | auto& probtable = it.second; |
381 | auto& distribution = stats[klist]; | 427 | auto& distribution = stats[klist]; |
@@ -388,6 +434,8 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
388 | distribution.emplace(max, kt.second); | 434 | distribution.emplace(max, kt.second); |
389 | } | 435 | } |
390 | } | 436 | } |
437 | |||
438 | std::cout << "\b\b\b\b100%" << std::endl; | ||
391 | } | 439 | } |
392 | 440 | ||
393 | void printKgram(kgram k) | 441 | void printKgram(kgram k) |