about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-20 00:56:37 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-20 00:56:37 -0500
commit 1ed82c3071218d759cae6966fa6e9dbf47d38003 (patch)
tree ee989fbc01cc4b9b69e7112ef44c6921a83f4f76
parent b090f6db27534d0bd0bbfaf068efada7b30aa5ac (diff)
download rawr-ebooks-1ed82c3071218d759cae6966fa6e9dbf47d38003.tar.gz
rawr-ebooks-1ed82c3071218d759cae6966fa6e9dbf47d38003.tar.bz2
rawr-ebooks-1ed82c3071218d759cae6966fa6e9dbf47d38003.zip
Added percentage display to preprocessing stage
-rw-r--r-- kgramstats.cpp 56
1 files changed, 52 insertions, 4 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 8a21d60..07f41f6 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -91,9 +91,23 @@ kgramstats::kgramstats(std::string corpus, int maxK)
91 emoji_file.close(); 91 emoji_file.close();
92 } 92 }
93 93
94 std::cout << "Tokenizing corpus..." << std::endl; 94 std::cout << "Tokenizing corpus... 0%" << std::flush;
95 int len = corpus.length();
96 int per = 0;
97 int perprime = 0;
98 std::cout.fill(' ');
95 while (end != std::string::npos) 99 while (end != std::string::npos)
96 { 100 {
101 perprime = end * 100 / len;
102 if (perprime != per)
103 {
104 per = perprime;
105
106 std::cout << "\b\b\b\b" << std::right;
107 std::cout.width(3);
108 std::cout << per << "%" << std::flush;
109 }
110
97 end = corpus.find(" ", start); 111 end = corpus.find(" ", start);
98 112
99 bool emoji = false; 113 bool emoji = false;
@@ -294,6 +308,8 @@ kgramstats::kgramstats(std::string corpus, int maxK)
294 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 308 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
295 } 309 }
296 310
311 std::cout << "\b\b\b\b100%" << std::endl;
312
297 delete_aspell_speller(spell_checker); 313 delete_aspell_speller(spell_checker);
298 delete_aspell_config(spell_config); 314 delete_aspell_config(spell_config);
299 315
@@ -322,12 +338,25 @@ kgramstats::kgramstats(std::string corpus, int maxK)
322 emoticons.terms.compile(); 338 emoticons.terms.compile();
323 339
324 // kgram distribution 340 // kgram distribution
325 std::cout << "Creating markov chain..." << std::endl; 341 std::cout << "Creating markov chain... 0%" << std::flush;
326 std::map<kgram, std::map<token, token_data> > tstats; 342 std::map<kgram, std::map<token, token_data> > tstats;
343 len = (maxK-1) * tokens.size();
344 per = 0;
345 perprime = 0;
327 for (int k=1; k<maxK; k++) 346 for (int k=1; k<maxK; k++)
328 { 347 {
329 for (int i=0; i<(tokens.size() - k); i++) 348 for (int i=0; i<(tokens.size() - k); i++)
330 { 349 {
350 perprime = (((k-1)*tokens.size())+i) * 100 / len;
351 if (perprime != per)
352 {
353 per = perprime;
354
355 std::cout << "\b\b\b\b" << std::right;
356 std::cout.width(3);
357 std::cout << per << "%" << std::flush;
358 }
359
331 kgram prefix(tokens.begin()+i, tokens.begin()+i+k); 360 kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
332 token f = tokens[i+k]; 361 token f = tokens[i+k];
333 362
@@ -371,11 +400,28 @@ kgramstats::kgramstats(std::string corpus, int maxK)
371 } 400 }
372 } 401 }
373 } 402 }
374 403
404 std::cout << "\b\b\b\b100%" << std::endl;
405
375 // Condense the kgram distribution 406 // Condense the kgram distribution
376 std::cout << "Compiling kgram distributions..." << std::endl; 407 std::cout << "Compiling kgram distributions... 0%";
408 len = tstats.size();
409 per = 0;
410 perprime = 0;
411 int indicator = 0;
377 for (auto& it : tstats) 412 for (auto& it : tstats)
378 { 413 {
414 indicator++;
415 perprime = indicator * 100 / len;
416 if (per != perprime)
417 {
418 per = perprime;
419
420 std::cout << "\b\b\b\b" << std::right;
421 std::cout.width(3);
422 std::cout << per << "%" << std::flush;
423 }
424
379 kgram klist = it.first; 425 kgram klist = it.first;
380 auto& probtable = it.second; 426 auto& probtable = it.second;
381 auto& distribution = stats[klist]; 427 auto& distribution = stats[klist];
@@ -388,6 +434,8 @@ kgramstats::kgramstats(std::string corpus, int maxK)
388 distribution.emplace(max, kt.second); 434 distribution.emplace(max, kt.second);
389 } 435 }
390 } 436 }
437
438 std::cout << "\b\b\b\b100%" << std::endl;
391} 439}
392 440
393void printKgram(kgram k) 441void printKgram(kgram k)