about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-31 21:39:39 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-31 21:39:39 -0400
commit 5ce05b81520d06a78165c5c5039007c9f29d4b23 (patch)
tree 118eb3d689a0dfe2581b4a941efa242708dda31c
parent ae60f5f679da06b3824accdd14482189fec9dc85 (diff)
download rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.tar.gz
rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.tar.bz2
rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.zip
Added ability to require a minimum number of corpora in generated output
Also fixed a bug with tokenizing multiple corpora.
-rw-r--r-- kgramstats.cpp 134
-rw-r--r-- kgramstats.h 4
2 files changed, 91 insertions, 47 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index e0c2eac..cb63db6 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -56,8 +56,7 @@ void rawr::compile(int maxK)
56{ 56{
57 _maxK = maxK; 57 _maxK = maxK;
58 58
59 std::vector<token> tokens; 59 std::vector<std::vector<token>> tokens;
60 size_t start = 0;
61 std::set<std::string> thashtags; 60 std::set<std::string> thashtags;
62 std::set<std::string> fv_emoticons; 61 std::set<std::string> fv_emoticons;
63 62
@@ -119,7 +118,9 @@ void rawr::compile(int maxK)
119 std::cout.fill(' '); 118 std::cout.fill(' ');
120 for (int i = 0; i < _corpora.size(); i++) 119 for (int i = 0; i < _corpora.size(); i++)
121 { 120 {
121 size_t start = 0;
122 int end = 0; 122 int end = 0;
123 std::vector<token> tkcor;
123 124
124 while (end != std::string::npos) 125 while (end != std::string::npos)
125 { 126 {
@@ -331,12 +332,14 @@ void rawr::compile(int maxK)
331 } 332 }
332 } 333 }
333 334
334 tokens.push_back(tk); 335 tkcor.push_back(tk);
335 } 336 }
336 337
337 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 338 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
338 } 339 }
339 340
341 tokens.push_back(tkcor);
342
340 startper += _corpora[i].length(); 343 startper += _corpora[i].length();
341 } 344 }
342 345
@@ -372,65 +375,82 @@ void rawr::compile(int maxK)
372 // kgram distribution 375 // kgram distribution
373 std::cout << "Creating markov chain... 0%" << std::flush; 376 std::cout << "Creating markov chain... 0%" << std::flush;
374 std::map<kgram, std::map<token, token_data> > tstats; 377 std::map<kgram, std::map<token, token_data> > tstats;
375 len = (maxK-1) * tokens.size(); 378
379 len = 0;
380 for (auto c : tokens)
381 {
382 len += (maxK-1) * c.size();
383 }
384
385 startper = 0;
376 per = 0; 386 per = 0;
377 perprime = 0; 387 perprime = 0;
378 for (int k=1; k<maxK; k++) 388 int corpid = 0;
389 for (auto corpus : tokens)
379 { 390 {
380 for (int i=0; i<(tokens.size() - k); i++) 391 for (int k=1; k<maxK; k++)
381 { 392 {
382 perprime = (((k-1)*tokens.size())+i) * 100 / len; 393 for (int i=0; i<(corpus.size() - k); i++)
383 if (perprime != per)
384 { 394 {
385 per = perprime; 395 perprime = (startper+i) * 100 / len;
396 if (perprime != per)
397 {
398 per = perprime;
386 399
387 std::cout << "\b\b\b\b" << std::right; 400 std::cout << "\b\b\b\b" << std::right;
388 std::cout.width(3); 401 std::cout.width(3);
389 std::cout << per << "%" << std::flush; 402 std::cout << per << "%" << std::flush;
390 } 403 }
391 404
392 kgram prefix(tokens.begin()+i, tokens.begin()+i+k); 405 kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
393 token f = tokens[i+k]; 406 token f = corpus[i+k];
394 407
395 if (tstats[prefix].count(f) == 0) 408 if (tstats[prefix].count(f) == 0)
396 {
397 tstats[prefix].emplace(f, f);
398 }
399
400 token_data& td = tstats[prefix].at(f);
401 td.all++;
402
403 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
404 {
405 td.uppercase++;
406 } else if (isupper(f.raw[0]))
407 {
408 td.titlecase++;
409 }
410
411 if (std::begin(prefix)->tok.suffix == suffixtype::terminating)
412 {
413 kgram term_prefix(prefix);
414 term_prefix.pop_front();
415 term_prefix.push_front(wildcardQuery);
416
417 if (tstats[term_prefix].count(f) == 0)
418 { 409 {
419 tstats[term_prefix].emplace(f, f); 410 tstats[prefix].emplace(f, f);
420 } 411 }
421 412
422 token_data& td2 = tstats[term_prefix].at(f); 413 token_data& td = tstats[prefix].at(f);
423 td2.all++; 414 td.all++;
415 td.corpora.insert(corpid);
424 416
425 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) 417 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
426 { 418 {
427 td2.uppercase++; 419 td.uppercase++;
428 } else if (isupper(f.raw[0])) 420 } else if (isupper(f.raw[0]))
429 { 421 {
430 td2.titlecase++; 422 td.titlecase++;
423 }
424
425 if (std::begin(prefix)->tok.suffix == suffixtype::terminating)
426 {
427 kgram term_prefix(prefix);
428 term_prefix.pop_front();
429 term_prefix.push_front(wildcardQuery);
430
431 if (tstats[term_prefix].count(f) == 0)
432 {
433 tstats[term_prefix].emplace(f, f);
434 }
435
436 token_data& td2 = tstats[term_prefix].at(f);
437 td2.all++;
438 td2.corpora.insert(corpid);
439
440 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
441 {
442 td2.uppercase++;
443 } else if (isupper(f.raw[0]))
444 {
445 td2.titlecase++;
446 }
431 } 447 }
432 } 448 }
449
450 startper += corpus.size();
433 } 451 }
452
453 corpid++;
434 } 454 }
435 455
436 std::cout << "\b\b\b\b100%" << std::endl; 456 std::cout << "\b\b\b\b100%" << std::endl;
@@ -527,6 +547,11 @@ void rawr::setTransformCallback(transform_callback _arg)
527 _transform = _arg; 547 _transform = _arg;
528} 548}
529 549
550void rawr::setMinCorpora(int _arg)
551{
552 _min_corpora = _arg;
553}
554
530// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 555// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
531std::string rawr::randomSentence(int maxL) 556std::string rawr::randomSentence(int maxL)
532{ 557{
@@ -539,6 +564,7 @@ std::string rawr::randomSentence(int maxL)
539 kgram cur(1, wildcardQuery); 564 kgram cur(1, wildcardQuery);
540 int cuts = 0; 565 int cuts = 0;
541 std::stack<parentype> open_delimiters; 566 std::stack<parentype> open_delimiters;
567 std::set<int> used_corpora;
542 568
543 for (;;) 569 for (;;)
544 { 570 {
@@ -690,9 +716,19 @@ std::string rawr::randomSentence(int maxL)
690 { 716 {
691 cuts++; 717 cuts++;
692 } 718 }
719
720 if (next.corpora.size() == 1)
721 {
722 used_corpora.insert(*next.corpora.begin());
723 }
693 724
694 /* DEBUG */ 725 /* DEBUG */
695 std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; 726 std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << " in corp";
727 for (auto cor : next.corpora)
728 {
729 std::cout << " " << cor;
730 }
731 std::cout << std::endl;
696 732
697 cur.push_back(next.tok); 733 cur.push_back(next.tok);
698 result.append(nextToken); 734 result.append(nextToken);
@@ -703,6 +739,12 @@ std::string rawr::randomSentence(int maxL)
703 } 739 }
704 } 740 }
705 741
742 // Ensure that enough corpora are used
743 if (used_corpora.size() < _min_corpora)
744 {
745 return randomSentence(maxL);
746 }
747
706 // Remove the trailing space 748 // Remove the trailing space
707 if (result.back() == ' ') 749 if (result.back() == ' ')
708 { 750 {
@@ -722,8 +764,6 @@ std::string rawr::randomSentence(int maxL)
722 764
723 open_delimiters.pop(); 765 open_delimiters.pop();
724 } 766 }
725
726 result.resize(maxL);
727 767
728 return result; 768 return result;
729} 769}
diff --git a/kgramstats.h b/kgramstats.h
index fc01101..d939ade 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -7,6 +7,7 @@
7#include <vector> 7#include <vector>
8#include "histogram.h" 8#include "histogram.h"
9#include <functional> 9#include <functional>
10#include <set>
10 11
11class rawr { 12class rawr {
12 public: 13 public:
@@ -16,6 +17,7 @@ class rawr {
16 void compile(int maxK); 17 void compile(int maxK);
17 18
18 void setTransformCallback(transform_callback _arg); 19 void setTransformCallback(transform_callback _arg);
20 void setMinCorpora(int _arg);
19 std::string randomSentence(int maxL); 21 std::string randomSentence(int maxL);
20 22
21 private: 23 private:
@@ -125,6 +127,7 @@ class rawr {
125 int titlecase; 127 int titlecase;
126 int uppercase; 128 int uppercase;
127 token tok; 129 token tok;
130 std::set<int> corpora;
128 131
129 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} 132 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
130 }; 133 };
@@ -139,6 +142,7 @@ class rawr {
139 std::vector<std::string> _corpora; 142 std::vector<std::string> _corpora;
140 std::map<kgram, std::map<int, token_data>> _stats; 143 std::map<kgram, std::map<int, token_data>> _stats;
141 transform_callback _transform; 144 transform_callback _transform;
145 int _min_corpora = 1;
142 146
143 // Words 147 // Words
144 std::map<std::string, word> words; 148 std::map<std::string, word> words;