diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-31 21:39:39 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-31 21:39:39 -0400 |
commit | 5ce05b81520d06a78165c5c5039007c9f29d4b23 (patch) | |
tree | 118eb3d689a0dfe2581b4a941efa242708dda31c | |
parent | ae60f5f679da06b3824accdd14482189fec9dc85 (diff) | |
download | rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.tar.gz rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.tar.bz2 rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.zip |
Added ability to require a minimum number of corpora in generated output
Also fixed a bug with tokenizing multiple corpora.
-rw-r--r-- | kgramstats.cpp | 134 | ||||
-rw-r--r-- | kgramstats.h | 4 |
2 files changed, 91 insertions, 47 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index e0c2eac..cb63db6 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -56,8 +56,7 @@ void rawr::compile(int maxK) | |||
56 | { | 56 | { |
57 | _maxK = maxK; | 57 | _maxK = maxK; |
58 | 58 | ||
59 | std::vector<token> tokens; | 59 | std::vector<std::vector<token>> tokens; |
60 | size_t start = 0; | ||
61 | std::set<std::string> thashtags; | 60 | std::set<std::string> thashtags; |
62 | std::set<std::string> fv_emoticons; | 61 | std::set<std::string> fv_emoticons; |
63 | 62 | ||
@@ -119,7 +118,9 @@ void rawr::compile(int maxK) | |||
119 | std::cout.fill(' '); | 118 | std::cout.fill(' '); |
120 | for (int i = 0; i < _corpora.size(); i++) | 119 | for (int i = 0; i < _corpora.size(); i++) |
121 | { | 120 | { |
121 | size_t start = 0; | ||
122 | int end = 0; | 122 | int end = 0; |
123 | std::vector<token> tkcor; | ||
123 | 124 | ||
124 | while (end != std::string::npos) | 125 | while (end != std::string::npos) |
125 | { | 126 | { |
@@ -331,12 +332,14 @@ void rawr::compile(int maxK) | |||
331 | } | 332 | } |
332 | } | 333 | } |
333 | 334 | ||
334 | tokens.push_back(tk); | 335 | tkcor.push_back(tk); |
335 | } | 336 | } |
336 | 337 | ||
337 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 338 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
338 | } | 339 | } |
339 | 340 | ||
341 | tokens.push_back(tkcor); | ||
342 | |||
340 | startper += _corpora[i].length(); | 343 | startper += _corpora[i].length(); |
341 | } | 344 | } |
342 | 345 | ||
@@ -372,65 +375,82 @@ void rawr::compile(int maxK) | |||
372 | // kgram distribution | 375 | // kgram distribution |
373 | std::cout << "Creating markov chain... 0%" << std::flush; | 376 | std::cout << "Creating markov chain... 0%" << std::flush; |
374 | std::map<kgram, std::map<token, token_data> > tstats; | 377 | std::map<kgram, std::map<token, token_data> > tstats; |
375 | len = (maxK-1) * tokens.size(); | 378 | |
379 | len = 0; | ||
380 | for (auto c : tokens) | ||
381 | { | ||
382 | len += (maxK-1) * c.size(); | ||
383 | } | ||
384 | |||
385 | startper = 0; | ||
376 | per = 0; | 386 | per = 0; |
377 | perprime = 0; | 387 | perprime = 0; |
378 | for (int k=1; k<maxK; k++) | 388 | int corpid = 0; |
389 | for (auto corpus : tokens) | ||
379 | { | 390 | { |
380 | for (int i=0; i<(tokens.size() - k); i++) | 391 | for (int k=1; k<maxK; k++) |
381 | { | 392 | { |
382 | perprime = (((k-1)*tokens.size())+i) * 100 / len; | 393 | for (int i=0; i<(corpus.size() - k); i++) |
383 | if (perprime != per) | ||
384 | { | 394 | { |
385 | per = perprime; | 395 | perprime = (startper+i) * 100 / len; |
396 | if (perprime != per) | ||
397 | { | ||
398 | per = perprime; | ||
386 | 399 | ||
387 | std::cout << "\b\b\b\b" << std::right; | 400 | std::cout << "\b\b\b\b" << std::right; |
388 | std::cout.width(3); | 401 | std::cout.width(3); |
389 | std::cout << per << "%" << std::flush; | 402 | std::cout << per << "%" << std::flush; |
390 | } | 403 | } |
391 | 404 | ||
392 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); | 405 | kgram prefix(corpus.begin()+i, corpus.begin()+i+k); |
393 | token f = tokens[i+k]; | 406 | token f = corpus[i+k]; |
394 | 407 | ||
395 | if (tstats[prefix].count(f) == 0) | 408 | if (tstats[prefix].count(f) == 0) |
396 | { | ||
397 | tstats[prefix].emplace(f, f); | ||
398 | } | ||
399 | |||
400 | token_data& td = tstats[prefix].at(f); | ||
401 | td.all++; | ||
402 | |||
403 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | ||
404 | { | ||
405 | td.uppercase++; | ||
406 | } else if (isupper(f.raw[0])) | ||
407 | { | ||
408 | td.titlecase++; | ||
409 | } | ||
410 | |||
411 | if (std::begin(prefix)->tok.suffix == suffixtype::terminating) | ||
412 | { | ||
413 | kgram term_prefix(prefix); | ||
414 | term_prefix.pop_front(); | ||
415 | term_prefix.push_front(wildcardQuery); | ||
416 | |||
417 | if (tstats[term_prefix].count(f) == 0) | ||
418 | { | 409 | { |
419 | tstats[term_prefix].emplace(f, f); | 410 | tstats[prefix].emplace(f, f); |
420 | } | 411 | } |
421 | 412 | ||
422 | token_data& td2 = tstats[term_prefix].at(f); | 413 | token_data& td = tstats[prefix].at(f); |
423 | td2.all++; | 414 | td.all++; |
415 | td.corpora.insert(corpid); | ||
424 | 416 | ||
425 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | 417 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
426 | { | 418 | { |
427 | td2.uppercase++; | 419 | td.uppercase++; |
428 | } else if (isupper(f.raw[0])) | 420 | } else if (isupper(f.raw[0])) |
429 | { | 421 | { |
430 | td2.titlecase++; | 422 | td.titlecase++; |
423 | } | ||
424 | |||
425 | if (std::begin(prefix)->tok.suffix == suffixtype::terminating) | ||
426 | { | ||
427 | kgram term_prefix(prefix); | ||
428 | term_prefix.pop_front(); | ||
429 | term_prefix.push_front(wildcardQuery); | ||
430 | |||
431 | if (tstats[term_prefix].count(f) == 0) | ||
432 | { | ||
433 | tstats[term_prefix].emplace(f, f); | ||
434 | } | ||
435 | |||
436 | token_data& td2 = tstats[term_prefix].at(f); | ||
437 | td2.all++; | ||
438 | td2.corpora.insert(corpid); | ||
439 | |||
440 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | ||
441 | { | ||
442 | td2.uppercase++; | ||
443 | } else if (isupper(f.raw[0])) | ||
444 | { | ||
445 | td2.titlecase++; | ||
446 | } | ||
431 | } | 447 | } |
432 | } | 448 | } |
449 | |||
450 | startper += corpus.size(); | ||
433 | } | 451 | } |
452 | |||
453 | corpid++; | ||
434 | } | 454 | } |
435 | 455 | ||
436 | std::cout << "\b\b\b\b100%" << std::endl; | 456 | std::cout << "\b\b\b\b100%" << std::endl; |
@@ -527,6 +547,11 @@ void rawr::setTransformCallback(transform_callback _arg) | |||
527 | _transform = _arg; | 547 | _transform = _arg; |
528 | } | 548 | } |
529 | 549 | ||
550 | void rawr::setMinCorpora(int _arg) | ||
551 | { | ||
552 | _min_corpora = _arg; | ||
553 | } | ||
554 | |||
530 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 555 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
531 | std::string rawr::randomSentence(int maxL) | 556 | std::string rawr::randomSentence(int maxL) |
532 | { | 557 | { |
@@ -539,6 +564,7 @@ std::string rawr::randomSentence(int maxL) | |||
539 | kgram cur(1, wildcardQuery); | 564 | kgram cur(1, wildcardQuery); |
540 | int cuts = 0; | 565 | int cuts = 0; |
541 | std::stack<parentype> open_delimiters; | 566 | std::stack<parentype> open_delimiters; |
567 | std::set<int> used_corpora; | ||
542 | 568 | ||
543 | for (;;) | 569 | for (;;) |
544 | { | 570 | { |
@@ -690,9 +716,19 @@ std::string rawr::randomSentence(int maxL) | |||
690 | { | 716 | { |
691 | cuts++; | 717 | cuts++; |
692 | } | 718 | } |
719 | |||
720 | if (next.corpora.size() == 1) | ||
721 | { | ||
722 | used_corpora.insert(*next.corpora.begin()); | ||
723 | } | ||
693 | 724 | ||
694 | /* DEBUG */ | 725 | /* DEBUG */ |
695 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | 726 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << " in corp"; |
727 | for (auto cor : next.corpora) | ||
728 | { | ||
729 | std::cout << " " << cor; | ||
730 | } | ||
731 | std::cout << std::endl; | ||
696 | 732 | ||
697 | cur.push_back(next.tok); | 733 | cur.push_back(next.tok); |
698 | result.append(nextToken); | 734 | result.append(nextToken); |
@@ -703,6 +739,12 @@ std::string rawr::randomSentence(int maxL) | |||
703 | } | 739 | } |
704 | } | 740 | } |
705 | 741 | ||
742 | // Ensure that enough corpora are used | ||
743 | if (used_corpora.size() < _min_corpora) | ||
744 | { | ||
745 | return randomSentence(maxL); | ||
746 | } | ||
747 | |||
706 | // Remove the trailing space | 748 | // Remove the trailing space |
707 | if (result.back() == ' ') | 749 | if (result.back() == ' ') |
708 | { | 750 | { |
@@ -722,8 +764,6 @@ std::string rawr::randomSentence(int maxL) | |||
722 | 764 | ||
723 | open_delimiters.pop(); | 765 | open_delimiters.pop(); |
724 | } | 766 | } |
725 | |||
726 | result.resize(maxL); | ||
727 | 767 | ||
728 | return result; | 768 | return result; |
729 | } | 769 | } |
diff --git a/kgramstats.h b/kgramstats.h index fc01101..d939ade 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <vector> | 7 | #include <vector> |
8 | #include "histogram.h" | 8 | #include "histogram.h" |
9 | #include <functional> | 9 | #include <functional> |
10 | #include <set> | ||
10 | 11 | ||
11 | class rawr { | 12 | class rawr { |
12 | public: | 13 | public: |
@@ -16,6 +17,7 @@ class rawr { | |||
16 | void compile(int maxK); | 17 | void compile(int maxK); |
17 | 18 | ||
18 | void setTransformCallback(transform_callback _arg); | 19 | void setTransformCallback(transform_callback _arg); |
20 | void setMinCorpora(int _arg); | ||
19 | std::string randomSentence(int maxL); | 21 | std::string randomSentence(int maxL); |
20 | 22 | ||
21 | private: | 23 | private: |
@@ -125,6 +127,7 @@ class rawr { | |||
125 | int titlecase; | 127 | int titlecase; |
126 | int uppercase; | 128 | int uppercase; |
127 | token tok; | 129 | token tok; |
130 | std::set<int> corpora; | ||
128 | 131 | ||
129 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | 132 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
130 | }; | 133 | }; |
@@ -139,6 +142,7 @@ class rawr { | |||
139 | std::vector<std::string> _corpora; | 142 | std::vector<std::string> _corpora; |
140 | std::map<kgram, std::map<int, token_data>> _stats; | 143 | std::map<kgram, std::map<int, token_data>> _stats; |
141 | transform_callback _transform; | 144 | transform_callback _transform; |
145 | int _min_corpora = 1; | ||
142 | 146 | ||
143 | // Words | 147 | // Words |
144 | std::map<std::string, word> words; | 148 | std::map<std::string, word> words; |