diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-08-26 22:13:50 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-08-26 22:13:50 -0400 |
commit | d75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch) | |
tree | 013285ad6ff9c7d2c2c3174eef99b89485917756 /kgramstats.cpp | |
parent | 26d75f744913a8856e46f5fccbfda8f8336924a0 (diff) | |
download | rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2 rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip |
Interned tokens to reduce memory footprint
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- | kgramstats.cpp | 95 |
1 file changed, 60 insertions, 35 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index c674e80..30d4407 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus) | |||
55 | void rawr::compile(int maxK) | 55 | void rawr::compile(int maxK) |
56 | { | 56 | { |
57 | _maxK = maxK; | 57 | _maxK = maxK; |
58 | 58 | ||
59 | std::vector<std::vector<token>> tokens; | 59 | std::vector<std::vector<token_id>> tokens; |
60 | std::set<std::string> thashtags; | 60 | std::set<std::string> thashtags; |
61 | std::set<std::string> fv_emoticons; | 61 | std::set<std::string> fv_emoticons; |
62 | 62 | ||
@@ -120,8 +120,8 @@ void rawr::compile(int maxK) | |||
120 | { | 120 | { |
121 | size_t start = 0; | 121 | size_t start = 0; |
122 | int end = 0; | 122 | int end = 0; |
123 | std::vector<token> tkcor; | 123 | std::vector<token_id> tkcor; |
124 | 124 | ||
125 | while (end != std::string::npos) | 125 | while (end != std::string::npos) |
126 | { | 126 | { |
127 | perprime = (startper + end) * 100 / len; | 127 | perprime = (startper + end) * 100 / len; |
@@ -336,8 +336,8 @@ void rawr::compile(int maxK) | |||
336 | } | 336 | } |
337 | } | 337 | } |
338 | } | 338 | } |
339 | 339 | ||
340 | tkcor.push_back(tk); | 340 | tkcor.push_back(_tokenstore.add(tk)); |
341 | } | 341 | } |
342 | 342 | ||
343 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 343 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
@@ -377,9 +377,12 @@ void rawr::compile(int maxK) | |||
377 | emoticons.forms.compile(); | 377 | emoticons.forms.compile(); |
378 | emoticons.terms.compile(); | 378 | emoticons.terms.compile(); |
379 | 379 | ||
380 | // Compile the interned tokens. | ||
381 | _tokenstore.compile(); | ||
382 | |||
380 | // kgram distribution | 383 | // kgram distribution |
381 | std::cout << "Creating markov chain... 0%" << std::flush; | 384 | std::cout << "Creating markov chain... 0%" << std::flush; |
382 | std::map<kgram, std::map<token, token_data> > tstats; | 385 | std::map<kgram, std::map<token_id, token_data> > tstats; |
383 | 386 | ||
384 | len = 0; | 387 | len = 0; |
385 | for (auto c : tokens) | 388 | for (auto c : tokens) |
@@ -408,14 +411,15 @@ void rawr::compile(int maxK) | |||
408 | } | 411 | } |
409 | 412 | ||
410 | kgram prefix(corpus.begin()+i, corpus.begin()+i+k); | 413 | kgram prefix(corpus.begin()+i, corpus.begin()+i+k); |
411 | token f = corpus[i+k]; | 414 | token_id fid = corpus[i+k]; |
415 | const token& f = _tokenstore.get(fid); | ||
412 | 416 | ||
413 | if (tstats[prefix].count(f) == 0) | 417 | if (tstats[prefix].count(fid) == 0) |
414 | { | 418 | { |
415 | tstats[prefix].emplace(f, f); | 419 | tstats[prefix].emplace(fid, fid); |
416 | } | 420 | } |
417 | 421 | ||
418 | token_data& td = tstats[prefix].at(f); | 422 | token_data& td = tstats[prefix].at(fid); |
419 | td.all++; | 423 | td.all++; |
420 | td.corpora.insert(corpid); | 424 | td.corpora.insert(corpid); |
421 | 425 | ||
@@ -426,19 +430,20 @@ void rawr::compile(int maxK) | |||
426 | { | 430 | { |
427 | td.titlecase++; | 431 | td.titlecase++; |
428 | } | 432 | } |
429 | 433 | ||
430 | if (std::begin(prefix)->tok.suffix == suffixtype::terminating) | 434 | const token& startTok = _tokenstore.get(std::begin(prefix)->tok); |
435 | if (startTok.suffix == suffixtype::terminating) | ||
431 | { | 436 | { |
432 | kgram term_prefix(prefix); | 437 | kgram term_prefix(prefix); |
433 | term_prefix.pop_front(); | 438 | term_prefix.pop_front(); |
434 | term_prefix.push_front(wildcardQuery); | 439 | term_prefix.push_front(wildcardQuery); |
435 | 440 | ||
436 | if (tstats[term_prefix].count(f) == 0) | 441 | if (tstats[term_prefix].count(fid) == 0) |
437 | { | 442 | { |
438 | tstats[term_prefix].emplace(f, f); | 443 | tstats[term_prefix].emplace(fid, fid); |
439 | } | 444 | } |
440 | 445 | ||
441 | token_data& td2 = tstats[term_prefix].at(f); | 446 | token_data& td2 = tstats[term_prefix].at(fid); |
442 | td2.all++; | 447 | td2.all++; |
443 | td2.corpora.insert(corpid); | 448 | td2.corpora.insert(corpid); |
444 | 449 | ||
@@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const | |||
600 | int max = distribution.rbegin()->first; | 605 | int max = distribution.rbegin()->first; |
601 | int r = rand() % max; | 606 | int r = rand() % max; |
602 | const token_data& next = distribution.upper_bound(r)->second; | 607 | const token_data& next = distribution.upper_bound(r)->second; |
603 | std::string nextToken = next.tok.w.forms.next(); | 608 | const token& interned = _tokenstore.get(next.tok); |
604 | 609 | std::string nextToken = interned.w.forms.next(); | |
610 | |||
605 | // Apply user-specified transforms | 611 | // Apply user-specified transforms |
606 | if (_transform) | 612 | if (_transform) |
607 | { | 613 | { |
608 | nextToken = _transform(next.tok.w.canon, nextToken); | 614 | nextToken = _transform(interned.w.canon, nextToken); |
609 | } | 615 | } |
610 | 616 | ||
611 | // Determine the casing of the next token. We randomly make the token all | 617 | // Determine the casing of the next token. We randomly make the token all |
@@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const | |||
615 | if (casing < next.uppercase) | 621 | if (casing < next.uppercase) |
616 | { | 622 | { |
617 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 623 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); |
618 | } else if ((((cur.rbegin()->type == querytype::sentence) | 624 | } else { |
619 | || ((cur.rbegin()->type == querytype::literal) | 625 | bool capitalize = false; |
620 | && (cur.rbegin()->tok.suffix == suffixtype::terminating))) | 626 | |
621 | && (rand() % 2 > 0)) | 627 | if (casing - next.uppercase < next.titlecase) |
622 | || (casing - next.uppercase < next.titlecase)) | 628 | { |
623 | { | 629 | capitalize = true; |
624 | nextToken[0] = toupper(nextToken[0]); | 630 | } else if (cur.rbegin()->type == querytype::sentence) |
631 | { | ||
632 | if (rand() % 2 > 0) | ||
633 | { | ||
634 | capitalize = true; | ||
635 | } | ||
636 | } else { | ||
637 | const token& lastTok = _tokenstore.get(cur.rbegin()->tok); | ||
638 | |||
639 | if (lastTok.suffix == suffixtype::terminating && | ||
640 | rand() % 2 > 0) | ||
641 | { | ||
642 | capitalize = true; | ||
643 | } | ||
644 | } | ||
645 | |||
646 | if (capitalize) | ||
647 | { | ||
648 | nextToken[0] = toupper(nextToken[0]); | ||
649 | } | ||
625 | } | 650 | } |
626 | 651 | ||
627 | // Delimiters | 652 | // Delimiters |
628 | for (auto& dt : next.tok.delimiters) | 653 | for (auto& dt : interned.delimiters) |
629 | { | 654 | { |
630 | if (dt.first.status == doublestatus::both) | 655 | if (dt.first.status == doublestatus::both) |
631 | { | 656 | { |
@@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const | |||
692 | } | 717 | } |
693 | 718 | ||
694 | // Terminators | 719 | // Terminators |
695 | if (next.tok.suffix == suffixtype::terminating) | 720 | if (interned.suffix == suffixtype::terminating) |
696 | { | 721 | { |
697 | auto term = next.tok.w.terms.next(); | 722 | auto term = interned.w.terms.next(); |
698 | nextToken.append(term.form); | 723 | nextToken.append(term.form); |
699 | 724 | ||
700 | if (term.newline) | 725 | if (term.newline) |
@@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const | |||
703 | } else { | 728 | } else { |
704 | nextToken.append(" "); | 729 | nextToken.append(" "); |
705 | } | 730 | } |
706 | } else if (next.tok.suffix == suffixtype::comma) | 731 | } else if (interned.suffix == suffixtype::comma) |
707 | { | 732 | { |
708 | nextToken.append(", "); | 733 | nextToken.append(", "); |
709 | } else { | 734 | } else { |
@@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const | |||
734 | 759 | ||
735 | cur.push_back(next.tok); | 760 | cur.push_back(next.tok); |
736 | result.append(nextToken); | 761 | result.append(nextToken); |
737 | 762 | ||
738 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 763 | if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) |
739 | { | 764 | { |
740 | break; | 765 | break; |
741 | } | 766 | } |