diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-08-26 22:13:50 -0400 | 
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-08-26 22:13:50 -0400 | 
| commit | d75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch) | |
| tree | 013285ad6ff9c7d2c2c3174eef99b89485917756 /kgramstats.cpp | |
| parent | 26d75f744913a8856e46f5fccbfda8f8336924a0 (diff) | |
| download | rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2 rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip | |
Interned tokens to reduce memory footprint
Diffstat (limited to 'kgramstats.cpp')
| -rw-r--r-- | kgramstats.cpp | 95 | 
1 file changed, 60 insertions, 35 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index c674e80..30d4407 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus) | |||
| 55 | void rawr::compile(int maxK) | 55 | void rawr::compile(int maxK) | 
| 56 | { | 56 | { | 
| 57 | _maxK = maxK; | 57 | _maxK = maxK; | 
| 58 | 58 | ||
| 59 | std::vector<std::vector<token>> tokens; | 59 | std::vector<std::vector<token_id>> tokens; | 
| 60 | std::set<std::string> thashtags; | 60 | std::set<std::string> thashtags; | 
| 61 | std::set<std::string> fv_emoticons; | 61 | std::set<std::string> fv_emoticons; | 
| 62 | 62 | ||
| @@ -120,8 +120,8 @@ void rawr::compile(int maxK) | |||
| 120 | { | 120 | { | 
| 121 | size_t start = 0; | 121 | size_t start = 0; | 
| 122 | int end = 0; | 122 | int end = 0; | 
| 123 | std::vector<token> tkcor; | 123 | std::vector<token_id> tkcor; | 
| 124 | 124 | ||
| 125 | while (end != std::string::npos) | 125 | while (end != std::string::npos) | 
| 126 | { | 126 | { | 
| 127 | perprime = (startper + end) * 100 / len; | 127 | perprime = (startper + end) * 100 / len; | 
| @@ -336,8 +336,8 @@ void rawr::compile(int maxK) | |||
| 336 | } | 336 | } | 
| 337 | } | 337 | } | 
| 338 | } | 338 | } | 
| 339 | 339 | ||
| 340 | tkcor.push_back(tk); | 340 | tkcor.push_back(_tokenstore.add(tk)); | 
| 341 | } | 341 | } | 
| 342 | 342 | ||
| 343 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 343 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 
| @@ -377,9 +377,12 @@ void rawr::compile(int maxK) | |||
| 377 | emoticons.forms.compile(); | 377 | emoticons.forms.compile(); | 
| 378 | emoticons.terms.compile(); | 378 | emoticons.terms.compile(); | 
| 379 | 379 | ||
| 380 | // Compile the interned tokens. | ||
| 381 | _tokenstore.compile(); | ||
| 382 | |||
| 380 | // kgram distribution | 383 | // kgram distribution | 
| 381 | std::cout << "Creating markov chain... 0%" << std::flush; | 384 | std::cout << "Creating markov chain... 0%" << std::flush; | 
| 382 | std::map<kgram, std::map<token, token_data> > tstats; | 385 | std::map<kgram, std::map<token_id, token_data> > tstats; | 
| 383 | 386 | ||
| 384 | len = 0; | 387 | len = 0; | 
| 385 | for (auto c : tokens) | 388 | for (auto c : tokens) | 
| @@ -408,14 +411,15 @@ void rawr::compile(int maxK) | |||
| 408 | } | 411 | } | 
| 409 | 412 | ||
| 410 | kgram prefix(corpus.begin()+i, corpus.begin()+i+k); | 413 | kgram prefix(corpus.begin()+i, corpus.begin()+i+k); | 
| 411 | token f = corpus[i+k]; | 414 | token_id fid = corpus[i+k]; | 
| 415 | const token& f = _tokenstore.get(fid); | ||
| 412 | 416 | ||
| 413 | if (tstats[prefix].count(f) == 0) | 417 | if (tstats[prefix].count(fid) == 0) | 
| 414 | { | 418 | { | 
| 415 | tstats[prefix].emplace(f, f); | 419 | tstats[prefix].emplace(fid, fid); | 
| 416 | } | 420 | } | 
| 417 | 421 | ||
| 418 | token_data& td = tstats[prefix].at(f); | 422 | token_data& td = tstats[prefix].at(fid); | 
| 419 | td.all++; | 423 | td.all++; | 
| 420 | td.corpora.insert(corpid); | 424 | td.corpora.insert(corpid); | 
| 421 | 425 | ||
| @@ -426,19 +430,20 @@ void rawr::compile(int maxK) | |||
| 426 | { | 430 | { | 
| 427 | td.titlecase++; | 431 | td.titlecase++; | 
| 428 | } | 432 | } | 
| 429 | 433 | ||
| 430 | if (std::begin(prefix)->tok.suffix == suffixtype::terminating) | 434 | const token& startTok = _tokenstore.get(std::begin(prefix)->tok); | 
| 435 | if (startTok.suffix == suffixtype::terminating) | ||
| 431 | { | 436 | { | 
| 432 | kgram term_prefix(prefix); | 437 | kgram term_prefix(prefix); | 
| 433 | term_prefix.pop_front(); | 438 | term_prefix.pop_front(); | 
| 434 | term_prefix.push_front(wildcardQuery); | 439 | term_prefix.push_front(wildcardQuery); | 
| 435 | 440 | ||
| 436 | if (tstats[term_prefix].count(f) == 0) | 441 | if (tstats[term_prefix].count(fid) == 0) | 
| 437 | { | 442 | { | 
| 438 | tstats[term_prefix].emplace(f, f); | 443 | tstats[term_prefix].emplace(fid, fid); | 
| 439 | } | 444 | } | 
| 440 | 445 | ||
| 441 | token_data& td2 = tstats[term_prefix].at(f); | 446 | token_data& td2 = tstats[term_prefix].at(fid); | 
| 442 | td2.all++; | 447 | td2.all++; | 
| 443 | td2.corpora.insert(corpid); | 448 | td2.corpora.insert(corpid); | 
| 444 | 449 | ||
| @@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const | |||
| 600 | int max = distribution.rbegin()->first; | 605 | int max = distribution.rbegin()->first; | 
| 601 | int r = rand() % max; | 606 | int r = rand() % max; | 
| 602 | const token_data& next = distribution.upper_bound(r)->second; | 607 | const token_data& next = distribution.upper_bound(r)->second; | 
| 603 | std::string nextToken = next.tok.w.forms.next(); | 608 | const token& interned = _tokenstore.get(next.tok); | 
| 604 | 609 | std::string nextToken = interned.w.forms.next(); | |
| 610 | |||
| 605 | // Apply user-specified transforms | 611 | // Apply user-specified transforms | 
| 606 | if (_transform) | 612 | if (_transform) | 
| 607 | { | 613 | { | 
| 608 | nextToken = _transform(next.tok.w.canon, nextToken); | 614 | nextToken = _transform(interned.w.canon, nextToken); | 
| 609 | } | 615 | } | 
| 610 | 616 | ||
| 611 | // Determine the casing of the next token. We randomly make the token all | 617 | // Determine the casing of the next token. We randomly make the token all | 
| @@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const | |||
| 615 | if (casing < next.uppercase) | 621 | if (casing < next.uppercase) | 
| 616 | { | 622 | { | 
| 617 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 623 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 
| 618 | } else if ((((cur.rbegin()->type == querytype::sentence) | 624 | } else { | 
| 619 | || ((cur.rbegin()->type == querytype::literal) | 625 | bool capitalize = false; | 
| 620 | && (cur.rbegin()->tok.suffix == suffixtype::terminating))) | 626 | |
| 621 | && (rand() % 2 > 0)) | 627 | if (casing - next.uppercase < next.titlecase) | 
| 622 | || (casing - next.uppercase < next.titlecase)) | 628 | { | 
| 623 | { | 629 | capitalize = true; | 
| 624 | nextToken[0] = toupper(nextToken[0]); | 630 | } else if (cur.rbegin()->type == querytype::sentence) | 
| 631 | { | ||
| 632 | if (rand() % 2 > 0) | ||
| 633 | { | ||
| 634 | capitalize = true; | ||
| 635 | } | ||
| 636 | } else { | ||
| 637 | const token& lastTok = _tokenstore.get(cur.rbegin()->tok); | ||
| 638 | |||
| 639 | if (lastTok.suffix == suffixtype::terminating && | ||
| 640 | rand() % 2 > 0) | ||
| 641 | { | ||
| 642 | capitalize = true; | ||
| 643 | } | ||
| 644 | } | ||
| 645 | |||
| 646 | if (capitalize) | ||
| 647 | { | ||
| 648 | nextToken[0] = toupper(nextToken[0]); | ||
| 649 | } | ||
| 625 | } | 650 | } | 
| 626 | 651 | ||
| 627 | // Delimiters | 652 | // Delimiters | 
| 628 | for (auto& dt : next.tok.delimiters) | 653 | for (auto& dt : interned.delimiters) | 
| 629 | { | 654 | { | 
| 630 | if (dt.first.status == doublestatus::both) | 655 | if (dt.first.status == doublestatus::both) | 
| 631 | { | 656 | { | 
| @@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const | |||
| 692 | } | 717 | } | 
| 693 | 718 | ||
| 694 | // Terminators | 719 | // Terminators | 
| 695 | if (next.tok.suffix == suffixtype::terminating) | 720 | if (interned.suffix == suffixtype::terminating) | 
| 696 | { | 721 | { | 
| 697 | auto term = next.tok.w.terms.next(); | 722 | auto term = interned.w.terms.next(); | 
| 698 | nextToken.append(term.form); | 723 | nextToken.append(term.form); | 
| 699 | 724 | ||
| 700 | if (term.newline) | 725 | if (term.newline) | 
| @@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const | |||
| 703 | } else { | 728 | } else { | 
| 704 | nextToken.append(" "); | 729 | nextToken.append(" "); | 
| 705 | } | 730 | } | 
| 706 | } else if (next.tok.suffix == suffixtype::comma) | 731 | } else if (interned.suffix == suffixtype::comma) | 
| 707 | { | 732 | { | 
| 708 | nextToken.append(", "); | 733 | nextToken.append(", "); | 
| 709 | } else { | 734 | } else { | 
| @@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const | |||
| 734 | 759 | ||
| 735 | cur.push_back(next.tok); | 760 | cur.push_back(next.tok); | 
| 736 | result.append(nextToken); | 761 | result.append(nextToken); | 
| 737 | 762 | ||
| 738 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 763 | if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 
| 739 | { | 764 | { | 
| 740 | break; | 765 | break; | 
| 741 | } | 766 | } | 
