about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author:    Kelly Rauchenberger <fefferburbia@gmail.com>  2018-08-26 22:13:50 -0400
committer: Kelly Rauchenberger <fefferburbia@gmail.com>  2018-08-26 22:13:50 -0400
commit:    d75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch)
tree:      013285ad6ff9c7d2c2c3174eef99b89485917756 /kgramstats.cpp
parent:    26d75f744913a8856e46f5fccbfda8f8336924a0 (diff)
download:  rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz
           rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2
           rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip
Interned tokens to reduce memory footprint
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r--  kgramstats.cpp  95
1 file changed, 60 insertions(+), 35 deletions(-)
diff --git a/kgramstats.cpp b/kgramstats.cpp
index c674e80..30d4407 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus)
55void rawr::compile(int maxK) 55void rawr::compile(int maxK)
56{ 56{
57 _maxK = maxK; 57 _maxK = maxK;
58 58
59 std::vector<std::vector<token>> tokens; 59 std::vector<std::vector<token_id>> tokens;
60 std::set<std::string> thashtags; 60 std::set<std::string> thashtags;
61 std::set<std::string> fv_emoticons; 61 std::set<std::string> fv_emoticons;
62 62
@@ -120,8 +120,8 @@ void rawr::compile(int maxK)
120 { 120 {
121 size_t start = 0; 121 size_t start = 0;
122 int end = 0; 122 int end = 0;
123 std::vector<token> tkcor; 123 std::vector<token_id> tkcor;
124 124
125 while (end != std::string::npos) 125 while (end != std::string::npos)
126 { 126 {
127 perprime = (startper + end) * 100 / len; 127 perprime = (startper + end) * 100 / len;
@@ -336,8 +336,8 @@ void rawr::compile(int maxK)
336 } 336 }
337 } 337 }
338 } 338 }
339 339
340 tkcor.push_back(tk); 340 tkcor.push_back(_tokenstore.add(tk));
341 } 341 }
342 342
343 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 343 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
@@ -377,9 +377,12 @@ void rawr::compile(int maxK)
377 emoticons.forms.compile(); 377 emoticons.forms.compile();
378 emoticons.terms.compile(); 378 emoticons.terms.compile();
379 379
380 // Compile the interned tokens.
381 _tokenstore.compile();
382
380 // kgram distribution 383 // kgram distribution
381 std::cout << "Creating markov chain... 0%" << std::flush; 384 std::cout << "Creating markov chain... 0%" << std::flush;
382 std::map<kgram, std::map<token, token_data> > tstats; 385 std::map<kgram, std::map<token_id, token_data> > tstats;
383 386
384 len = 0; 387 len = 0;
385 for (auto c : tokens) 388 for (auto c : tokens)
@@ -408,14 +411,15 @@ void rawr::compile(int maxK)
408 } 411 }
409 412
410 kgram prefix(corpus.begin()+i, corpus.begin()+i+k); 413 kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
411 token f = corpus[i+k]; 414 token_id fid = corpus[i+k];
415 const token& f = _tokenstore.get(fid);
412 416
413 if (tstats[prefix].count(f) == 0) 417 if (tstats[prefix].count(fid) == 0)
414 { 418 {
415 tstats[prefix].emplace(f, f); 419 tstats[prefix].emplace(fid, fid);
416 } 420 }
417 421
418 token_data& td = tstats[prefix].at(f); 422 token_data& td = tstats[prefix].at(fid);
419 td.all++; 423 td.all++;
420 td.corpora.insert(corpid); 424 td.corpora.insert(corpid);
421 425
@@ -426,19 +430,20 @@ void rawr::compile(int maxK)
426 { 430 {
427 td.titlecase++; 431 td.titlecase++;
428 } 432 }
429 433
430 if (std::begin(prefix)->tok.suffix == suffixtype::terminating) 434 const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
435 if (startTok.suffix == suffixtype::terminating)
431 { 436 {
432 kgram term_prefix(prefix); 437 kgram term_prefix(prefix);
433 term_prefix.pop_front(); 438 term_prefix.pop_front();
434 term_prefix.push_front(wildcardQuery); 439 term_prefix.push_front(wildcardQuery);
435 440
436 if (tstats[term_prefix].count(f) == 0) 441 if (tstats[term_prefix].count(fid) == 0)
437 { 442 {
438 tstats[term_prefix].emplace(f, f); 443 tstats[term_prefix].emplace(fid, fid);
439 } 444 }
440 445
441 token_data& td2 = tstats[term_prefix].at(f); 446 token_data& td2 = tstats[term_prefix].at(fid);
442 td2.all++; 447 td2.all++;
443 td2.corpora.insert(corpid); 448 td2.corpora.insert(corpid);
444 449
@@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const
600 int max = distribution.rbegin()->first; 605 int max = distribution.rbegin()->first;
601 int r = rand() % max; 606 int r = rand() % max;
602 const token_data& next = distribution.upper_bound(r)->second; 607 const token_data& next = distribution.upper_bound(r)->second;
603 std::string nextToken = next.tok.w.forms.next(); 608 const token& interned = _tokenstore.get(next.tok);
604 609 std::string nextToken = interned.w.forms.next();
610
605 // Apply user-specified transforms 611 // Apply user-specified transforms
606 if (_transform) 612 if (_transform)
607 { 613 {
608 nextToken = _transform(next.tok.w.canon, nextToken); 614 nextToken = _transform(interned.w.canon, nextToken);
609 } 615 }
610 616
611 // Determine the casing of the next token. We randomly make the token all 617 // Determine the casing of the next token. We randomly make the token all
@@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const
615 if (casing < next.uppercase) 621 if (casing < next.uppercase)
616 { 622 {
617 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 623 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
618 } else if ((((cur.rbegin()->type == querytype::sentence) 624 } else {
619 || ((cur.rbegin()->type == querytype::literal) 625 bool capitalize = false;
620 && (cur.rbegin()->tok.suffix == suffixtype::terminating))) 626
621 && (rand() % 2 > 0)) 627 if (casing - next.uppercase < next.titlecase)
622 || (casing - next.uppercase < next.titlecase)) 628 {
623 { 629 capitalize = true;
624 nextToken[0] = toupper(nextToken[0]); 630 } else if (cur.rbegin()->type == querytype::sentence)
631 {
632 if (rand() % 2 > 0)
633 {
634 capitalize = true;
635 }
636 } else {
637 const token& lastTok = _tokenstore.get(cur.rbegin()->tok);
638
639 if (lastTok.suffix == suffixtype::terminating &&
640 rand() % 2 > 0)
641 {
642 capitalize = true;
643 }
644 }
645
646 if (capitalize)
647 {
648 nextToken[0] = toupper(nextToken[0]);
649 }
625 } 650 }
626 651
627 // Delimiters 652 // Delimiters
628 for (auto& dt : next.tok.delimiters) 653 for (auto& dt : interned.delimiters)
629 { 654 {
630 if (dt.first.status == doublestatus::both) 655 if (dt.first.status == doublestatus::both)
631 { 656 {
@@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const
692 } 717 }
693 718
694 // Terminators 719 // Terminators
695 if (next.tok.suffix == suffixtype::terminating) 720 if (interned.suffix == suffixtype::terminating)
696 { 721 {
697 auto term = next.tok.w.terms.next(); 722 auto term = interned.w.terms.next();
698 nextToken.append(term.form); 723 nextToken.append(term.form);
699 724
700 if (term.newline) 725 if (term.newline)
@@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const
703 } else { 728 } else {
704 nextToken.append(" "); 729 nextToken.append(" ");
705 } 730 }
706 } else if (next.tok.suffix == suffixtype::comma) 731 } else if (interned.suffix == suffixtype::comma)
707 { 732 {
708 nextToken.append(", "); 733 nextToken.append(", ");
709 } else { 734 } else {
@@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const
734 759
735 cur.push_back(next.tok); 760 cur.push_back(next.tok);
736 result.append(nextToken); 761 result.append(nextToken);
737 762
738 if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) 763 if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
739 { 764 {
740 break; 765 break;
741 } 766 }