about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorKelly Rauchenberger <fefferburbia@gmail.com>2018-08-26 22:13:50 -0400
committerKelly Rauchenberger <fefferburbia@gmail.com>2018-08-26 22:13:50 -0400
commitd75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch)
tree013285ad6ff9c7d2c2c3174eef99b89485917756
parent26d75f744913a8856e46f5fccbfda8f8336924a0 (diff)
downloadrawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz
rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2
rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip
Interned tokens to reduce memory footprint
-rw-r--r--identifier.h59
-rw-r--r--kgramstats.cpp95
-rw-r--r--kgramstats.h23
3 files changed, 133 insertions, 44 deletions
diff --git a/identifier.h b/identifier.h new file mode 100644 index 0000000..74d83ce --- /dev/null +++ b/identifier.h
@@ -0,0 +1,59 @@
#ifndef IDENTIFIER_H_D7EE5679
#define IDENTIFIER_H_D7EE5679

#include <map>
#include <vector>

/// Interns values of type T, mapping each distinct value to a small
/// integral key. Keys are assigned densely starting at 0 in order of
/// first insertion, so they can index directly into external arrays.
template <typename T>
class identifier {
public:

  using value_type = T;

private:

  using vector_type = std::vector<value_type>;

public:

  using key_type = typename vector_type::size_type;

  /// Returns the key for val, interning it first if it has not been
  /// seen before. Equal values always receive the same key.
  ///
  /// Must not be called after compile(): the lookup table is gone by
  /// then, so keys would restart at values that collide with existing
  /// entries.
  key_type add(const value_type& val)
  {
    // Single lookup: emplace finds the existing entry or inserts the
    // next dense key in one traversal (the original did find + []).
    auto result = ids_.emplace(val, uniq_.size());

    if (result.second)
    {
      uniq_.push_back(val);
    }

    return result.first->second;
  }

  /// Frees the value-to-key lookup table once interning is finished.
  /// get() and size() remain valid; add() does not (see above).
  void compile()
  {
    ids_.clear();
  }

  /// Returns the value interned under key i.
  /// Throws std::out_of_range if i was never assigned.
  inline const value_type& get(key_type i) const
  {
    return uniq_.at(i);
  }

  /// Number of distinct values interned so far.
  inline key_type size() const
  {
    return uniq_.size();
  }

private:

  std::map<value_type, key_type> ids_;  // value -> key (dropped by compile())
  vector_type uniq_;                    // key -> value, in insertion order
};

#endif /* end of include guard: IDENTIFIER_H_D7EE5679 */
diff --git a/kgramstats.cpp b/kgramstats.cpp index c674e80..30d4407 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus)
55void rawr::compile(int maxK) 55void rawr::compile(int maxK)
56{ 56{
57 _maxK = maxK; 57 _maxK = maxK;
58 58
59 std::vector<std::vector<token>> tokens; 59 std::vector<std::vector<token_id>> tokens;
60 std::set<std::string> thashtags; 60 std::set<std::string> thashtags;
61 std::set<std::string> fv_emoticons; 61 std::set<std::string> fv_emoticons;
62 62
@@ -120,8 +120,8 @@ void rawr::compile(int maxK)
120 { 120 {
121 size_t start = 0; 121 size_t start = 0;
122 int end = 0; 122 int end = 0;
123 std::vector<token> tkcor; 123 std::vector<token_id> tkcor;
124 124
125 while (end != std::string::npos) 125 while (end != std::string::npos)
126 { 126 {
127 perprime = (startper + end) * 100 / len; 127 perprime = (startper + end) * 100 / len;
@@ -336,8 +336,8 @@ void rawr::compile(int maxK)
336 } 336 }
337 } 337 }
338 } 338 }
339 339
340 tkcor.push_back(tk); 340 tkcor.push_back(_tokenstore.add(tk));
341 } 341 }
342 342
343 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 343 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
@@ -377,9 +377,12 @@ void rawr::compile(int maxK)
377 emoticons.forms.compile(); 377 emoticons.forms.compile();
378 emoticons.terms.compile(); 378 emoticons.terms.compile();
379 379
380 // Compile the interned tokens.
381 _tokenstore.compile();
382
380 // kgram distribution 383 // kgram distribution
381 std::cout << "Creating markov chain... 0%" << std::flush; 384 std::cout << "Creating markov chain... 0%" << std::flush;
382 std::map<kgram, std::map<token, token_data> > tstats; 385 std::map<kgram, std::map<token_id, token_data> > tstats;
383 386
384 len = 0; 387 len = 0;
385 for (auto c : tokens) 388 for (auto c : tokens)
@@ -408,14 +411,15 @@ void rawr::compile(int maxK)
408 } 411 }
409 412
410 kgram prefix(corpus.begin()+i, corpus.begin()+i+k); 413 kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
411 token f = corpus[i+k]; 414 token_id fid = corpus[i+k];
415 const token& f = _tokenstore.get(fid);
412 416
413 if (tstats[prefix].count(f) == 0) 417 if (tstats[prefix].count(fid) == 0)
414 { 418 {
415 tstats[prefix].emplace(f, f); 419 tstats[prefix].emplace(fid, fid);
416 } 420 }
417 421
418 token_data& td = tstats[prefix].at(f); 422 token_data& td = tstats[prefix].at(fid);
419 td.all++; 423 td.all++;
420 td.corpora.insert(corpid); 424 td.corpora.insert(corpid);
421 425
@@ -426,19 +430,20 @@ void rawr::compile(int maxK)
426 { 430 {
427 td.titlecase++; 431 td.titlecase++;
428 } 432 }
429 433
430 if (std::begin(prefix)->tok.suffix == suffixtype::terminating) 434 const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
435 if (startTok.suffix == suffixtype::terminating)
431 { 436 {
432 kgram term_prefix(prefix); 437 kgram term_prefix(prefix);
433 term_prefix.pop_front(); 438 term_prefix.pop_front();
434 term_prefix.push_front(wildcardQuery); 439 term_prefix.push_front(wildcardQuery);
435 440
436 if (tstats[term_prefix].count(f) == 0) 441 if (tstats[term_prefix].count(fid) == 0)
437 { 442 {
438 tstats[term_prefix].emplace(f, f); 443 tstats[term_prefix].emplace(fid, fid);
439 } 444 }
440 445
441 token_data& td2 = tstats[term_prefix].at(f); 446 token_data& td2 = tstats[term_prefix].at(fid);
442 td2.all++; 447 td2.all++;
443 td2.corpora.insert(corpid); 448 td2.corpora.insert(corpid);
444 449
@@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const
600 int max = distribution.rbegin()->first; 605 int max = distribution.rbegin()->first;
601 int r = rand() % max; 606 int r = rand() % max;
602 const token_data& next = distribution.upper_bound(r)->second; 607 const token_data& next = distribution.upper_bound(r)->second;
603 std::string nextToken = next.tok.w.forms.next(); 608 const token& interned = _tokenstore.get(next.tok);
604 609 std::string nextToken = interned.w.forms.next();
610
605 // Apply user-specified transforms 611 // Apply user-specified transforms
606 if (_transform) 612 if (_transform)
607 { 613 {
608 nextToken = _transform(next.tok.w.canon, nextToken); 614 nextToken = _transform(interned.w.canon, nextToken);
609 } 615 }
610 616
611 // Determine the casing of the next token. We randomly make the token all 617 // Determine the casing of the next token. We randomly make the token all
@@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const
615 if (casing < next.uppercase) 621 if (casing < next.uppercase)
616 { 622 {
617 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 623 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
618 } else if ((((cur.rbegin()->type == querytype::sentence) 624 } else {
619 || ((cur.rbegin()->type == querytype::literal) 625 bool capitalize = false;
620 && (cur.rbegin()->tok.suffix == suffixtype::terminating))) 626
621 && (rand() % 2 > 0)) 627 if (casing - next.uppercase < next.titlecase)
622 || (casing - next.uppercase < next.titlecase)) 628 {
623 { 629 capitalize = true;
624 nextToken[0] = toupper(nextToken[0]); 630 } else if (cur.rbegin()->type == querytype::sentence)
631 {
632 if (rand() % 2 > 0)
633 {
634 capitalize = true;
635 }
636 } else {
637 const token& lastTok = _tokenstore.get(cur.rbegin()->tok);
638
639 if (lastTok.suffix == suffixtype::terminating &&
640 rand() % 2 > 0)
641 {
642 capitalize = true;
643 }
644 }
645
646 if (capitalize)
647 {
648 nextToken[0] = toupper(nextToken[0]);
649 }
625 } 650 }
626 651
627 // Delimiters 652 // Delimiters
628 for (auto& dt : next.tok.delimiters) 653 for (auto& dt : interned.delimiters)
629 { 654 {
630 if (dt.first.status == doublestatus::both) 655 if (dt.first.status == doublestatus::both)
631 { 656 {
@@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const
692 } 717 }
693 718
694 // Terminators 719 // Terminators
695 if (next.tok.suffix == suffixtype::terminating) 720 if (interned.suffix == suffixtype::terminating)
696 { 721 {
697 auto term = next.tok.w.terms.next(); 722 auto term = interned.w.terms.next();
698 nextToken.append(term.form); 723 nextToken.append(term.form);
699 724
700 if (term.newline) 725 if (term.newline)
@@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const
703 } else { 728 } else {
704 nextToken.append(" "); 729 nextToken.append(" ");
705 } 730 }
706 } else if (next.tok.suffix == suffixtype::comma) 731 } else if (interned.suffix == suffixtype::comma)
707 { 732 {
708 nextToken.append(", "); 733 nextToken.append(", ");
709 } else { 734 } else {
@@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const
734 759
735 cur.push_back(next.tok); 760 cur.push_back(next.tok);
736 result.append(nextToken); 761 result.append(nextToken);
737 762
738 if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) 763 if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
739 { 764 {
740 break; 765 break;
741 } 766 }
diff --git a/kgramstats.h b/kgramstats.h index 2ee0e35..49fe04e 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -6,6 +6,7 @@
6#include <list> 6#include <list>
7#include <vector> 7#include <vector>
8#include "histogram.h" 8#include "histogram.h"
9#include "identifier.h"
9#include <functional> 10#include <functional>
10#include <set> 11#include <set>
11 12
@@ -92,6 +93,9 @@ class rawr {
92 } 93 }
93 }; 94 };
94 95
96 using tokenstore = identifier<token>;
97 using token_id = tokenstore::key_type;
98
95 enum class querytype { 99 enum class querytype {
96 literal, 100 literal,
97 sentence 101 sentence
@@ -99,12 +103,12 @@ class rawr {
99 103
100 struct query { 104 struct query {
101 querytype type; 105 querytype type;
102 token tok; 106 token_id tok;
103 107
104 query(token tok) : tok(tok), type(querytype::literal) {} 108 query(token_id tok) : tok(tok), type(querytype::literal) {}
105 109
106 query(querytype type) : tok(blank_word), type(type) {} 110 query(querytype type) : tok(0), type(type) {}
107 111
108 bool operator<(const query& other) const 112 bool operator<(const query& other) const
109 { 113 {
110 if (type == other.type) 114 if (type == other.type)
@@ -126,10 +130,10 @@ class rawr {
126 int all; 130 int all;
127 int titlecase; 131 int titlecase;
128 int uppercase; 132 int uppercase;
129 token tok; 133 token_id tok;
130 std::set<int> corpora; 134 std::set<int> corpora;
131 135
132 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} 136 token_data(token_id tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
133 }; 137 };
134 138
135 friend std::ostream& operator<<(std::ostream& os, kgram k); 139 friend std::ostream& operator<<(std::ostream& os, kgram k);
@@ -140,6 +144,7 @@ class rawr {
140 int _maxK; 144 int _maxK;
141 bool _compiled = false; 145 bool _compiled = false;
142 std::vector<std::string> _corpora; 146 std::vector<std::string> _corpora;
147 tokenstore _tokenstore;
143 std::map<kgram, std::map<int, token_data>> _stats; 148 std::map<kgram, std::map<int, token_data>> _stats;
144 transform_callback _transform; 149 transform_callback _transform;
145 int _min_corpora = 1; 150 int _min_corpora = 1;