 identifier.h   | 59
 kgramstats.cpp | 95
 kgramstats.h   | 23
 3 files changed, 133 insertions(+), 44 deletions(-)
diff --git a/identifier.h b/identifier.h
new file mode 100644
index 0000000..74d83ce
--- /dev/null
+++ b/identifier.h
@@ -0,0 +1,59 @@
+#ifndef IDENTIFIER_H_D7EE5679
+#define IDENTIFIER_H_D7EE5679
+
+#include <map>
+#include <vector>
+
+template <typename T>
+class identifier {
+public:
+
+  using value_type = T;
+
+private:
+
+  using vector_type = std::vector<value_type>;
+
+public:
+
+  using key_type = typename vector_type::size_type;
+
+  key_type add(const value_type& val)
+  {
+    auto it = ids_.find(val);
+
+    if (it == std::end(ids_))
+    {
+      key_type ret = ids_.size();
+      ids_[val] = ret;
+
+      uniq_.push_back(val);
+
+      return ret;
+    } else {
+      return it->second;
+    }
+  }
+
+  void compile()
+  {
+    ids_.clear();
+  }
+
+  inline const value_type& get(key_type i) const
+  {
+    return uniq_.at(i);
+  }
+
+  inline key_type size() const
+  {
+    return uniq_.size();
+  }
+
+private:
+
+  std::map<value_type, key_type> ids_;
+  vector_type uniq_;
+};
+
+#endif /* end of include guard: IDENTIFIER_H_D7EE5679 */
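For context, the new identifier template interns values: add() returns a stable index for each distinct value, compile() drops the lookup map once every value has been added, and get() recovers the original value by index. Below is a minimal standalone sketch of that behaviour; the std::string instantiation and the main() driver are illustrative only and are not part of this commit.

    // Illustrative only: exercises identifier<T> from identifier.h above.
    #include <cassert>
    #include <iostream>
    #include <string>
    #include "identifier.h"

    int main()
    {
      identifier<std::string> store;

      auto kgramId = store.add("kgram");
      auto tokenId = store.add("token");
      auto againId = store.add("kgram"); // duplicate value, same id as kgramId

      assert(kgramId == againId);
      assert(store.get(tokenId) == "token");

      // Once everything is added, the lookup map can be discarded; get() and
      // size() still work because the vector of unique values is kept.
      store.compile();

      std::cout << store.size() << " unique values interned" << std::endl;

      return 0;
    }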
diff --git a/kgramstats.cpp b/kgramstats.cpp
index c674e80..30d4407 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus)
 void rawr::compile(int maxK)
 {
   _maxK = maxK;
 
-  std::vector<std::vector<token>> tokens;
+  std::vector<std::vector<token_id>> tokens;
   std::set<std::string> thashtags;
   std::set<std::string> fv_emoticons;
 
@@ -120,8 +120,8 @@ void rawr::compile(int maxK)
   {
     size_t start = 0;
     int end = 0;
-    std::vector<token> tkcor;
+    std::vector<token_id> tkcor;
 
     while (end != std::string::npos)
     {
       perprime = (startper + end) * 100 / len;
@@ -336,8 +336,8 @@ void rawr::compile(int maxK)
             }
           }
         }
 
-        tkcor.push_back(tk);
+        tkcor.push_back(_tokenstore.add(tk));
       }
 
       start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
@@ -377,9 +377,12 @@ void rawr::compile(int maxK)
   emoticons.forms.compile();
   emoticons.terms.compile();
 
+  // Compile the interned tokens.
+  _tokenstore.compile();
+
   // kgram distribution
   std::cout << "Creating markov chain... 0%" << std::flush;
-  std::map<kgram, std::map<token, token_data> > tstats;
+  std::map<kgram, std::map<token_id, token_data> > tstats;
 
   len = 0;
   for (auto c : tokens)
@@ -408,14 +411,15 @@ void rawr::compile(int maxK)
       }
 
       kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
-      token f = corpus[i+k];
+      token_id fid = corpus[i+k];
+      const token& f = _tokenstore.get(fid);
 
-      if (tstats[prefix].count(f) == 0)
+      if (tstats[prefix].count(fid) == 0)
       {
-        tstats[prefix].emplace(f, f);
+        tstats[prefix].emplace(fid, fid);
       }
 
-      token_data& td = tstats[prefix].at(f);
+      token_data& td = tstats[prefix].at(fid);
       td.all++;
       td.corpora.insert(corpid);
 
@@ -426,19 +430,20 @@ void rawr::compile(int maxK)
       {
         td.titlecase++;
       }
 
-      if (std::begin(prefix)->tok.suffix == suffixtype::terminating)
+      const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
+      if (startTok.suffix == suffixtype::terminating)
       {
         kgram term_prefix(prefix);
         term_prefix.pop_front();
         term_prefix.push_front(wildcardQuery);
 
-        if (tstats[term_prefix].count(f) == 0)
+        if (tstats[term_prefix].count(fid) == 0)
         {
-          tstats[term_prefix].emplace(f, f);
+          tstats[term_prefix].emplace(fid, fid);
         }
 
-        token_data& td2 = tstats[term_prefix].at(f);
+        token_data& td2 = tstats[term_prefix].at(fid);
         td2.all++;
         td2.corpora.insert(corpid);
 
@@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     const token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken = next.tok.w.forms.next();
+    const token& interned = _tokenstore.get(next.tok);
+    std::string nextToken = interned.w.forms.next();
 
     // Apply user-specified transforms
     if (_transform)
     {
-      nextToken = _transform(next.tok.w.canon, nextToken);
+      nextToken = _transform(interned.w.canon, nextToken);
     }
 
     // Determine the casing of the next token. We randomly make the token all
@@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const
     if (casing < next.uppercase)
     {
       std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-    } else if ((((cur.rbegin()->type == querytype::sentence)
-        || ((cur.rbegin()->type == querytype::literal)
-          && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
-        && (rand() % 2 > 0))
-      || (casing - next.uppercase < next.titlecase))
-    {
-      nextToken[0] = toupper(nextToken[0]);
+    } else {
+      bool capitalize = false;
+
+      if (casing - next.uppercase < next.titlecase)
+      {
+        capitalize = true;
+      } else if (cur.rbegin()->type == querytype::sentence)
+      {
+        if (rand() % 2 > 0)
+        {
+          capitalize = true;
+        }
+      } else {
+        const token& lastTok = _tokenstore.get(cur.rbegin()->tok);
+
+        if (lastTok.suffix == suffixtype::terminating &&
+            rand() % 2 > 0)
+        {
+          capitalize = true;
+        }
+      }
+
+      if (capitalize)
+      {
+        nextToken[0] = toupper(nextToken[0]);
+      }
     }
 
     // Delimiters
-    for (auto& dt : next.tok.delimiters)
+    for (auto& dt : interned.delimiters)
     {
       if (dt.first.status == doublestatus::both)
       {
@@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const
     }
 
     // Terminators
-    if (next.tok.suffix == suffixtype::terminating)
+    if (interned.suffix == suffixtype::terminating)
     {
-      auto term = next.tok.w.terms.next();
+      auto term = interned.w.terms.next();
       nextToken.append(term.form);
 
       if (term.newline)
@@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const
       } else {
         nextToken.append(" ");
       }
-    } else if (next.tok.suffix == suffixtype::comma)
+    } else if (interned.suffix == suffixtype::comma)
     {
       nextToken.append(", ");
     } else {
@@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const
 
     cur.push_back(next.tok);
     result.append(nextToken);
 
-    if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
+    if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
     {
       break;
     }
diff --git a/kgramstats.h b/kgramstats.h
index 2ee0e35..49fe04e 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -6,6 +6,7 @@
 #include <list>
 #include <vector>
 #include "histogram.h"
+#include "identifier.h"
 #include <functional>
 #include <set>
 
@@ -92,6 +93,9 @@ class rawr {
     }
   };
 
+  using tokenstore = identifier<token>;
+  using token_id = tokenstore::key_type;
+
   enum class querytype {
     literal,
     sentence
@@ -99,12 +103,12 @@ class rawr {
 
   struct query {
     querytype type;
-    token tok;
+    token_id tok;
 
-    query(token tok) : tok(tok), type(querytype::literal) {}
+    query(token_id tok) : tok(tok), type(querytype::literal) {}
 
-    query(querytype type) : tok(blank_word), type(type) {}
+    query(querytype type) : tok(0), type(type) {}
 
     bool operator<(const query& other) const
     {
       if (type == other.type)
@@ -126,10 +130,10 @@ class rawr {
     int all;
     int titlecase;
     int uppercase;
-    token tok;
+    token_id tok;
     std::set<int> corpora;
 
-    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
+    token_data(token_id tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
   };
 
   friend std::ostream& operator<<(std::ostream& os, kgram k);
@@ -140,6 +144,7 @@ class rawr {
   int _maxK;
   bool _compiled = false;
   std::vector<std::string> _corpora;
+  tokenstore _tokenstore;
   std::map<kgram, std::map<int, token_data>> _stats;
   transform_callback _transform;
   int _min_corpora = 1;
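As a usage note, the public surface visible in the hunk headers above (addCorpus, compile, randomSentence) is enough to drive the generator end to end. The sketch below assumes rawr is default-constructible and that a local corpus.txt exists; both are illustrative assumptions, not part of the commit.

    // Illustrative driver; assumes a default-constructible rawr and a local
    // corpus.txt. Only addCorpus/compile/randomSentence are taken from the
    // hunk headers above.
    #include <cstdlib>
    #include <ctime>
    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include "kgramstats.h"

    int main()
    {
      srand(static_cast<unsigned int>(time(nullptr))); // randomSentence() uses rand() internally

      std::ifstream in("corpus.txt");
      std::stringstream buf;
      buf << in.rdbuf();

      rawr generator;
      generator.addCorpus(buf.str());
      generator.compile(4); // build the chain with kgram order up to 4

      std::cout << generator.randomSentence(140) << std::endl;

      return 0;
    }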