diff options
| -rw-r--r-- | identifier.h | 59 | ||||
| -rw-r--r-- | kgramstats.cpp | 95 | ||||
| -rw-r--r-- | kgramstats.h | 23 |
3 files changed, 133 insertions, 44 deletions
| diff --git a/identifier.h b/identifier.h new file mode 100644 index 0000000..74d83ce --- /dev/null +++ b/identifier.h | |||
| @@ -0,0 +1,59 @@ | |||
| 1 | #ifndef IDENTIFIER_H_D7EE5679 | ||
| 2 | #define IDENTIFIER_H_D7EE5679 | ||
| 3 | |||
| 4 | #include <map> | ||
| 5 | #include <vector> | ||
| 6 | |||
/**
 * Interns values of type T, handing out a dense integer key for each distinct
 * value that is added.
 *
 * Keys are indices into an internal vector of unique values, so get() is a
 * bounds-checked vector lookup. While values are being added, a std::map from
 * value to key is kept so that a repeated value resolves to its existing key.
 * compile() discards that map once it is no longer needed.
 */
template <typename T>
class identifier {
public:

  using value_type = T;

private:

  using vector_type = std::vector<value_type>;

public:

  using key_type = typename vector_type::size_type;

  /**
   * Returns the key for val, registering val first if it is not yet known.
   *
   * The new key is taken from uniq_.size() rather than ids_.size(). The two
   * are equal until compile() empties ids_; after that only uniq_.size()
   * still equals the index that push_back is about to fill, so keys stay
   * unique and always satisfy get(add(v)) == v.
   *
   * Note that values added after compile() are not deduplicated against
   * values added before it, because the lookup map has been discarded.
   */
  key_type add(const value_type& val)
  {
    auto it = ids_.find(val);

    if (it != ids_.end())
    {
      return it->second;
    }

    const key_type ret = uniq_.size();

    // Store the value before publishing its key, so the map never holds a key
    // that points past the end of uniq_ if push_back throws.
    uniq_.push_back(val);
    ids_.emplace(val, ret);

    return ret;
  }

  /**
   * Drops the value-to-key lookup map. Previously issued keys remain valid
   * for get(), since the vector of unique values is left untouched.
   */
  void compile()
  {
    ids_.clear();
  }

  /**
   * Returns the value registered under key i.
   * Throws std::out_of_range (from std::vector::at) if i is not a valid key.
   */
  const value_type& get(key_type i) const
  {
    return uniq_.at(i);
  }

  /** Returns the number of stored values, i.e. one past the largest key. */
  key_type size() const
  {
    return uniq_.size();
  }

private:

  // Value -> key lookup, only populated until compile() is called.
  std::map<value_type, key_type> ids_;

  // Key -> value storage; a value's key is its index here.
  vector_type uniq_;
};
| 58 | |||
| 59 | #endif /* end of include guard: IDENTIFIER_H_D7EE5679 */ | ||
| diff --git a/kgramstats.cpp b/kgramstats.cpp index c674e80..30d4407 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus) | |||
| 55 | void rawr::compile(int maxK) | 55 | void rawr::compile(int maxK) |
| 56 | { | 56 | { |
| 57 | _maxK = maxK; | 57 | _maxK = maxK; |
| 58 | 58 | ||
| 59 | std::vector<std::vector<token>> tokens; | 59 | std::vector<std::vector<token_id>> tokens; |
| 60 | std::set<std::string> thashtags; | 60 | std::set<std::string> thashtags; |
| 61 | std::set<std::string> fv_emoticons; | 61 | std::set<std::string> fv_emoticons; |
| 62 | 62 | ||
| @@ -120,8 +120,8 @@ void rawr::compile(int maxK) | |||
| 120 | { | 120 | { |
| 121 | size_t start = 0; | 121 | size_t start = 0; |
| 122 | int end = 0; | 122 | int end = 0; |
| 123 | std::vector<token> tkcor; | 123 | std::vector<token_id> tkcor; |
| 124 | 124 | ||
| 125 | while (end != std::string::npos) | 125 | while (end != std::string::npos) |
| 126 | { | 126 | { |
| 127 | perprime = (startper + end) * 100 / len; | 127 | perprime = (startper + end) * 100 / len; |
| @@ -336,8 +336,8 @@ void rawr::compile(int maxK) | |||
| 336 | } | 336 | } |
| 337 | } | 337 | } |
| 338 | } | 338 | } |
| 339 | 339 | ||
| 340 | tkcor.push_back(tk); | 340 | tkcor.push_back(_tokenstore.add(tk)); |
| 341 | } | 341 | } |
| 342 | 342 | ||
| 343 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 343 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
| @@ -377,9 +377,12 @@ void rawr::compile(int maxK) | |||
| 377 | emoticons.forms.compile(); | 377 | emoticons.forms.compile(); |
| 378 | emoticons.terms.compile(); | 378 | emoticons.terms.compile(); |
| 379 | 379 | ||
| 380 | // Compile the interned tokens. | ||
| 381 | _tokenstore.compile(); | ||
| 382 | |||
| 380 | // kgram distribution | 383 | // kgram distribution |
| 381 | std::cout << "Creating markov chain... 0%" << std::flush; | 384 | std::cout << "Creating markov chain... 0%" << std::flush; |
| 382 | std::map<kgram, std::map<token, token_data> > tstats; | 385 | std::map<kgram, std::map<token_id, token_data> > tstats; |
| 383 | 386 | ||
| 384 | len = 0; | 387 | len = 0; |
| 385 | for (auto c : tokens) | 388 | for (auto c : tokens) |
| @@ -408,14 +411,15 @@ void rawr::compile(int maxK) | |||
| 408 | } | 411 | } |
| 409 | 412 | ||
| 410 | kgram prefix(corpus.begin()+i, corpus.begin()+i+k); | 413 | kgram prefix(corpus.begin()+i, corpus.begin()+i+k); |
| 411 | token f = corpus[i+k]; | 414 | token_id fid = corpus[i+k]; |
| 415 | const token& f = _tokenstore.get(fid); | ||
| 412 | 416 | ||
| 413 | if (tstats[prefix].count(f) == 0) | 417 | if (tstats[prefix].count(fid) == 0) |
| 414 | { | 418 | { |
| 415 | tstats[prefix].emplace(f, f); | 419 | tstats[prefix].emplace(fid, fid); |
| 416 | } | 420 | } |
| 417 | 421 | ||
| 418 | token_data& td = tstats[prefix].at(f); | 422 | token_data& td = tstats[prefix].at(fid); |
| 419 | td.all++; | 423 | td.all++; |
| 420 | td.corpora.insert(corpid); | 424 | td.corpora.insert(corpid); |
| 421 | 425 | ||
| @@ -426,19 +430,20 @@ void rawr::compile(int maxK) | |||
| 426 | { | 430 | { |
| 427 | td.titlecase++; | 431 | td.titlecase++; |
| 428 | } | 432 | } |
| 429 | 433 | ||
| 430 | if (std::begin(prefix)->tok.suffix == suffixtype::terminating) | 434 | const token& startTok = _tokenstore.get(std::begin(prefix)->tok); |
| 435 | if (startTok.suffix == suffixtype::terminating) | ||
| 431 | { | 436 | { |
| 432 | kgram term_prefix(prefix); | 437 | kgram term_prefix(prefix); |
| 433 | term_prefix.pop_front(); | 438 | term_prefix.pop_front(); |
| 434 | term_prefix.push_front(wildcardQuery); | 439 | term_prefix.push_front(wildcardQuery); |
| 435 | 440 | ||
| 436 | if (tstats[term_prefix].count(f) == 0) | 441 | if (tstats[term_prefix].count(fid) == 0) |
| 437 | { | 442 | { |
| 438 | tstats[term_prefix].emplace(f, f); | 443 | tstats[term_prefix].emplace(fid, fid); |
| 439 | } | 444 | } |
| 440 | 445 | ||
| 441 | token_data& td2 = tstats[term_prefix].at(f); | 446 | token_data& td2 = tstats[term_prefix].at(fid); |
| 442 | td2.all++; | 447 | td2.all++; |
| 443 | td2.corpora.insert(corpid); | 448 | td2.corpora.insert(corpid); |
| 444 | 449 | ||
| @@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const | |||
| 600 | int max = distribution.rbegin()->first; | 605 | int max = distribution.rbegin()->first; |
| 601 | int r = rand() % max; | 606 | int r = rand() % max; |
| 602 | const token_data& next = distribution.upper_bound(r)->second; | 607 | const token_data& next = distribution.upper_bound(r)->second; |
| 603 | std::string nextToken = next.tok.w.forms.next(); | 608 | const token& interned = _tokenstore.get(next.tok); |
| 604 | 609 | std::string nextToken = interned.w.forms.next(); | |
| 610 | |||
| 605 | // Apply user-specified transforms | 611 | // Apply user-specified transforms |
| 606 | if (_transform) | 612 | if (_transform) |
| 607 | { | 613 | { |
| 608 | nextToken = _transform(next.tok.w.canon, nextToken); | 614 | nextToken = _transform(interned.w.canon, nextToken); |
| 609 | } | 615 | } |
| 610 | 616 | ||
| 611 | // Determine the casing of the next token. We randomly make the token all | 617 | // Determine the casing of the next token. We randomly make the token all |
| @@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const | |||
| 615 | if (casing < next.uppercase) | 621 | if (casing < next.uppercase) |
| 616 | { | 622 | { |
| 617 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 623 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); |
| 618 | } else if ((((cur.rbegin()->type == querytype::sentence) | 624 | } else { |
| 619 | || ((cur.rbegin()->type == querytype::literal) | 625 | bool capitalize = false; |
| 620 | && (cur.rbegin()->tok.suffix == suffixtype::terminating))) | 626 | |
| 621 | && (rand() % 2 > 0)) | 627 | if (casing - next.uppercase < next.titlecase) |
| 622 | || (casing - next.uppercase < next.titlecase)) | 628 | { |
| 623 | { | 629 | capitalize = true; |
| 624 | nextToken[0] = toupper(nextToken[0]); | 630 | } else if (cur.rbegin()->type == querytype::sentence) |
| 631 | { | ||
| 632 | if (rand() % 2 > 0) | ||
| 633 | { | ||
| 634 | capitalize = true; | ||
| 635 | } | ||
| 636 | } else { | ||
| 637 | const token& lastTok = _tokenstore.get(cur.rbegin()->tok); | ||
| 638 | |||
| 639 | if (lastTok.suffix == suffixtype::terminating && | ||
| 640 | rand() % 2 > 0) | ||
| 641 | { | ||
| 642 | capitalize = true; | ||
| 643 | } | ||
| 644 | } | ||
| 645 | |||
| 646 | if (capitalize) | ||
| 647 | { | ||
| 648 | nextToken[0] = toupper(nextToken[0]); | ||
| 649 | } | ||
| 625 | } | 650 | } |
| 626 | 651 | ||
| 627 | // Delimiters | 652 | // Delimiters |
| 628 | for (auto& dt : next.tok.delimiters) | 653 | for (auto& dt : interned.delimiters) |
| 629 | { | 654 | { |
| 630 | if (dt.first.status == doublestatus::both) | 655 | if (dt.first.status == doublestatus::both) |
| 631 | { | 656 | { |
| @@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const | |||
| 692 | } | 717 | } |
| 693 | 718 | ||
| 694 | // Terminators | 719 | // Terminators |
| 695 | if (next.tok.suffix == suffixtype::terminating) | 720 | if (interned.suffix == suffixtype::terminating) |
| 696 | { | 721 | { |
| 697 | auto term = next.tok.w.terms.next(); | 722 | auto term = interned.w.terms.next(); |
| 698 | nextToken.append(term.form); | 723 | nextToken.append(term.form); |
| 699 | 724 | ||
| 700 | if (term.newline) | 725 | if (term.newline) |
| @@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const | |||
| 703 | } else { | 728 | } else { |
| 704 | nextToken.append(" "); | 729 | nextToken.append(" "); |
| 705 | } | 730 | } |
| 706 | } else if (next.tok.suffix == suffixtype::comma) | 731 | } else if (interned.suffix == suffixtype::comma) |
| 707 | { | 732 | { |
| 708 | nextToken.append(", "); | 733 | nextToken.append(", "); |
| 709 | } else { | 734 | } else { |
| @@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const | |||
| 734 | 759 | ||
| 735 | cur.push_back(next.tok); | 760 | cur.push_back(next.tok); |
| 736 | result.append(nextToken); | 761 | result.append(nextToken); |
| 737 | 762 | ||
| 738 | if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 763 | if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) |
| 739 | { | 764 | { |
| 740 | break; | 765 | break; |
| 741 | } | 766 | } |
| diff --git a/kgramstats.h b/kgramstats.h index 2ee0e35..49fe04e 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <list> | 6 | #include <list> |
| 7 | #include <vector> | 7 | #include <vector> |
| 8 | #include "histogram.h" | 8 | #include "histogram.h" |
| 9 | #include "identifier.h" | ||
| 9 | #include <functional> | 10 | #include <functional> |
| 10 | #include <set> | 11 | #include <set> |
| 11 | 12 | ||
| @@ -92,6 +93,9 @@ class rawr { | |||
| 92 | } | 93 | } |
| 93 | }; | 94 | }; |
| 94 | 95 | ||
| 96 | using tokenstore = identifier<token>; | ||
| 97 | using token_id = tokenstore::key_type; | ||
| 98 | |||
| 95 | enum class querytype { | 99 | enum class querytype { |
| 96 | literal, | 100 | literal, |
| 97 | sentence | 101 | sentence |
| @@ -99,12 +103,12 @@ class rawr { | |||
| 99 | 103 | ||
| 100 | struct query { | 104 | struct query { |
| 101 | querytype type; | 105 | querytype type; |
| 102 | token tok; | 106 | token_id tok; |
| 103 | 107 | ||
| 104 | query(token tok) : tok(tok), type(querytype::literal) {} | 108 | query(token_id tok) : tok(tok), type(querytype::literal) {} |
| 105 | 109 | ||
| 106 | query(querytype type) : tok(blank_word), type(type) {} | 110 | query(querytype type) : tok(0), type(type) {} |
| 107 | 111 | ||
| 108 | bool operator<(const query& other) const | 112 | bool operator<(const query& other) const |
| 109 | { | 113 | { |
| 110 | if (type == other.type) | 114 | if (type == other.type) |
| @@ -126,10 +130,10 @@ class rawr { | |||
| 126 | int all; | 130 | int all; |
| 127 | int titlecase; | 131 | int titlecase; |
| 128 | int uppercase; | 132 | int uppercase; |
| 129 | token tok; | 133 | token_id tok; |
| 130 | std::set<int> corpora; | 134 | std::set<int> corpora; |
| 131 | 135 | ||
| 132 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | 136 | token_data(token_id tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
| 133 | }; | 137 | }; |
| 134 | 138 | ||
| 135 | friend std::ostream& operator<<(std::ostream& os, kgram k); | 139 | friend std::ostream& operator<<(std::ostream& os, kgram k); |
| @@ -140,6 +144,7 @@ class rawr { | |||
| 140 | int _maxK; | 144 | int _maxK; |
| 141 | bool _compiled = false; | 145 | bool _compiled = false; |
| 142 | std::vector<std::string> _corpora; | 146 | std::vector<std::string> _corpora; |
| 147 | tokenstore _tokenstore; | ||
| 143 | std::map<kgram, std::map<int, token_data>> _stats; | 148 | std::map<kgram, std::map<int, token_data>> _stats; |
| 144 | transform_callback _transform; | 149 | transform_callback _transform; |
| 145 | int _min_corpora = 1; | 150 | int _min_corpora = 1; |
