diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-01 09:30:04 -0500 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-01 09:30:04 -0500 |
| commit | 617155fe562652c859a380d85cc5710783d79448 (patch) | |
| tree | f5eee89b0fa4b3c9dfe7187ca78916a71b59045e /kgramstats.cpp | |
| parent | b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (diff) | |
| download | rawr-ebooks-617155fe562652c859a380d85cc5710783d79448.tar.gz rawr-ebooks-617155fe562652c859a380d85cc5710783d79448.tar.bz2 rawr-ebooks-617155fe562652c859a380d85cc5710783d79448.zip | |
Added emoji freevar
Strings of emojis are tokenized separately from anything else, and added to an emoticon freevar, which is mixed in with regular emoticons like :P. This breaks old-style freevars like $name$ and $noun$, so some legacy support for compatibility is left in, but eventually $name$ should be made into an actual new freevar. Emoji data is from gemoji (https://github.com/github/gemoji).
Diffstat (limited to 'kgramstats.cpp')
| -rw-r--r-- | kgramstats.cpp | 105 |
1 file changed, 102 insertions, 3 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 0ab0c99..5b571d6 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -39,6 +39,9 @@ | |||
| 39 | #include <algorithm> | 39 | #include <algorithm> |
| 40 | #include <set> | 40 | #include <set> |
| 41 | #include <stack> | 41 | #include <stack> |
| 42 | #include "freevars.h" | ||
| 43 | #include <fstream> | ||
| 44 | #include "prefix_search.h" | ||
| 42 | 45 | ||
| 43 | query wildcardQuery {querytype::sentence}; | 46 | query wildcardQuery {querytype::sentence}; |
| 44 | word blank_word {""}; | 47 | word blank_word {""}; |
| @@ -53,14 +56,55 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 53 | size_t start = 0; | 56 | size_t start = 0; |
| 54 | int end = 0; | 57 | int end = 0; |
| 55 | std::set<std::string> thashtags; | 58 | std::set<std::string> thashtags; |
| 59 | freevar fv_emoticons {emoticons, "emoticons.txt"}; | ||
| 60 | |||
| 61 | std::cout << "Reading emojis..." << std::endl; | ||
| 62 | prefix_search emojis; | ||
| 63 | std::ifstream emoji_file("emojis.txt"); | ||
| 64 | if (emoji_file) | ||
| 65 | { | ||
| 66 | while (!emoji_file.eof()) | ||
| 67 | { | ||
| 68 | std::string rawmojis; | ||
| 69 | getline(emoji_file, rawmojis); | ||
| 70 | emojis.add(rawmojis); | ||
| 71 | } | ||
| 72 | |||
| 73 | emoji_file.close(); | ||
| 74 | } | ||
| 56 | 75 | ||
| 76 | std::cout << "Tokenizing corpus..." << std::endl; | ||
| 57 | while (end != std::string::npos) | 77 | while (end != std::string::npos) |
| 58 | { | 78 | { |
| 59 | end = corpus.find(" ", start); | 79 | end = corpus.find(" ", start); |
| 60 | 80 | ||
| 61 | std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 81 | bool emoji = false; |
| 62 | if (t.compare("") && t.compare(".")) | 82 | std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
| 83 | std::string t = ""; | ||
| 84 | |||
| 85 | if (te.compare("") && te.compare(".")) | ||
| 63 | { | 86 | { |
| 87 | // Extract strings of emojis into their own tokens even if they're not space delimited | ||
| 88 | int m = emojis.match(te); | ||
| 89 | emoji = m > 0; | ||
| 90 | if (m == 0) m = 1; | ||
| 91 | t = te.substr(0,m); | ||
| 92 | te = te.substr(m); | ||
| 93 | |||
| 94 | while (!te.empty()) | ||
| 95 | { | ||
| 96 | m = emojis.match(te); | ||
| 97 | if (emoji == (m > 0)) | ||
| 98 | { | ||
| 99 | if (m == 0) m = 1; | ||
| 100 | t += te.substr(0,m); | ||
| 101 | te = te.substr(m); | ||
| 102 | } else { | ||
| 103 | end = start + t.length() - 1; | ||
| 104 | break; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | |||
| 64 | std::string tc(t), canonical; | 108 | std::string tc(t), canonical; |
| 65 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
| 66 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | 110 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { |
| @@ -72,11 +116,29 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 72 | if (canonical[0] == '#') | 116 | if (canonical[0] == '#') |
| 73 | { | 117 | { |
| 74 | thashtags.insert(canonical); | 118 | thashtags.insert(canonical); |
| 75 | canonical = "#hashtag"; | ||
| 76 | 119 | ||
| 77 | return hashtags; | 120 | return hashtags; |
| 78 | } | 121 | } |
| 79 | 122 | ||
| 123 | // Emoticon freevar | ||
| 124 | if (emoji) | ||
| 125 | { | ||
| 126 | emoticons.forms.add(canonical); | ||
| 127 | |||
| 128 | return emoticons; | ||
| 129 | } | ||
| 130 | |||
| 131 | std::string emoticon_canon; | ||
| 132 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) { | ||
| 133 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
| 134 | }); | ||
| 135 | if (fv_emoticons.check(emoticon_canon)) | ||
| 136 | { | ||
| 137 | emoticons.forms.add(emoticon_canon); | ||
| 138 | |||
| 139 | return emoticons; | ||
| 140 | } | ||
| 141 | |||
| 80 | // Basically any other word | 142 | // Basically any other word |
| 81 | if (words.count(canonical) == 0) | 143 | if (words.count(canonical) == 0) |
| 82 | { | 144 | { |
| @@ -171,6 +233,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 171 | } | 233 | } |
| 172 | 234 | ||
| 173 | // Time to condense the distribution stuff for the words | 235 | // Time to condense the distribution stuff for the words |
| 236 | std::cout << "Compiling token histograms..." << std::endl; | ||
| 174 | for (auto& it : words) | 237 | for (auto& it : words) |
| 175 | { | 238 | { |
| 176 | it.second.forms.compile(); | 239 | it.second.forms.compile(); |
| @@ -185,8 +248,13 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 185 | 248 | ||
| 186 | hashtags.forms.compile(); | 249 | hashtags.forms.compile(); |
| 187 | hashtags.terms.compile(); | 250 | hashtags.terms.compile(); |
| 251 | |||
| 252 | // Compile other freevars | ||
| 253 | emoticons.forms.compile(); | ||
| 254 | emoticons.terms.compile(); | ||
| 188 | 255 | ||
| 189 | // kgram distribution | 256 | // kgram distribution |
| 257 | std::cout << "Creating markov chain..." << std::endl; | ||
| 190 | std::map<kgram, std::map<token, token_data> > tstats; | 258 | std::map<kgram, std::map<token, token_data> > tstats; |
| 191 | for (int k=1; k<maxK; k++) | 259 | for (int k=1; k<maxK; k++) |
| 192 | { | 260 | { |
| @@ -246,6 +314,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 246 | } | 314 | } |
| 247 | 315 | ||
| 248 | // Condense the kgram distribution | 316 | // Condense the kgram distribution |
| 317 | std::cout << "Compiling kgram distributions..." << std::endl; | ||
| 249 | for (auto& it : tstats) | 318 | for (auto& it : tstats) |
| 250 | { | 319 | { |
| 251 | kgram klist = it.first; | 320 | kgram klist = it.first; |
| @@ -454,6 +523,36 @@ std::string kgramstats::randomSentence(int n) | |||
| 454 | 523 | ||
| 455 | open_delimiters.pop(); | 524 | open_delimiters.pop(); |
| 456 | } | 525 | } |
| 526 | |||
| 527 | // Replace old-style freevars while I can't be bothered to remake the corpus yet | ||
| 528 | std::vector<std::string> fv_names; | ||
| 529 | std::ifstream namefile("names.txt"); | ||
| 530 | while (!namefile.eof()) | ||
| 531 | { | ||
| 532 | std::string l; | ||
| 533 | getline(namefile, l); | ||
| 534 | fv_names.push_back(l); | ||
| 535 | } | ||
| 536 | |||
| 537 | int cpos; | ||
| 538 | while ((cpos = result.find("$name$")) != std::string::npos) | ||
| 539 | { | ||
| 540 | result.replace(cpos, 6, fv_names[rand() % fv_names.size()]); | ||
| 541 | } | ||
| 542 | |||
| 543 | std::vector<std::string> fv_nouns; | ||
| 544 | std::ifstream nounfile("nouns.txt"); | ||
| 545 | while (!nounfile.eof()) | ||
| 546 | { | ||
| 547 | std::string l; | ||
| 548 | getline(nounfile, l); | ||
| 549 | fv_nouns.push_back(l); | ||
| 550 | } | ||
| 551 | |||
| 552 | while ((cpos = result.find("$noun$")) != std::string::npos) | ||
| 553 | { | ||
| 554 | result.replace(cpos, 6, fv_nouns[rand() % fv_nouns.size()]); | ||
| 555 | } | ||
| 457 | 556 | ||
| 458 | return result; | 557 | return result; |
| 459 | } | 558 | } |
