author    Kelly Rauchenberger <fefferburbia@gmail.com>    2016-02-01 09:30:04 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com>    2016-02-01 09:30:04 -0500
commit    617155fe562652c859a380d85cc5710783d79448 (patch)
tree      f5eee89b0fa4b3c9dfe7187ca78916a71b59045e /kgramstats.cpp
parent    b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (diff)
Added emoji freevar
Strings of emojis are tokenized separately from anything else and added to an emoticon freevar, which is mixed in with regular emoticons like :P. This breaks old-style freevars like $name$ and $noun$, so some legacy support is left in for compatibility, but eventually $name$ should be made into an actual new freevar. Emoji data is from gemoji (https://github.com/github/gemoji).
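As a rough illustration of the tokenization change in the diff below, here is a minimal, self-contained sketch of the greedy run-splitting idea. The emoji_set type and its match() method are hypothetical stand-ins for the repository's prefix_search class, assumed to return the byte length of the longest known emoji at the front of the string (0 if there is none); only the splitting loop mirrors the committed code.

// Hedged sketch only: emoji_set is a hypothetical stand-in for the repo's
// prefix_search class, loaded here from an inline set instead of emojis.txt.
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct emoji_set
{
  std::set<std::string> emojis;

  size_t longest() const
  {
    size_t m = 0;
    for (const auto& e : emojis) m = std::max(m, e.size());
    return m;
  }

  // Length of the longest known emoji prefix of s, or 0 if s starts with text.
  size_t match(const std::string& s) const
  {
    for (size_t len = std::min(s.size(), longest()); len > 0; len--)
    {
      if (emojis.count(s.substr(0, len))) return len;
    }
    return 0;
  }
};

// Split one whitespace-delimited chunk into maximal runs that are either all
// emoji or all ordinary text, mirroring the splitting loop added in the diff.
std::vector<std::string> split_runs(std::string te, const emoji_set& emojis)
{
  std::vector<std::string> out;
  while (!te.empty())
  {
    bool emoji = emojis.match(te) > 0;
    std::string run;
    while (!te.empty())
    {
      size_t m = emojis.match(te);
      if ((m > 0) != emoji) break; // run type changed; emit what we have
      if (m == 0) m = 1;           // consume one byte of ordinary text
      run += te.substr(0, m);
      te = te.substr(m);
    }
    out.push_back(run);
  }
  return out;
}

int main()
{
  emoji_set emojis;
  emojis.emojis = {"😀", "🎉"}; // tiny stand-in for the gemoji-derived emojis.txt
  for (const auto& tok : split_runs("hi😀🎉!", emojis))
  {
    std::cout << "[" << tok << "]" << std::endl; // prints [hi], [😀🎉], [!]
  }
}

In the committed constructor, only the first run becomes the token t; end is rewound (end = start + t.length() - 1) so the remainder of the chunk is reconsidered on the next pass of the tokenizing loop.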
Diffstat (limited to 'kgramstats.cpp')
 -rw-r--r--   kgramstats.cpp   105
 1 file changed, 102 insertions(+), 3 deletions(-)
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 0ab0c99..5b571d6 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -39,6 +39,9 @@
 #include <algorithm>
 #include <set>
 #include <stack>
+#include "freevars.h"
+#include <fstream>
+#include "prefix_search.h"
 
 query wildcardQuery {querytype::sentence};
 word blank_word {""};
@@ -53,14 +56,55 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   size_t start = 0;
   int end = 0;
   std::set<std::string> thashtags;
+  freevar fv_emoticons {emoticons, "emoticons.txt"};
+
+  std::cout << "Reading emojis..." << std::endl;
+  prefix_search emojis;
+  std::ifstream emoji_file("emojis.txt");
+  if (emoji_file)
+  {
+    while (!emoji_file.eof())
+    {
+      std::string rawmojis;
+      getline(emoji_file, rawmojis);
+      emojis.add(rawmojis);
+    }
+
+    emoji_file.close();
+  }
 
+  std::cout << "Tokenizing corpus..." << std::endl;
   while (end != std::string::npos)
   {
     end = corpus.find(" ", start);
 
-    std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    if (t.compare("") && t.compare("."))
+    bool emoji = false;
+    std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+    std::string t = "";
+
+    if (te.compare("") && te.compare("."))
     {
+      // Extract strings of emojis into their own tokens even if they're not space delimited
+      int m = emojis.match(te);
+      emoji = m > 0;
+      if (m == 0) m = 1;
+      t = te.substr(0,m);
+      te = te.substr(m);
+
+      while (!te.empty())
+      {
+        m = emojis.match(te);
+        if (emoji == (m > 0))
+        {
+          if (m == 0) m = 1;
+          t += te.substr(0,m);
+          te = te.substr(m);
+        } else {
+          end = start + t.length() - 1;
+          break;
+        }
+      }
+
       std::string tc(t), canonical;
       std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
       std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
@@ -72,11 +116,29 @@ kgramstats::kgramstats(std::string corpus, int maxK)
       if (canonical[0] == '#')
       {
         thashtags.insert(canonical);
-        canonical = "#hashtag";
 
         return hashtags;
       }
 
+      // Emoticon freevar
+      if (emoji)
+      {
+        emoticons.forms.add(canonical);
+
+        return emoticons;
+      }
+
+      std::string emoticon_canon;
+      std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) {
+        return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
+      });
+      if (fv_emoticons.check(emoticon_canon))
+      {
+        emoticons.forms.add(emoticon_canon);
+
+        return emoticons;
+      }
+
       // Basically any other word
       if (words.count(canonical) == 0)
       {
@@ -171,6 +233,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   }
 
   // Time to condense the distribution stuff for the words
+  std::cout << "Compiling token histograms..." << std::endl;
   for (auto& it : words)
   {
     it.second.forms.compile();
@@ -185,8 +248,13 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 
   hashtags.forms.compile();
   hashtags.terms.compile();
+
+  // Compile other freevars
+  emoticons.forms.compile();
+  emoticons.terms.compile();
 
   // kgram distribution
+  std::cout << "Creating markov chain..." << std::endl;
   std::map<kgram, std::map<token, token_data> > tstats;
   for (int k=1; k<maxK; k++)
   {
@@ -246,6 +314,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   }
 
   // Condense the kgram distribution
+  std::cout << "Compiling kgram distributions..." << std::endl;
   for (auto& it : tstats)
   {
     kgram klist = it.first;
@@ -454,6 +523,36 @@ std::string kgramstats::randomSentence(int n)
 
     open_delimiters.pop();
   }
+
+  // Replace old-style freevars while I can't be bothered to remake the corpus yet
+  std::vector<std::string> fv_names;
+  std::ifstream namefile("names.txt");
+  while (!namefile.eof())
+  {
+    std::string l;
+    getline(namefile, l);
+    fv_names.push_back(l);
+  }
+
+  int cpos;
+  while ((cpos = result.find("$name$")) != std::string::npos)
+  {
+    result.replace(cpos, 6, fv_names[rand() % fv_names.size()]);
+  }
+
+  std::vector<std::string> fv_nouns;
+  std::ifstream nounfile("nouns.txt");
+  while (!nounfile.eof())
+  {
+    std::string l;
+    getline(nounfile, l);
+    fv_nouns.push_back(l);
+  }
+
+  while ((cpos = result.find("$noun$")) != std::string::npos)
+  {
+    result.replace(cpos, 6, fv_nouns[rand() % fv_nouns.size()]);
+  }
 
   return result;
 }
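For completeness, here is a small standalone sketch of the legacy $name$/$noun$ substitution performed at the end of randomSentence() in the final hunk above. The replace_freevar and read_lines helpers and the use of std::string::size_type are illustrative, not part of the repository; the file names names.txt and nouns.txt come from the diff, and each is assumed to hold one entry per line.

// Hedged sketch only: mirrors the "$name$"/"$noun$" replacement loops kept
// for old-style corpora, factored into a hypothetical helper.
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

// Read a candidate list, one entry per line.
std::vector<std::string> read_lines(const std::string& filename)
{
  std::vector<std::string> lines;
  std::ifstream file(filename);
  std::string l;
  while (getline(file, l))
  {
    lines.push_back(l);
  }
  return lines;
}

// Replace every occurrence of placeholder with a randomly chosen candidate.
std::string replace_freevar(std::string result, const std::string& placeholder,
                            const std::vector<std::string>& choices)
{
  std::string::size_type cpos;
  while ((cpos = result.find(placeholder)) != std::string::npos)
  {
    result.replace(cpos, placeholder.length(), choices[rand() % choices.size()]);
  }
  return result;
}

int main()
{
  std::vector<std::string> fv_names = read_lines("names.txt");
  std::vector<std::string> fv_nouns = read_lines("nouns.txt");

  std::string sentence = "$name$ saw a $noun$.";
  if (!fv_names.empty()) sentence = replace_freevar(sentence, "$name$", fv_names);
  if (!fv_nouns.empty()) sentence = replace_freevar(sentence, "$noun$", fv_nouns);

  std::cout << sentence << std::endl;
}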