diff options
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- | kgramstats.cpp | 105 |
1 files changed, 102 insertions, 3 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 0ab0c99..5b571d6 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp | |||
@@ -39,6 +39,9 @@ | |||
39 | #include <algorithm> | 39 | #include <algorithm> |
40 | #include <set> | 40 | #include <set> |
41 | #include <stack> | 41 | #include <stack> |
42 | #include "freevars.h" | ||
43 | #include <fstream> | ||
44 | #include "prefix_search.h" | ||
42 | 45 | ||
43 | query wildcardQuery {querytype::sentence}; | 46 | query wildcardQuery {querytype::sentence}; |
44 | word blank_word {""}; | 47 | word blank_word {""}; |
@@ -53,14 +56,55 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
53 | size_t start = 0; | 56 | size_t start = 0; |
54 | int end = 0; | 57 | int end = 0; |
55 | std::set<std::string> thashtags; | 58 | std::set<std::string> thashtags; |
59 | freevar fv_emoticons {emoticons, "emoticons.txt"}; | ||
60 | |||
61 | std::cout << "Reading emojis..." << std::endl; | ||
62 | prefix_search emojis; | ||
63 | std::ifstream emoji_file("emojis.txt"); | ||
64 | if (emoji_file) | ||
65 | { | ||
66 | while (!emoji_file.eof()) | ||
67 | { | ||
68 | std::string rawmojis; | ||
69 | getline(emoji_file, rawmojis); | ||
70 | emojis.add(rawmojis); | ||
71 | } | ||
72 | |||
73 | emoji_file.close(); | ||
74 | } | ||
56 | 75 | ||
76 | std::cout << "Tokenizing corpus..." << std::endl; | ||
57 | while (end != std::string::npos) | 77 | while (end != std::string::npos) |
58 | { | 78 | { |
59 | end = corpus.find(" ", start); | 79 | end = corpus.find(" ", start); |
60 | 80 | ||
61 | std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 81 | bool emoji = false; |
62 | if (t.compare("") && t.compare(".")) | 82 | std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
83 | std::string t = ""; | ||
84 | |||
85 | if (te.compare("") && te.compare(".")) | ||
63 | { | 86 | { |
87 | // Extract strings of emojis into their own tokens even if they're not space delimited | ||
88 | int m = emojis.match(te); | ||
89 | emoji = m > 0; | ||
90 | if (m == 0) m = 1; | ||
91 | t = te.substr(0,m); | ||
92 | te = te.substr(m); | ||
93 | |||
94 | while (!te.empty()) | ||
95 | { | ||
96 | m = emojis.match(te); | ||
97 | if (emoji == (m > 0)) | ||
98 | { | ||
99 | if (m == 0) m = 1; | ||
100 | t += te.substr(0,m); | ||
101 | te = te.substr(m); | ||
102 | } else { | ||
103 | end = start + t.length() - 1; | ||
104 | break; | ||
105 | } | ||
106 | } | ||
107 | |||
64 | std::string tc(t), canonical; | 108 | std::string tc(t), canonical; |
65 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
66 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | 110 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { |
@@ -72,11 +116,29 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
72 | if (canonical[0] == '#') | 116 | if (canonical[0] == '#') |
73 | { | 117 | { |
74 | thashtags.insert(canonical); | 118 | thashtags.insert(canonical); |
75 | canonical = "#hashtag"; | ||
76 | 119 | ||
77 | return hashtags; | 120 | return hashtags; |
78 | } | 121 | } |
79 | 122 | ||
123 | // Emoticon freevar | ||
124 | if (emoji) | ||
125 | { | ||
126 | emoticons.forms.add(canonical); | ||
127 | |||
128 | return emoticons; | ||
129 | } | ||
130 | |||
131 | std::string emoticon_canon; | ||
132 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) { | ||
133 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
134 | }); | ||
135 | if (fv_emoticons.check(emoticon_canon)) | ||
136 | { | ||
137 | emoticons.forms.add(emoticon_canon); | ||
138 | |||
139 | return emoticons; | ||
140 | } | ||
141 | |||
80 | // Basically any other word | 142 | // Basically any other word |
81 | if (words.count(canonical) == 0) | 143 | if (words.count(canonical) == 0) |
82 | { | 144 | { |
@@ -171,6 +233,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
171 | } | 233 | } |
172 | 234 | ||
173 | // Time to condense the distribution stuff for the words | 235 | // Time to condense the distribution stuff for the words |
236 | std::cout << "Compiling token histograms..." << std::endl; | ||
174 | for (auto& it : words) | 237 | for (auto& it : words) |
175 | { | 238 | { |
176 | it.second.forms.compile(); | 239 | it.second.forms.compile(); |
@@ -185,8 +248,13 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
185 | 248 | ||
186 | hashtags.forms.compile(); | 249 | hashtags.forms.compile(); |
187 | hashtags.terms.compile(); | 250 | hashtags.terms.compile(); |
251 | |||
252 | // Compile other freevars | ||
253 | emoticons.forms.compile(); | ||
254 | emoticons.terms.compile(); | ||
188 | 255 | ||
189 | // kgram distribution | 256 | // kgram distribution |
257 | std::cout << "Creating markov chain..." << std::endl; | ||
190 | std::map<kgram, std::map<token, token_data> > tstats; | 258 | std::map<kgram, std::map<token, token_data> > tstats; |
191 | for (int k=1; k<maxK; k++) | 259 | for (int k=1; k<maxK; k++) |
192 | { | 260 | { |
@@ -246,6 +314,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
246 | } | 314 | } |
247 | 315 | ||
248 | // Condense the kgram distribution | 316 | // Condense the kgram distribution |
317 | std::cout << "Compiling kgram distributions..." << std::endl; | ||
249 | for (auto& it : tstats) | 318 | for (auto& it : tstats) |
250 | { | 319 | { |
251 | kgram klist = it.first; | 320 | kgram klist = it.first; |
@@ -454,6 +523,36 @@ std::string kgramstats::randomSentence(int n) | |||
454 | 523 | ||
455 | open_delimiters.pop(); | 524 | open_delimiters.pop(); |
456 | } | 525 | } |
526 | |||
527 | // Replace old-style freevars while I can't be bothered to remake the corpus yet | ||
528 | std::vector<std::string> fv_names; | ||
529 | std::ifstream namefile("names.txt"); | ||
530 | while (!namefile.eof()) | ||
531 | { | ||
532 | std::string l; | ||
533 | getline(namefile, l); | ||
534 | fv_names.push_back(l); | ||
535 | } | ||
536 | |||
537 | int cpos; | ||
538 | while ((cpos = result.find("$name$")) != std::string::npos) | ||
539 | { | ||
540 | result.replace(cpos, 6, fv_names[rand() % fv_names.size()]); | ||
541 | } | ||
542 | |||
543 | std::vector<std::string> fv_nouns; | ||
544 | std::ifstream nounfile("nouns.txt"); | ||
545 | while (!nounfile.eof()) | ||
546 | { | ||
547 | std::string l; | ||
548 | getline(nounfile, l); | ||
549 | fv_nouns.push_back(l); | ||
550 | } | ||
551 | |||
552 | while ((cpos = result.find("$noun$")) != std::string::npos) | ||
553 | { | ||
554 | result.replace(cpos, 6, fv_nouns[rand() % fv_nouns.size()]); | ||
555 | } | ||
457 | 556 | ||
458 | return result; | 557 | return result; |
459 | } | 558 | } |