diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-29 12:43:00 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-29 12:43:00 -0500 |
commit | b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch) | |
tree | f21bd883ef7c4255a91d096ea105feaad135ee52 | |
parent | fd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff) | |
download | rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2 rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip |
Rewrote how tokens are handled
A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags. Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace them.
-rw-r--r-- | CMakeLists.txt | 8 | ||||
-rw-r--r-- | ebooks.cpp | 15 | ||||
-rw-r--r-- | freevars.cpp | 4 | ||||
-rw-r--r-- | gen.cpp | 15 | ||||
-rw-r--r-- | histogram.cpp | 34 | ||||
-rw-r--r-- | histogram.h | 19 | ||||
-rw-r--r-- | kgramstats.cpp | 453 | ||||
-rw-r--r-- | kgramstats.h | 124 |
8 files changed, 406 insertions, 266 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index aa63a34..41c4552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt | |||
@@ -8,10 +8,14 @@ find_package(curl) | |||
8 | if (YamlCpp_FOUND AND CURL_FOUND) | 8 | if (YamlCpp_FOUND AND CURL_FOUND) |
9 | add_subdirectory(vendor/twitcurl/libtwitcurl) | 9 | add_subdirectory(vendor/twitcurl/libtwitcurl) |
10 | include_directories(vendor/twitcurl/libtwitcurl) | 10 | include_directories(vendor/twitcurl/libtwitcurl) |
11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp) | 11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp) |
12 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) | ||
13 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) | ||
12 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) | 14 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) |
13 | else (YamlCpp_FOUND AND CURL_FOUND) | 15 | else (YamlCpp_FOUND AND CURL_FOUND) |
14 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") | 16 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") |
15 | endif (YamlCpp_FOUND AND CURL_FOUND) | 17 | endif (YamlCpp_FOUND AND CURL_FOUND) |
16 | 18 | ||
17 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp) | 19 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp) |
20 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) | ||
21 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) | ||
diff --git a/ebooks.cpp b/ebooks.cpp index e38ebab..ed1e080 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
@@ -44,20 +44,9 @@ int main(int argc, char** args) | |||
44 | std::cout << "Generating..." << std::endl; | 44 | std::cout << "Generating..." << std::endl; |
45 | for (;;) | 45 | for (;;) |
46 | { | 46 | { |
47 | std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); | 47 | std::string doc = stats->randomSentence(rand() % 45 + 5); |
48 | std::string hi; | 48 | std::string hi = vars->parse(doc); |
49 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | ||
50 | { | ||
51 | hi += vars->parse(*it) + " "; | ||
52 | } | ||
53 | |||
54 | hi.resize(140); | 49 | hi.resize(140); |
55 | |||
56 | size_t lastperiod = hi.find_last_of(".!?,"); | ||
57 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
58 | { | ||
59 | hi = hi.substr(0, lastperiod+1); | ||
60 | } | ||
61 | 50 | ||
62 | std::string replyMsg; | 51 | std::string replyMsg; |
63 | if (twitter.statusUpdate(hi)) | 52 | if (twitter.statusUpdate(hi)) |
diff --git a/freevars.cpp b/freevars.cpp index 8c3eda4..54c5aab 100644 --- a/freevars.cpp +++ b/freevars.cpp | |||
@@ -34,8 +34,8 @@ std::string freevars::parse(std::string in) | |||
34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) | 34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) |
35 | { | 35 | { |
36 | std::string tofind = "$" + it->first + "$"; | 36 | std::string tofind = "$" + it->first + "$"; |
37 | size_t fpos = res.find(tofind); | 37 | size_t fpos; |
38 | if (fpos != std::string::npos) | 38 | while ((fpos = res.find(tofind)) != std::string::npos) |
39 | { | 39 | { |
40 | int r = rand() % it->second->size(); | 40 | int r = rand() % it->second->size(); |
41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); | 41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); |
diff --git a/gen.cpp b/gen.cpp index 400c0a5..a0ef8e3 100644 --- a/gen.cpp +++ b/gen.cpp | |||
@@ -52,21 +52,10 @@ int main(int argc, char** args) | |||
52 | std::cout << "Generating..." << std::endl; | 52 | std::cout << "Generating..." << std::endl; |
53 | for (;;) | 53 | for (;;) |
54 | { | 54 | { |
55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); | 55 | std::string doc = stats->randomSentence(rand() % 35 + 15); |
56 | std::string hi; | 56 | std::string hi = vars->parse(doc); |
57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | ||
58 | { | ||
59 | hi += vars->parse(*it) + " "; | ||
60 | } | ||
61 | |||
62 | hi.resize(140); | 57 | hi.resize(140); |
63 | 58 | ||
64 | size_t lastperiod = hi.find_last_of(".!?,"); | ||
65 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
66 | { | ||
67 | hi = hi.substr(0, lastperiod+1); | ||
68 | } | ||
69 | |||
70 | std::cout << hi << std::endl; | 59 | std::cout << hi << std::endl; |
71 | 60 | ||
72 | getc(stdin); | 61 | getc(stdin); |
diff --git a/histogram.cpp b/histogram.cpp new file mode 100644 index 0000000..6896146 --- /dev/null +++ b/histogram.cpp | |||
@@ -0,0 +1,34 @@ | |||
1 | #include "histogram.h" | ||
2 | #include <cstdlib> | ||
3 | |||
4 | template <class T> | ||
5 | void histogram<T>::add(const T& inst) | ||
6 | { | ||
7 | freqtable[inst]++; | ||
8 | } | ||
9 | |||
10 | template <class T> | ||
11 | void histogram<T>::compile() | ||
12 | { | ||
13 | distribution.clear(); | ||
14 | |||
15 | int max = 0; | ||
16 | for (auto& it : freqtable) | ||
17 | { | ||
18 | max += it.second; | ||
19 | distribution.emplace(max, it.first); | ||
20 | } | ||
21 | |||
22 | freqtable.clear(); | ||
23 | } | ||
24 | |||
25 | template <class T> | ||
26 | const T& histogram<T>::next() const | ||
27 | { | ||
28 | int max = distribution.rbegin()->first; | ||
29 | int r = rand() % max; | ||
30 | |||
31 | return distribution.upper_bound(r)->second; | ||
32 | } | ||
33 | |||
34 | template class histogram <std::string>; | ||
diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000..5aa2560 --- /dev/null +++ b/histogram.h | |||
@@ -0,0 +1,19 @@ | |||
1 | #ifndef HISTOGRAM_H_24094D97 | ||
2 | #define HISTOGRAM_H_24094D97 | ||
3 | |||
4 | #include <map> | ||
5 | #include <string> | ||
6 | |||
7 | template <class T> | ||
8 | class histogram { | ||
9 | public: | ||
10 | void add(const T& inst); | ||
11 | void compile(); | ||
12 | const T& next() const; | ||
13 | |||
14 | private: | ||
15 | std::map<T, int> freqtable; | ||
16 | std::map<int, T> distribution; | ||
17 | }; | ||
18 | |||
19 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ | ||
diff --git a/kgramstats.cpp b/kgramstats.cpp index 4bb7f15..0ab0c99 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -37,35 +37,11 @@ | |||
37 | #include <iostream> | 37 | #include <iostream> |
38 | #include <cstdlib> | 38 | #include <cstdlib> |
39 | #include <algorithm> | 39 | #include <algorithm> |
40 | #include "malaprop.h" | 40 | #include <set> |
41 | #include <stack> | ||
41 | 42 | ||
42 | query wildcardQuery(querytype_sentence); | 43 | query wildcardQuery {querytype::sentence}; |
43 | 44 | word blank_word {""}; | |
44 | std::string canonize(std::string f); | ||
45 | |||
46 | token token_from_string(std::string in) | ||
47 | { | ||
48 | if (in[0] == '#') | ||
49 | { | ||
50 | token word(tokentype_hashtag); | ||
51 | |||
52 | if (in.find_first_of(".?!,") != std::string::npos) | ||
53 | { | ||
54 | word.terminating = true; | ||
55 | } | ||
56 | |||
57 | return word; | ||
58 | } else { | ||
59 | token word(canonize(in)); | ||
60 | |||
61 | if (in.find_first_of(".?!,") != std::string::npos) | ||
62 | { | ||
63 | word.terminating = true; | ||
64 | } | ||
65 | |||
66 | return word; | ||
67 | } | ||
68 | } | ||
69 | 45 | ||
70 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 46 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
71 | // We consider maxK to be fairly constant | 47 | // We consider maxK to be fairly constant |
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
73 | { | 49 | { |
74 | this->maxK = maxK; | 50 | this->maxK = maxK; |
75 | 51 | ||
76 | std::vector<std::string> tokens; | 52 | std::vector<token> tokens; |
77 | size_t start = 0; | 53 | size_t start = 0; |
78 | int end = 0; | 54 | int end = 0; |
79 | std::set<std::string> thashtags; | 55 | std::set<std::string> thashtags; |
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
82 | { | 58 | { |
83 | end = corpus.find(" ", start); | 59 | end = corpus.find(" ", start); |
84 | 60 | ||
85 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 61 | std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
86 | if (token[token.length()-1] == '\n') | 62 | if (t.compare("") && t.compare(".")) |
87 | { | 63 | { |
88 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) | 64 | std::string tc(t), canonical; |
65 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | ||
66 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | ||
67 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
68 | }); | ||
69 | |||
70 | word& w = ([&] () -> word& { | ||
71 | // Hashtag freevar | ||
72 | if (canonical[0] == '#') | ||
73 | { | ||
74 | thashtags.insert(canonical); | ||
75 | canonical = "#hashtag"; | ||
76 | |||
77 | return hashtags; | ||
78 | } | ||
79 | |||
80 | // Basically any other word | ||
81 | if (words.count(canonical) == 0) | ||
82 | { | ||
83 | words.emplace(canonical, canonical); | ||
84 | } | ||
85 | |||
86 | word& tw = words.at(canonical); | ||
87 | tw.forms.add(canonical); | ||
88 | |||
89 | return tw; | ||
90 | })(); | ||
91 | |||
92 | token tk(w); | ||
93 | tk.raw = t; | ||
94 | |||
95 | for (char c : t) | ||
89 | { | 96 | { |
90 | token.insert(token.length()-1, "."); | 97 | if (c == '*') |
98 | { | ||
99 | tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; | ||
100 | } else if (c == '[') | ||
101 | { | ||
102 | tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; | ||
103 | } else if (c == '(') | ||
104 | { | ||
105 | tk.delimiters[{parentype::paren, doublestatus::opening}]++; | ||
106 | } else if (c == '"') | ||
107 | { | ||
108 | tk.delimiters[{parentype::quote, doublestatus::opening}]++; | ||
109 | } else { | ||
110 | break; | ||
111 | } | ||
91 | } | 112 | } |
92 | |||
93 | token.resize(token.length()-1); | ||
94 | } | ||
95 | |||
96 | if (token.compare("") && token.compare(".")) | ||
97 | { | ||
98 | mstats.addWord(token); | ||
99 | tokens.push_back(token); | ||
100 | 113 | ||
101 | if (token[0] == '#') | 114 | int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; |
115 | if (backtrack != t.length()) | ||
102 | { | 116 | { |
103 | thashtags.insert(canonize(token)); | 117 | std::string ending = t.substr(backtrack); |
118 | std::string suffix; | ||
119 | |||
120 | for (char c : ending) | ||
121 | { | ||
122 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) | ||
123 | { | ||
124 | suffix += c; | ||
125 | |||
126 | continue; | ||
127 | } else if (c == '\n') | ||
128 | { | ||
129 | // At least the end is coming | ||
130 | if (suffix.empty()) | ||
131 | { | ||
132 | suffix = "."; | ||
133 | } | ||
134 | |||
135 | break; | ||
136 | } | ||
137 | |||
138 | parentype pt = ([&] { | ||
139 | switch (c) | ||
140 | { | ||
141 | case ']': return parentype::square_bracket; | ||
142 | case ')': return parentype::paren; | ||
143 | case '*': return parentype::asterisk; | ||
144 | case '"': return parentype::quote; | ||
145 | } | ||
146 | })(); | ||
147 | |||
148 | if (tk.delimiters[{pt, doublestatus::opening}] > 0) | ||
149 | { | ||
150 | tk.delimiters[{pt, doublestatus::opening}]--; | ||
151 | tk.delimiters[{pt, doublestatus::both}]++; | ||
152 | } else { | ||
153 | tk.delimiters[{pt, doublestatus::closing}]++; | ||
154 | } | ||
155 | } | ||
156 | |||
157 | if (suffix == ",") | ||
158 | { | ||
159 | tk.suffix = suffixtype::comma; | ||
160 | } else if (!suffix.empty()) { | ||
161 | tk.suffix = suffixtype::terminating; | ||
162 | |||
163 | w.terms.add(suffix); | ||
164 | } | ||
104 | } | 165 | } |
166 | |||
167 | tokens.push_back(tk); | ||
105 | } | 168 | } |
106 | 169 | ||
107 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 170 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
108 | } | 171 | } |
109 | 172 | ||
110 | for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) | 173 | // Time to condense the distribution stuff for the words |
174 | for (auto& it : words) | ||
111 | { | 175 | { |
112 | hashtags.push_back(*it); | 176 | it.second.forms.compile(); |
177 | it.second.terms.compile(); | ||
113 | } | 178 | } |
114 | 179 | ||
180 | // Hashtag freevar is not frequency distributed | ||
181 | for (auto& it : thashtags) | ||
182 | { | ||
183 | hashtags.forms.add(it); | ||
184 | } | ||
185 | |||
186 | hashtags.forms.compile(); | ||
187 | hashtags.terms.compile(); | ||
188 | |||
189 | // kgram distribution | ||
115 | std::map<kgram, std::map<token, token_data> > tstats; | 190 | std::map<kgram, std::map<token, token_data> > tstats; |
116 | std::map<token, std::map<termstats, int> > tendings; | ||
117 | for (int k=1; k<maxK; k++) | 191 | for (int k=1; k<maxK; k++) |
118 | { | 192 | { |
119 | for (int i=0; i<(tokens.size() - k); i++) | 193 | for (int i=0; i<(tokens.size() - k); i++) |
120 | { | 194 | { |
121 | std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); | 195 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); |
122 | kgram prefix; | 196 | token f = tokens[i+k]; |
123 | 197 | ||
124 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) | 198 | if (tstats[prefix].count(f) == 0) |
125 | { | ||
126 | prefix.push_back(token_from_string(*it)); | ||
127 | } | ||
128 | |||
129 | std::string f = tokens[i+k]; | ||
130 | std::string canonical = canonize(f); | ||
131 | |||
132 | token word(token_from_string(canonical)); | ||
133 | if (f.find_first_of(".?!,") != std::string::npos) | ||
134 | { | 199 | { |
135 | word.terminating = true; | 200 | tstats[prefix].emplace(f, f); |
136 | |||
137 | char terminator = f[f.find_last_of(".?!,")]; | ||
138 | int occurrences = std::count(f.begin(), f.end(), terminator); | ||
139 | |||
140 | tendings[word][termstats(terminator, occurrences)]++; | ||
141 | } | 201 | } |
142 | 202 | ||
143 | token_data& td = tstats[prefix][word]; | 203 | token_data& td = tstats[prefix].at(f); |
144 | td.word = word; | ||
145 | td.all++; | 204 | td.all++; |
146 | 205 | ||
147 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 206 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
148 | { | 207 | { |
149 | td.uppercase++; | 208 | td.uppercase++; |
150 | } else if (isupper(f[0])) | 209 | } else if (isupper(f.raw[0])) |
151 | { | 210 | { |
152 | td.titlecase++; | 211 | td.titlecase++; |
153 | } | 212 | } |
154 | 213 | ||
155 | if (prefix.front().word.terminating) | 214 | kgram term_prefix; |
215 | bool changed = false; | ||
216 | std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) { | ||
217 | if (q.tok.suffix == suffixtype::terminating) | ||
218 | { | ||
219 | changed = true; | ||
220 | |||
221 | return wildcardQuery; | ||
222 | } else { | ||
223 | return q; | ||
224 | } | ||
225 | }); | ||
226 | |||
227 | if (changed) | ||
156 | { | 228 | { |
157 | prefix.front() = wildcardQuery; | 229 | if (tstats[term_prefix].count(f) == 0) |
230 | { | ||
231 | tstats[term_prefix].emplace(f, f); | ||
232 | } | ||
158 | 233 | ||
159 | token_data& td2 = tstats[prefix][word]; | 234 | token_data& td2 = tstats[term_prefix].at(f); |
160 | td2.word = word; | ||
161 | td2.all++; | 235 | td2.all++; |
162 | 236 | ||
163 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 237 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
164 | { | 238 | { |
165 | td2.uppercase++; | 239 | td2.uppercase++; |
166 | } else if (isupper(f[0])) | 240 | } else if (isupper(f.raw[0])) |
167 | { | 241 | { |
168 | td2.titlecase++; | 242 | td2.titlecase++; |
169 | } | 243 | } |
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
171 | } | 245 | } |
172 | } | 246 | } |
173 | 247 | ||
174 | for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) | 248 | // Condense the kgram distribution |
249 | for (auto& it : tstats) | ||
175 | { | 250 | { |
176 | kgram klist = it->first; | 251 | kgram klist = it.first; |
177 | std::map<token, token_data>& probtable = it->second; | 252 | auto& probtable = it.second; |
178 | std::map<int, token_data>& distribution = stats[klist]; | 253 | auto& distribution = stats[klist]; |
179 | int max = 0; | 254 | int max = 0; |
180 | 255 | ||
181 | for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | 256 | for (auto& kt : probtable) |
182 | { | 257 | { |
183 | max += kt->second.all; | 258 | max += kt.second.all; |
184 | 259 | ||
185 | distribution[max] = kt->second; | 260 | distribution.emplace(max, kt.second); |
186 | } | ||
187 | } | ||
188 | |||
189 | for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++) | ||
190 | { | ||
191 | token word = it->first; | ||
192 | std::map<termstats, int>& probtable = it->second; | ||
193 | std::map<int, termstats>& distribution = endings[word]; | ||
194 | int max = 0; | ||
195 | |||
196 | for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | ||
197 | { | ||
198 | max += kt->second; | ||
199 | |||
200 | distribution[max] = kt->first; | ||
201 | } | 261 | } |
202 | } | 262 | } |
203 | } | 263 | } |
204 | 264 | ||
205 | void printKgram(kgram k) | 265 | void printKgram(kgram k) |
206 | { | 266 | { |
207 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 267 | for (auto& q : k) |
208 | { | 268 | { |
209 | query& q = *it; | 269 | if (q.type == querytype::sentence) |
210 | if (q.type == querytype_sentence) | ||
211 | { | 270 | { |
212 | std::cout << "#.# "; | 271 | std::cout << "#.# "; |
213 | } else if (q.type == querytype_literal) | 272 | } else if (q.type == querytype::literal) |
214 | { | 273 | { |
215 | if (q.word.type == tokentype_hashtag) | 274 | if (q.tok.suffix == suffixtype::terminating) |
216 | { | 275 | { |
217 | if (q.word.terminating) | 276 | std::cout << q.tok.w.canon << ". "; |
218 | { | 277 | } else if (q.tok.suffix == suffixtype::comma) |
219 | std::cout << "#hashtag. "; | ||
220 | } else { | ||
221 | std::cout << "#hashtag "; | ||
222 | } | ||
223 | } else if (q.word.type == tokentype_literal) | ||
224 | { | 278 | { |
225 | if (q.word.terminating) | 279 | std::cout << q.tok.w.canon << ", "; |
226 | { | 280 | } else { |
227 | std::cout << q.word.canon << ". "; | 281 | std::cout << q.tok.w.canon << " "; |
228 | } else { | ||
229 | std::cout << q.word.canon << " "; | ||
230 | } | ||
231 | } | 282 | } |
232 | } | 283 | } |
233 | } | 284 | } |
234 | } | 285 | } |
235 | 286 | ||
236 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 287 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
237 | std::vector<std::string> kgramstats::randomSentence(int n) | 288 | std::string kgramstats::randomSentence(int n) |
238 | { | 289 | { |
239 | std::vector<std::string> result; | 290 | std::string result; |
240 | kgram cur(1, wildcardQuery); | 291 | kgram cur(1, wildcardQuery); |
241 | int cuts = 0; | 292 | int cuts = 0; |
293 | std::stack<parentype> open_delimiters; | ||
242 | 294 | ||
243 | for (int i=0; i<n; i++) | 295 | for (int i=0; i<n; i++) |
244 | { | 296 | { |
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
273 | cur = kgram(1, wildcardQuery); | 325 | cur = kgram(1, wildcardQuery); |
274 | } | 326 | } |
275 | 327 | ||
276 | std::map<int, token_data>& distribution = stats[cur]; | 328 | auto& distribution = stats[cur]; |
277 | int max = distribution.rbegin()->first; | 329 | int max = distribution.rbegin()->first; |
278 | int r = rand() % max; | 330 | int r = rand() % max; |
279 | token_data& next = distribution.upper_bound(r)->second; | 331 | token_data& next = distribution.upper_bound(r)->second; |
280 | std::string nextToken; | 332 | std::string nextToken = next.tok.w.forms.next(); |
281 | bool mess = false; | 333 | |
282 | 334 | // Determine the casing of the next token. We randomly make the token all | |
283 | if (next.word.type == tokentype_literal) | 335 | // caps based on the markov chain. Otherwise, we check if the previous |
336 | // token is the end of a sentence (terminating token or a wildcard query). | ||
337 | int casing = rand() % next.all; | ||
338 | if (casing < next.uppercase) | ||
339 | { | ||
340 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
341 | } else if ((((cur.rbegin()->type == querytype::sentence) | ||
342 | || ((cur.rbegin()->type == querytype::literal) | ||
343 | && (cur.rbegin()->tok.suffix == suffixtype::terminating))) | ||
344 | && (rand() % 2 > 0)) | ||
345 | || (casing - next.uppercase < next.titlecase)) | ||
284 | { | 346 | { |
285 | nextToken = next.word.canon; | 347 | nextToken[0] = toupper(nextToken[0]); |
348 | } | ||
286 | 349 | ||
287 | mess = (rand() % 100) == 0; | 350 | // Delimiters |
288 | if (mess) | 351 | for (auto& dt : next.tok.delimiters) |
352 | { | ||
353 | if (dt.first.status == doublestatus::both) | ||
289 | { | 354 | { |
290 | nextToken = mstats.alternate(nextToken); | 355 | switch (dt.first.type) |
291 | } | 356 | { |
292 | 357 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break; | |
293 | // Determine the casing of the next token. We randomly make the token all | 358 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break; |
294 | // caps based on the markov chain. Otherwise, we check if the previous | 359 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break; |
295 | // token is the end of a sentence (terminating token or a wildcard query). | 360 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break; |
296 | int casing = rand() % next.all; | 361 | } |
297 | if (casing < next.uppercase) | 362 | } else if (dt.first.status == doublestatus::opening) |
298 | { | 363 | { |
299 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 364 | for (int i=0; i<dt.second; i++) |
300 | } else if ((((cur.rbegin()->type == querytype_sentence) | 365 | { |
301 | || ((cur.rbegin()->type == querytype_literal) | 366 | open_delimiters.push(dt.first.type); |
302 | && (cur.rbegin()->word.terminating))) | 367 | } |
303 | && (rand() % 2 > 0)) | 368 | |
304 | || (casing - next.uppercase < next.titlecase)) | 369 | switch (dt.first.type) |
370 | { | ||
371 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break; | ||
372 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break; | ||
373 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break; | ||
374 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break; | ||
375 | } | ||
376 | } else if (dt.first.status == doublestatus::closing) | ||
305 | { | 377 | { |
306 | nextToken[0] = toupper(nextToken[0]); | 378 | for (int i=0; i<dt.second; i++) |
379 | { | ||
380 | while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type)) | ||
381 | { | ||
382 | switch (open_delimiters.top()) | ||
383 | { | ||
384 | case parentype::paren: nextToken.append(")"); break; | ||
385 | case parentype::square_bracket: nextToken.append("]"); break; | ||
386 | case parentype::asterisk: nextToken.append("*"); break; | ||
387 | case parentype::quote: nextToken.append("\""); break; | ||
388 | } | ||
389 | |||
390 | open_delimiters.pop(); | ||
391 | } | ||
392 | |||
393 | if (open_delimiters.empty()) | ||
394 | { | ||
395 | switch (dt.first.type) | ||
396 | { | ||
397 | case parentype::paren: result = "(" + result; break; | ||
398 | case parentype::square_bracket: result = "[" + result; break; | ||
399 | case parentype::asterisk: result = "*" + result; break; | ||
400 | case parentype::quote: result = "\"" + result; break; | ||
401 | } | ||
402 | } | ||
403 | |||
404 | switch (dt.first.type) | ||
405 | { | ||
406 | case parentype::paren: nextToken.append(")"); break; | ||
407 | case parentype::square_bracket: nextToken.append("]"); break; | ||
408 | case parentype::asterisk: nextToken.append("*"); break; | ||
409 | case parentype::quote: nextToken.append("\""); break; | ||
410 | } | ||
411 | } | ||
307 | } | 412 | } |
308 | } else if (next.word.type == tokentype_hashtag) | ||
309 | { | ||
310 | int rhash = rand() % hashtags.size(); | ||
311 | nextToken = hashtags[rhash]; | ||
312 | } | 413 | } |
313 | 414 | ||
314 | if (next.word.terminating) | 415 | // Terminators |
416 | if (next.tok.suffix == suffixtype::terminating) | ||
315 | { | 417 | { |
316 | std::map<int, termstats>& ending = endings[next.word]; | 418 | nextToken.append(next.tok.w.terms.next()); |
317 | int emax = ending.rbegin()->first; | 419 | } else if (next.tok.suffix == suffixtype::comma) |
318 | int er = rand() % emax; | 420 | { |
319 | termstats& nextend = ending.upper_bound(er)->second; | 421 | nextToken.append(","); |
320 | |||
321 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); | ||
322 | } | 422 | } |
323 | 423 | ||
324 | /* DEBUG */ | 424 | /* DEBUG */ |
325 | printKgram(cur); | 425 | printKgram(cur); |
426 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | ||
427 | |||
428 | cur.push_back(next.tok); | ||
326 | 429 | ||
327 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; | 430 | result.append(nextToken + " "); |
328 | 431 | ||
329 | if (mess) | 432 | if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0)) |
330 | { | 433 | { |
331 | std::cout << " mala " << next.word.canon; | 434 | break; |
332 | } | 435 | } |
333 | |||
334 | std::cout << std::endl; | ||
335 | |||
336 | cur.push_back(next.word); | ||
337 | |||
338 | result.push_back(nextToken); | ||
339 | } | 436 | } |
340 | |||
341 | return result; | ||
342 | } | ||
343 | |||
344 | bool removeIf(char c) | ||
345 | { | ||
346 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/); | ||
347 | } | ||
348 | |||
349 | std::string canonize(std::string f) | ||
350 | { | ||
351 | std::string canonical(f); | ||
352 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | ||
353 | 437 | ||
354 | std::string result; | 438 | // Remove the trailing space |
355 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 439 | if (result.back() == ' ') |
440 | { | ||
441 | result.pop_back(); | ||
442 | } | ||
443 | |||
444 | // Close any open delimiters | ||
445 | while (!open_delimiters.empty()) | ||
446 | { | ||
447 | switch (open_delimiters.top()) | ||
448 | { | ||
449 | case parentype::paren: result.append(")"); break; | ||
450 | case parentype::square_bracket: result.append("]"); break; | ||
451 | case parentype::asterisk: result.append("*"); break; | ||
452 | case parentype::quote: result.append("\""); break; | ||
453 | } | ||
454 | |||
455 | open_delimiters.pop(); | ||
456 | } | ||
356 | 457 | ||
357 | return result; | 458 | return result; |
358 | } | 459 | } |
diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -2,61 +2,89 @@ | |||
2 | #include <map> | 2 | #include <map> |
3 | #include <list> | 3 | #include <list> |
4 | #include <vector> | 4 | #include <vector> |
5 | #include "malaprop.h" | 5 | #include "histogram.h" |
6 | 6 | ||
7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
9 | 9 | ||
10 | enum tokentype { | 10 | struct word { |
11 | tokentype_literal, | 11 | std::string canon; |
12 | tokentype_hashtag | 12 | histogram<std::string> forms; |
13 | histogram<std::string> terms; | ||
14 | |||
15 | word(std::string canon) : canon(canon) {} | ||
16 | |||
17 | bool operator<(const word& other) const | ||
18 | { | ||
19 | return canon < other.canon; | ||
20 | } | ||
13 | }; | 21 | }; |
14 | 22 | ||
15 | struct token { | 23 | extern word blank_word; |
16 | tokentype type; | 24 | |
17 | std::string canon; | 25 | enum class suffixtype { |
18 | bool terminating; | 26 | none, |
27 | terminating, | ||
28 | comma | ||
29 | }; | ||
30 | |||
31 | enum class parentype { | ||
32 | paren, | ||
33 | square_bracket, | ||
34 | asterisk, | ||
35 | quote | ||
36 | }; | ||
37 | |||
38 | enum class doublestatus { | ||
39 | opening, | ||
40 | closing, | ||
41 | both | ||
42 | }; | ||
43 | |||
44 | struct delimiter { | ||
45 | parentype type; | ||
46 | doublestatus status; | ||
47 | |||
48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | ||
19 | 49 | ||
20 | token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} | 50 | bool operator<(const delimiter& other) const |
21 | token(tokentype type) : type(type), canon(""), terminating(false) {} | 51 | { |
52 | return std::tie(type, status) < std::tie(other.type, other.status); | ||
53 | } | ||
54 | }; | ||
55 | |||
56 | struct token { | ||
57 | const word& w; | ||
58 | std::map<delimiter, int> delimiters; | ||
59 | suffixtype suffix; | ||
60 | std::string raw; | ||
61 | |||
62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | ||
22 | 63 | ||
23 | bool operator<(const token& other) const | 64 | bool operator<(const token& other) const |
24 | { | 65 | { |
25 | if (type != other.type) | 66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
26 | { | ||
27 | return type < other.type; | ||
28 | } else if (type == tokentype_literal) | ||
29 | { | ||
30 | if (canon == other.canon) | ||
31 | { | ||
32 | return !terminating && other.terminating; | ||
33 | } else { | ||
34 | return canon < other.canon; | ||
35 | } | ||
36 | } else { | ||
37 | return !terminating && other.terminating; | ||
38 | } | ||
39 | } | 67 | } |
40 | }; | 68 | }; |
41 | 69 | ||
42 | enum querytype { | 70 | enum class querytype { |
43 | querytype_literal, | 71 | literal, |
44 | querytype_sentence | 72 | sentence |
45 | }; | 73 | }; |
46 | 74 | ||
47 | struct query { | 75 | struct query { |
48 | querytype type; | 76 | querytype type; |
49 | token word; | 77 | token tok; |
50 | 78 | ||
51 | query(token word) : word(word), type(querytype_literal) {} | 79 | query(token tok) : tok(tok), type(querytype::literal) {} |
52 | 80 | ||
53 | query(querytype type) : word(""), type(type) {} | 81 | query(querytype type) : tok(blank_word), type(type) {} |
54 | 82 | ||
55 | bool operator<(const query& other) const | 83 | bool operator<(const query& other) const |
56 | { | 84 | { |
57 | if (type == other.type) | 85 | if (type == other.type) |
58 | { | 86 | { |
59 | return word < other.word; | 87 | return tok < other.tok; |
60 | } else { | 88 | } else { |
61 | return type < other.type; | 89 | return type < other.type; |
62 | } | 90 | } |
@@ -65,34 +93,11 @@ struct query { | |||
65 | 93 | ||
66 | typedef std::list<query> kgram; | 94 | typedef std::list<query> kgram; |
67 | 95 | ||
68 | struct termstats { | ||
69 | char terminator; | ||
70 | int occurrences; | ||
71 | |||
72 | termstats() : terminator('.'), occurrences(1) {} | ||
73 | |||
74 | termstats(char terminator, int occurrences) | ||
75 | { | ||
76 | this->terminator = terminator; | ||
77 | this->occurrences = occurrences; | ||
78 | } | ||
79 | |||
80 | bool operator<(const termstats& other) const | ||
81 | { | ||
82 | if (terminator == other.terminator) | ||
83 | { | ||
84 | return occurrences < other.occurrences; | ||
85 | } else { | ||
86 | return terminator < other.terminator; | ||
87 | } | ||
88 | } | ||
89 | }; | ||
90 | |||
91 | class kgramstats | 96 | class kgramstats |
92 | { | 97 | { |
93 | public: | 98 | public: |
94 | kgramstats(std::string corpus, int maxK); | 99 | kgramstats(std::string corpus, int maxK); |
95 | std::vector<std::string> randomSentence(int n); | 100 | std::string randomSentence(int n); |
96 | 101 | ||
97 | private: | 102 | private: |
98 | struct token_data | 103 | struct token_data |
@@ -100,16 +105,15 @@ private: | |||
100 | int all; | 105 | int all; |
101 | int titlecase; | 106 | int titlecase; |
102 | int uppercase; | 107 | int uppercase; |
103 | token word; | 108 | token tok; |
104 | 109 | ||
105 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} | 110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
106 | }; | 111 | }; |
107 | 112 | ||
108 | int maxK; | 113 | int maxK; |
109 | std::map<kgram, std::map<int, token_data> > stats; | 114 | std::map<kgram, std::map<int, token_data> > stats; |
110 | malaprop mstats; | 115 | word hashtags {"#hashtag"}; |
111 | std::map<token, std::map<int, termstats> > endings; | 116 | std::map<std::string, word> words; |
112 | std::vector<std::string> hashtags; | ||
113 | }; | 117 | }; |
114 | 118 | ||
115 | void printKgram(kgram k); | 119 | void printKgram(kgram k); |