about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorKelly Rauchenberger <fefferburbia@gmail.com>2016-01-29 12:43:00 -0500
committerKelly Rauchenberger <fefferburbia@gmail.com>2016-01-29 12:43:00 -0500
commitb316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch)
treef21bd883ef7c4255a91d096ea105feaad135ee52
parentfd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff)
downloadrawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip
Rewrote how tokens are handled
A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use currently is hashtags.

Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace them.
-rw-r--r--CMakeLists.txt8
-rw-r--r--ebooks.cpp15
-rw-r--r--freevars.cpp4
-rw-r--r--gen.cpp15
-rw-r--r--histogram.cpp34
-rw-r--r--histogram.h19
-rw-r--r--kgramstats.cpp453
-rw-r--r--kgramstats.h124
8 files changed, 406 insertions, 266 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index aa63a34..41c4552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -8,10 +8,14 @@ find_package(curl)
8if (YamlCpp_FOUND AND CURL_FOUND) 8if (YamlCpp_FOUND AND CURL_FOUND)
9 add_subdirectory(vendor/twitcurl/libtwitcurl) 9 add_subdirectory(vendor/twitcurl/libtwitcurl)
10 include_directories(vendor/twitcurl/libtwitcurl) 10 include_directories(vendor/twitcurl/libtwitcurl)
11 add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp) 11 add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp)
12 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11)
13 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON)
12 target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) 14 target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES})
13else (YamlCpp_FOUND AND CURL_FOUND) 15else (YamlCpp_FOUND AND CURL_FOUND)
14 message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") 16 message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen")
15endif (YamlCpp_FOUND AND CURL_FOUND) 17endif (YamlCpp_FOUND AND CURL_FOUND)
16 18
17add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp) 19add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp)
20set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11)
21set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON)
diff --git a/ebooks.cpp b/ebooks.cpp index e38ebab..ed1e080 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -44,20 +44,9 @@ int main(int argc, char** args)
44 std::cout << "Generating..." << std::endl; 44 std::cout << "Generating..." << std::endl;
45 for (;;) 45 for (;;)
46 { 46 {
47 std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); 47 std::string doc = stats->randomSentence(rand() % 45 + 5);
48 std::string hi; 48 std::string hi = vars->parse(doc);
49 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
50 {
51 hi += vars->parse(*it) + " ";
52 }
53
54 hi.resize(140); 49 hi.resize(140);
55
56 size_t lastperiod = hi.find_last_of(".!?,");
57 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
58 {
59 hi = hi.substr(0, lastperiod+1);
60 }
61 50
62 std::string replyMsg; 51 std::string replyMsg;
63 if (twitter.statusUpdate(hi)) 52 if (twitter.statusUpdate(hi))
diff --git a/freevars.cpp b/freevars.cpp index 8c3eda4..54c5aab 100644 --- a/freevars.cpp +++ b/freevars.cpp
@@ -34,8 +34,8 @@ std::string freevars::parse(std::string in)
34 for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) 34 for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++)
35 { 35 {
36 std::string tofind = "$" + it->first + "$"; 36 std::string tofind = "$" + it->first + "$";
37 size_t fpos = res.find(tofind); 37 size_t fpos;
38 if (fpos != std::string::npos) 38 while ((fpos = res.find(tofind)) != std::string::npos)
39 { 39 {
40 int r = rand() % it->second->size(); 40 int r = rand() % it->second->size();
41 res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); 41 res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos);
diff --git a/gen.cpp b/gen.cpp index 400c0a5..a0ef8e3 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -52,21 +52,10 @@ int main(int argc, char** args)
52 std::cout << "Generating..." << std::endl; 52 std::cout << "Generating..." << std::endl;
53 for (;;) 53 for (;;)
54 { 54 {
55 std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); 55 std::string doc = stats->randomSentence(rand() % 35 + 15);
56 std::string hi; 56 std::string hi = vars->parse(doc);
57 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
58 {
59 hi += vars->parse(*it) + " ";
60 }
61
62 hi.resize(140); 57 hi.resize(140);
63 58
64 size_t lastperiod = hi.find_last_of(".!?,");
65 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
66 {
67 hi = hi.substr(0, lastperiod+1);
68 }
69
70 std::cout << hi << std::endl; 59 std::cout << hi << std::endl;
71 60
72 getc(stdin); 61 getc(stdin);
diff --git a/histogram.cpp b/histogram.cpp new file mode 100644 index 0000000..6896146 --- /dev/null +++ b/histogram.cpp
@@ -0,0 +1,34 @@
1#include "histogram.h"
2#include <cstdlib>
3
4template <class T>
5void histogram<T>::add(const T& inst)
6{
7 freqtable[inst]++;
8}
9
10template <class T>
11void histogram<T>::compile()
12{
13 distribution.clear();
14
15 int max = 0;
16 for (auto& it : freqtable)
17 {
18 max += it.second;
19 distribution.emplace(max, it.first);
20 }
21
22 freqtable.clear();
23}
24
25template <class T>
26const T& histogram<T>::next() const
27{
28 int max = distribution.rbegin()->first;
29 int r = rand() % max;
30
31 return distribution.upper_bound(r)->second;
32}
33
34template class histogram <std::string>;
diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000..5aa2560 --- /dev/null +++ b/histogram.h
@@ -0,0 +1,19 @@
1#ifndef HISTOGRAM_H_24094D97
2#define HISTOGRAM_H_24094D97
3
4#include <map>
5#include <string>
6
7template <class T>
8class histogram {
9 public:
10 void add(const T& inst);
11 void compile();
12 const T& next() const;
13
14 private:
15 std::map<T, int> freqtable;
16 std::map<int, T> distribution;
17};
18
19#endif /* end of include guard: HISTOGRAM_H_24094D97 */
diff --git a/kgramstats.cpp b/kgramstats.cpp index 4bb7f15..0ab0c99 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -37,35 +37,11 @@
37#include <iostream> 37#include <iostream>
38#include <cstdlib> 38#include <cstdlib>
39#include <algorithm> 39#include <algorithm>
40#include "malaprop.h" 40#include <set>
41#include <stack>
41 42
42query wildcardQuery(querytype_sentence); 43query wildcardQuery {querytype::sentence};
43 44word blank_word {""};
44std::string canonize(std::string f);
45
46token token_from_string(std::string in)
47{
48 if (in[0] == '#')
49 {
50 token word(tokentype_hashtag);
51
52 if (in.find_first_of(".?!,") != std::string::npos)
53 {
54 word.terminating = true;
55 }
56
57 return word;
58 } else {
59 token word(canonize(in));
60
61 if (in.find_first_of(".?!,") != std::string::npos)
62 {
63 word.terminating = true;
64 }
65
66 return word;
67 }
68}
69 45
70// runs in O(t^2) time where t is the number of tokens in the input corpus 46// runs in O(t^2) time where t is the number of tokens in the input corpus
71// We consider maxK to be fairly constant 47// We consider maxK to be fairly constant
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
73{ 49{
74 this->maxK = maxK; 50 this->maxK = maxK;
75 51
76 std::vector<std::string> tokens; 52 std::vector<token> tokens;
77 size_t start = 0; 53 size_t start = 0;
78 int end = 0; 54 int end = 0;
79 std::set<std::string> thashtags; 55 std::set<std::string> thashtags;
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
82 { 58 {
83 end = corpus.find(" ", start); 59 end = corpus.find(" ", start);
84 60
85 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 61 std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
86 if (token[token.length()-1] == '\n') 62 if (t.compare("") && t.compare("."))
87 { 63 {
88 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) 64 std::string tc(t), canonical;
65 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
66 std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
67 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
68 });
69
70 word& w = ([&] () -> word& {
71 // Hashtag freevar
72 if (canonical[0] == '#')
73 {
74 thashtags.insert(canonical);
75 canonical = "#hashtag";
76
77 return hashtags;
78 }
79
80 // Basically any other word
81 if (words.count(canonical) == 0)
82 {
83 words.emplace(canonical, canonical);
84 }
85
86 word& tw = words.at(canonical);
87 tw.forms.add(canonical);
88
89 return tw;
90 })();
91
92 token tk(w);
93 tk.raw = t;
94
95 for (char c : t)
89 { 96 {
90 token.insert(token.length()-1, "."); 97 if (c == '*')
98 {
99 tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
100 } else if (c == '[')
101 {
102 tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
103 } else if (c == '(')
104 {
105 tk.delimiters[{parentype::paren, doublestatus::opening}]++;
106 } else if (c == '"')
107 {
108 tk.delimiters[{parentype::quote, doublestatus::opening}]++;
109 } else {
110 break;
111 }
91 } 112 }
92
93 token.resize(token.length()-1);
94 }
95
96 if (token.compare("") && token.compare("."))
97 {
98 mstats.addWord(token);
99 tokens.push_back(token);
100 113
101 if (token[0] == '#') 114 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
115 if (backtrack != t.length())
102 { 116 {
103 thashtags.insert(canonize(token)); 117 std::string ending = t.substr(backtrack);
118 std::string suffix;
119
120 for (char c : ending)
121 {
122 if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
123 {
124 suffix += c;
125
126 continue;
127 } else if (c == '\n')
128 {
129 // At least the end is coming
130 if (suffix.empty())
131 {
132 suffix = ".";
133 }
134
135 break;
136 }
137
138 parentype pt = ([&] {
139 switch (c)
140 {
141 case ']': return parentype::square_bracket;
142 case ')': return parentype::paren;
143 case '*': return parentype::asterisk;
144 case '"': return parentype::quote;
145 }
146 })();
147
148 if (tk.delimiters[{pt, doublestatus::opening}] > 0)
149 {
150 tk.delimiters[{pt, doublestatus::opening}]--;
151 tk.delimiters[{pt, doublestatus::both}]++;
152 } else {
153 tk.delimiters[{pt, doublestatus::closing}]++;
154 }
155 }
156
157 if (suffix == ",")
158 {
159 tk.suffix = suffixtype::comma;
160 } else if (!suffix.empty()) {
161 tk.suffix = suffixtype::terminating;
162
163 w.terms.add(suffix);
164 }
104 } 165 }
166
167 tokens.push_back(tk);
105 } 168 }
106 169
107 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 170 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
108 } 171 }
109 172
110 for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) 173 // Time to condense the distribution stuff for the words
174 for (auto& it : words)
111 { 175 {
112 hashtags.push_back(*it); 176 it.second.forms.compile();
177 it.second.terms.compile();
113 } 178 }
114 179
180 // Hashtag freevar is not frequency distributed
181 for (auto& it : thashtags)
182 {
183 hashtags.forms.add(it);
184 }
185
186 hashtags.forms.compile();
187 hashtags.terms.compile();
188
189 // kgram distribution
115 std::map<kgram, std::map<token, token_data> > tstats; 190 std::map<kgram, std::map<token, token_data> > tstats;
116 std::map<token, std::map<termstats, int> > tendings;
117 for (int k=1; k<maxK; k++) 191 for (int k=1; k<maxK; k++)
118 { 192 {
119 for (int i=0; i<(tokens.size() - k); i++) 193 for (int i=0; i<(tokens.size() - k); i++)
120 { 194 {
121 std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); 195 kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
122 kgram prefix; 196 token f = tokens[i+k];
123 197
124 for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) 198 if (tstats[prefix].count(f) == 0)
125 {
126 prefix.push_back(token_from_string(*it));
127 }
128
129 std::string f = tokens[i+k];
130 std::string canonical = canonize(f);
131
132 token word(token_from_string(canonical));
133 if (f.find_first_of(".?!,") != std::string::npos)
134 { 199 {
135 word.terminating = true; 200 tstats[prefix].emplace(f, f);
136
137 char terminator = f[f.find_last_of(".?!,")];
138 int occurrences = std::count(f.begin(), f.end(), terminator);
139
140 tendings[word][termstats(terminator, occurrences)]++;
141 } 201 }
142 202
143 token_data& td = tstats[prefix][word]; 203 token_data& td = tstats[prefix].at(f);
144 td.word = word;
145 td.all++; 204 td.all++;
146 205
147 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 206 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
148 { 207 {
149 td.uppercase++; 208 td.uppercase++;
150 } else if (isupper(f[0])) 209 } else if (isupper(f.raw[0]))
151 { 210 {
152 td.titlecase++; 211 td.titlecase++;
153 } 212 }
154 213
155 if (prefix.front().word.terminating) 214 kgram term_prefix;
215 bool changed = false;
216 std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) {
217 if (q.tok.suffix == suffixtype::terminating)
218 {
219 changed = true;
220
221 return wildcardQuery;
222 } else {
223 return q;
224 }
225 });
226
227 if (changed)
156 { 228 {
157 prefix.front() = wildcardQuery; 229 if (tstats[term_prefix].count(f) == 0)
230 {
231 tstats[term_prefix].emplace(f, f);
232 }
158 233
159 token_data& td2 = tstats[prefix][word]; 234 token_data& td2 = tstats[term_prefix].at(f);
160 td2.word = word;
161 td2.all++; 235 td2.all++;
162 236
163 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 237 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
164 { 238 {
165 td2.uppercase++; 239 td2.uppercase++;
166 } else if (isupper(f[0])) 240 } else if (isupper(f.raw[0]))
167 { 241 {
168 td2.titlecase++; 242 td2.titlecase++;
169 } 243 }
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
171 } 245 }
172 } 246 }
173 247
174 for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) 248 // Condense the kgram distribution
249 for (auto& it : tstats)
175 { 250 {
176 kgram klist = it->first; 251 kgram klist = it.first;
177 std::map<token, token_data>& probtable = it->second; 252 auto& probtable = it.second;
178 std::map<int, token_data>& distribution = stats[klist]; 253 auto& distribution = stats[klist];
179 int max = 0; 254 int max = 0;
180 255
181 for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) 256 for (auto& kt : probtable)
182 { 257 {
183 max += kt->second.all; 258 max += kt.second.all;
184 259
185 distribution[max] = kt->second; 260 distribution.emplace(max, kt.second);
186 }
187 }
188
189 for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
190 {
191 token word = it->first;
192 std::map<termstats, int>& probtable = it->second;
193 std::map<int, termstats>& distribution = endings[word];
194 int max = 0;
195
196 for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
197 {
198 max += kt->second;
199
200 distribution[max] = kt->first;
201 } 261 }
202 } 262 }
203} 263}
204 264
205void printKgram(kgram k) 265void printKgram(kgram k)
206{ 266{
207 for (kgram::iterator it = k.begin(); it != k.end(); it++) 267 for (auto& q : k)
208 { 268 {
209 query& q = *it; 269 if (q.type == querytype::sentence)
210 if (q.type == querytype_sentence)
211 { 270 {
212 std::cout << "#.# "; 271 std::cout << "#.# ";
213 } else if (q.type == querytype_literal) 272 } else if (q.type == querytype::literal)
214 { 273 {
215 if (q.word.type == tokentype_hashtag) 274 if (q.tok.suffix == suffixtype::terminating)
216 { 275 {
217 if (q.word.terminating) 276 std::cout << q.tok.w.canon << ". ";
218 { 277 } else if (q.tok.suffix == suffixtype::comma)
219 std::cout << "#hashtag. ";
220 } else {
221 std::cout << "#hashtag ";
222 }
223 } else if (q.word.type == tokentype_literal)
224 { 278 {
225 if (q.word.terminating) 279 std::cout << q.tok.w.canon << ", ";
226 { 280 } else {
227 std::cout << q.word.canon << ". "; 281 std::cout << q.tok.w.canon << " ";
228 } else {
229 std::cout << q.word.canon << " ";
230 }
231 } 282 }
232 } 283 }
233 } 284 }
234} 285}
235 286
236// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 287// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
237std::vector<std::string> kgramstats::randomSentence(int n) 288std::string kgramstats::randomSentence(int n)
238{ 289{
239 std::vector<std::string> result; 290 std::string result;
240 kgram cur(1, wildcardQuery); 291 kgram cur(1, wildcardQuery);
241 int cuts = 0; 292 int cuts = 0;
293 std::stack<parentype> open_delimiters;
242 294
243 for (int i=0; i<n; i++) 295 for (int i=0; i<n; i++)
244 { 296 {
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
273 cur = kgram(1, wildcardQuery); 325 cur = kgram(1, wildcardQuery);
274 } 326 }
275 327
276 std::map<int, token_data>& distribution = stats[cur]; 328 auto& distribution = stats[cur];
277 int max = distribution.rbegin()->first; 329 int max = distribution.rbegin()->first;
278 int r = rand() % max; 330 int r = rand() % max;
279 token_data& next = distribution.upper_bound(r)->second; 331 token_data& next = distribution.upper_bound(r)->second;
280 std::string nextToken; 332 std::string nextToken = next.tok.w.forms.next();
281 bool mess = false; 333
282 334 // Determine the casing of the next token. We randomly make the token all
283 if (next.word.type == tokentype_literal) 335 // caps based on the markov chain. Otherwise, we check if the previous
336 // token is the end of a sentence (terminating token or a wildcard query).
337 int casing = rand() % next.all;
338 if (casing < next.uppercase)
339 {
340 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
341 } else if ((((cur.rbegin()->type == querytype::sentence)
342 || ((cur.rbegin()->type == querytype::literal)
343 && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
344 && (rand() % 2 > 0))
345 || (casing - next.uppercase < next.titlecase))
284 { 346 {
285 nextToken = next.word.canon; 347 nextToken[0] = toupper(nextToken[0]);
348 }
286 349
287 mess = (rand() % 100) == 0; 350 // Delimiters
288 if (mess) 351 for (auto& dt : next.tok.delimiters)
352 {
353 if (dt.first.status == doublestatus::both)
289 { 354 {
290 nextToken = mstats.alternate(nextToken); 355 switch (dt.first.type)
291 } 356 {
292 357 case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break;
293 // Determine the casing of the next token. We randomly make the token all 358 case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break;
294 // caps based on the markov chain. Otherwise, we check if the previous 359 case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break;
295 // token is the end of a sentence (terminating token or a wildcard query). 360 case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break;
296 int casing = rand() % next.all; 361 }
297 if (casing < next.uppercase) 362 } else if (dt.first.status == doublestatus::opening)
298 { 363 {
299 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 364 for (int i=0; i<dt.second; i++)
300 } else if ((((cur.rbegin()->type == querytype_sentence) 365 {
301 || ((cur.rbegin()->type == querytype_literal) 366 open_delimiters.push(dt.first.type);
302 && (cur.rbegin()->word.terminating))) 367 }
303 && (rand() % 2 > 0)) 368
304 || (casing - next.uppercase < next.titlecase)) 369 switch (dt.first.type)
370 {
371 case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break;
372 case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break;
373 case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break;
374 case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break;
375 }
376 } else if (dt.first.status == doublestatus::closing)
305 { 377 {
306 nextToken[0] = toupper(nextToken[0]); 378 for (int i=0; i<dt.second; i++)
379 {
380 while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type))
381 {
382 switch (open_delimiters.top())
383 {
384 case parentype::paren: nextToken.append(")"); break;
385 case parentype::square_bracket: nextToken.append("]"); break;
386 case parentype::asterisk: nextToken.append("*"); break;
387 case parentype::quote: nextToken.append("\""); break;
388 }
389
390 open_delimiters.pop();
391 }
392
393 if (open_delimiters.empty())
394 {
395 switch (dt.first.type)
396 {
397 case parentype::paren: result = "(" + result; break;
398 case parentype::square_bracket: result = "[" + result; break;
399 case parentype::asterisk: result = "*" + result; break;
400 case parentype::quote: result = "\"" + result; break;
401 }
402 }
403
404 switch (dt.first.type)
405 {
406 case parentype::paren: nextToken.append(")"); break;
407 case parentype::square_bracket: nextToken.append("]"); break;
408 case parentype::asterisk: nextToken.append("*"); break;
409 case parentype::quote: nextToken.append("\""); break;
410 }
411 }
307 } 412 }
308 } else if (next.word.type == tokentype_hashtag)
309 {
310 int rhash = rand() % hashtags.size();
311 nextToken = hashtags[rhash];
312 } 413 }
313 414
314 if (next.word.terminating) 415 // Terminators
416 if (next.tok.suffix == suffixtype::terminating)
315 { 417 {
316 std::map<int, termstats>& ending = endings[next.word]; 418 nextToken.append(next.tok.w.terms.next());
317 int emax = ending.rbegin()->first; 419 } else if (next.tok.suffix == suffixtype::comma)
318 int er = rand() % emax; 420 {
319 termstats& nextend = ending.upper_bound(er)->second; 421 nextToken.append(",");
320
321 nextToken.append(std::string(nextend.occurrences, nextend.terminator));
322 } 422 }
323 423
324 /* DEBUG */ 424 /* DEBUG */
325 printKgram(cur); 425 printKgram(cur);
426 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
427
428 cur.push_back(next.tok);
326 429
327 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; 430 result.append(nextToken + " ");
328 431
329 if (mess) 432 if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0))
330 { 433 {
331 std::cout << " mala " << next.word.canon; 434 break;
332 } 435 }
333
334 std::cout << std::endl;
335
336 cur.push_back(next.word);
337
338 result.push_back(nextToken);
339 } 436 }
340
341 return result;
342}
343
344bool removeIf(char c)
345{
346 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
347}
348
349std::string canonize(std::string f)
350{
351 std::string canonical(f);
352 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
353 437
354 std::string result; 438 // Remove the trailing space
355 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 439 if (result.back() == ' ')
440 {
441 result.pop_back();
442 }
443
444 // Close any open delimiters
445 while (!open_delimiters.empty())
446 {
447 switch (open_delimiters.top())
448 {
449 case parentype::paren: result.append(")"); break;
450 case parentype::square_bracket: result.append("]"); break;
451 case parentype::asterisk: result.append("*"); break;
452 case parentype::quote: result.append("\""); break;
453 }
454
455 open_delimiters.pop();
456 }
356 457
357 return result; 458 return result;
358} 459}
diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -2,61 +2,89 @@
2#include <map> 2#include <map>
3#include <list> 3#include <list>
4#include <vector> 4#include <vector>
5#include "malaprop.h" 5#include "histogram.h"
6 6
7#ifndef KGRAMSTATS_H 7#ifndef KGRAMSTATS_H
8#define KGRAMSTATS_H 8#define KGRAMSTATS_H
9 9
10enum tokentype { 10struct word {
11 tokentype_literal, 11 std::string canon;
12 tokentype_hashtag 12 histogram<std::string> forms;
13 histogram<std::string> terms;
14
15 word(std::string canon) : canon(canon) {}
16
17 bool operator<(const word& other) const
18 {
19 return canon < other.canon;
20 }
13}; 21};
14 22
15struct token { 23extern word blank_word;
16 tokentype type; 24
17 std::string canon; 25enum class suffixtype {
18 bool terminating; 26 none,
27 terminating,
28 comma
29};
30
31enum class parentype {
32 paren,
33 square_bracket,
34 asterisk,
35 quote
36};
37
38enum class doublestatus {
39 opening,
40 closing,
41 both
42};
43
44struct delimiter {
45 parentype type;
46 doublestatus status;
47
48 delimiter(parentype type, doublestatus status) : type(type), status(status) {}
19 49
20 token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} 50 bool operator<(const delimiter& other) const
21 token(tokentype type) : type(type), canon(""), terminating(false) {} 51 {
52 return std::tie(type, status) < std::tie(other.type, other.status);
53 }
54};
55
56struct token {
57 const word& w;
58 std::map<delimiter, int> delimiters;
59 suffixtype suffix;
60 std::string raw;
61
62 token(const word& w) : w(w), suffix(suffixtype::none) {}
22 63
23 bool operator<(const token& other) const 64 bool operator<(const token& other) const
24 { 65 {
25 if (type != other.type) 66 return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
26 {
27 return type < other.type;
28 } else if (type == tokentype_literal)
29 {
30 if (canon == other.canon)
31 {
32 return !terminating && other.terminating;
33 } else {
34 return canon < other.canon;
35 }
36 } else {
37 return !terminating && other.terminating;
38 }
39 } 67 }
40}; 68};
41 69
42enum querytype { 70enum class querytype {
43 querytype_literal, 71 literal,
44 querytype_sentence 72 sentence
45}; 73};
46 74
47struct query { 75struct query {
48 querytype type; 76 querytype type;
49 token word; 77 token tok;
50 78
51 query(token word) : word(word), type(querytype_literal) {} 79 query(token tok) : tok(tok), type(querytype::literal) {}
52 80
53 query(querytype type) : word(""), type(type) {} 81 query(querytype type) : tok(blank_word), type(type) {}
54 82
55 bool operator<(const query& other) const 83 bool operator<(const query& other) const
56 { 84 {
57 if (type == other.type) 85 if (type == other.type)
58 { 86 {
59 return word < other.word; 87 return tok < other.tok;
60 } else { 88 } else {
61 return type < other.type; 89 return type < other.type;
62 } 90 }
@@ -65,34 +93,11 @@ struct query {
65 93
66typedef std::list<query> kgram; 94typedef std::list<query> kgram;
67 95
68struct termstats {
69 char terminator;
70 int occurrences;
71
72 termstats() : terminator('.'), occurrences(1) {}
73
74 termstats(char terminator, int occurrences)
75 {
76 this->terminator = terminator;
77 this->occurrences = occurrences;
78 }
79
80 bool operator<(const termstats& other) const
81 {
82 if (terminator == other.terminator)
83 {
84 return occurrences < other.occurrences;
85 } else {
86 return terminator < other.terminator;
87 }
88 }
89};
90
91class kgramstats 96class kgramstats
92{ 97{
93public: 98public:
94 kgramstats(std::string corpus, int maxK); 99 kgramstats(std::string corpus, int maxK);
95 std::vector<std::string> randomSentence(int n); 100 std::string randomSentence(int n);
96 101
97private: 102private:
98 struct token_data 103 struct token_data
@@ -100,16 +105,15 @@ private:
100 int all; 105 int all;
101 int titlecase; 106 int titlecase;
102 int uppercase; 107 int uppercase;
103 token word; 108 token tok;
104 109
105 token_data() : word(""), all(0), titlecase(0), uppercase(0) {} 110 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
106 }; 111 };
107 112
108 int maxK; 113 int maxK;
109 std::map<kgram, std::map<int, token_data> > stats; 114 std::map<kgram, std::map<int, token_data> > stats;
110 malaprop mstats; 115 word hashtags {"#hashtag"};
111 std::map<token, std::map<int, termstats> > endings; 116 std::map<std::string, word> words;
112 std::vector<std::string> hashtags;
113}; 117};
114 118
115void printKgram(kgram k); 119void printKgram(kgram k);