-rw-r--r--   CMakeLists.txt     8
-rw-r--r--   ebooks.cpp        15
-rw-r--r--   freevars.cpp       4
-rw-r--r--   gen.cpp           15
-rw-r--r--   histogram.cpp     34
-rw-r--r--   histogram.h       19
-rw-r--r--   kgramstats.cpp   453
-rw-r--r--   kgramstats.h     124
8 files changed, 406 insertions, 266 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa63a34..41c4552 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,10 +8,14 @@ find_package(curl)
 if (YamlCpp_FOUND AND CURL_FOUND)
   add_subdirectory(vendor/twitcurl/libtwitcurl)
   include_directories(vendor/twitcurl/libtwitcurl)
-  add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp)
+  add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp)
+  set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11)
+  set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON)
   target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES})
 else (YamlCpp_FOUND AND CURL_FOUND)
   message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen")
 endif (YamlCpp_FOUND AND CURL_FOUND)
 
-add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp)
+add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp)
+set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11)
+set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON)
diff --git a/ebooks.cpp b/ebooks.cpp
index e38ebab..ed1e080 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp
@@ -44,20 +44,9 @@ int main(int argc, char** args)
   std::cout << "Generating..." << std::endl;
   for (;;)
   {
-    std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
-    std::string hi;
-    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-    {
-      hi += vars->parse(*it) + " ";
-    }
-
+    std::string doc = stats->randomSentence(rand() % 45 + 5);
+    std::string hi = vars->parse(doc);
     hi.resize(140);
-
-    size_t lastperiod = hi.find_last_of(".!?,");
-    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-    {
-      hi = hi.substr(0, lastperiod+1);
-    }
 
     std::string replyMsg;
     if (twitter.statusUpdate(hi))
diff --git a/freevars.cpp b/freevars.cpp
index 8c3eda4..54c5aab 100644
--- a/freevars.cpp
+++ b/freevars.cpp
@@ -34,8 +34,8 @@ std::string freevars::parse(std::string in)
   for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++)
   {
     std::string tofind = "$" + it->first + "$";
-    size_t fpos = res.find(tofind);
-    if (fpos != std::string::npos)
+    size_t fpos;
+    while ((fpos = res.find(tofind)) != std::string::npos)
     {
       int r = rand() % it->second->size();
       res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos);
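The change above replaces a single-pass if with a while loop, so every occurrence of a $var$ placeholder is substituted rather than only the first. A minimal standalone sketch of the new behavior (the variable name and values here are hypothetical, not from the repository):

    #include <cstdlib>
    #include <iostream>
    #include <string>

    int main()
    {
      // "$name$" appears twice; the while form rewrites both occurrences,
      // where the old if form would have replaced only the first.
      std::string res = "$name$ waves at $name$";
      std::string tofind = "$name$";
      std::string values[] = {"kirby", "dedede"};

      size_t fpos;
      while ((fpos = res.find(tofind)) != std::string::npos)
      {
        res.replace(fpos, tofind.length(), values[rand() % 2]);
      }

      std::cout << res << std::endl; // e.g. "kirby waves at dedede"
    }

Each substituted value is drawn independently; note that a value that itself contained the search string would make the loop run forever.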
diff --git a/gen.cpp b/gen.cpp
index 400c0a5..a0ef8e3 100644
--- a/gen.cpp
+++ b/gen.cpp
@@ -52,21 +52,10 @@ int main(int argc, char** args)
   std::cout << "Generating..." << std::endl;
   for (;;)
   {
-    std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
-    std::string hi;
-    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-    {
-      hi += vars->parse(*it) + " ";
-    }
-
+    std::string doc = stats->randomSentence(rand() % 35 + 15);
+    std::string hi = vars->parse(doc);
     hi.resize(140);
 
-    size_t lastperiod = hi.find_last_of(".!?,");
-    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-    {
-      hi = hi.substr(0, lastperiod+1);
-    }
-
     std::cout << hi << std::endl;
 
     getc(stdin);
diff --git a/histogram.cpp b/histogram.cpp
new file mode 100644
index 0000000..6896146
--- /dev/null
+++ b/histogram.cpp
@@ -0,0 +1,34 @@
+#include "histogram.h"
+#include <cstdlib>
+
+template <class T>
+void histogram<T>::add(const T& inst)
+{
+  freqtable[inst]++;
+}
+
+template <class T>
+void histogram<T>::compile()
+{
+  distribution.clear();
+
+  int max = 0;
+  for (auto& it : freqtable)
+  {
+    max += it.second;
+    distribution.emplace(max, it.first);
+  }
+
+  freqtable.clear();
+}
+
+template <class T>
+const T& histogram<T>::next() const
+{
+  int max = distribution.rbegin()->first;
+  int r = rand() % max;
+
+  return distribution.upper_bound(r)->second;
+}
+
+template class histogram <std::string>;
diff --git a/histogram.h b/histogram.h
new file mode 100644
index 0000000..5aa2560
--- /dev/null
+++ b/histogram.h
@@ -0,0 +1,19 @@
+#ifndef HISTOGRAM_H_24094D97
+#define HISTOGRAM_H_24094D97
+
+#include <map>
+#include <string>
+
+template <class T>
+class histogram {
+  public:
+    void add(const T& inst);
+    void compile();
+    const T& next() const;
+
+  private:
+    std::map<T, int> freqtable;
+    std::map<int, T> distribution;
+};
+
+#endif /* end of include guard: HISTOGRAM_H_24094D97 */
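The new histogram<T> class is the frequency-distribution building block this commit uses everywhere: add() counts observations, compile() folds the counts into a map keyed by cumulative total (and clears the frequency table), and next() draws a weighted-random element with a single upper_bound lookup. A small usage sketch (it links because histogram.cpp explicitly instantiates histogram<std::string>):

    #include <iostream>
    #include "histogram.h"

    int main()
    {
      histogram<std::string> h;

      // Two observations of "cat", one of "dog".
      h.add("cat");
      h.add("cat");
      h.add("dog");

      // compile() produces the cumulative map {2 -> "cat", 3 -> "dog"}.
      h.compile();

      // next() picks r = rand() % 3 and returns upper_bound(r)->second,
      // so "cat" comes back with probability 2/3 and "dog" with 1/3.
      for (int i = 0; i < 5; i++)
      {
        std::cout << h.next() << std::endl;
      }
    }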
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 4bb7f15..0ab0c99 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -37,35 +37,11 @@
 #include <iostream>
 #include <cstdlib>
 #include <algorithm>
-#include "malaprop.h"
+#include <set>
+#include <stack>
 
-query wildcardQuery(querytype_sentence);
-
-std::string canonize(std::string f);
-
-token token_from_string(std::string in)
-{
-  if (in[0] == '#')
-  {
-    token word(tokentype_hashtag);
-
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-
-    return word;
-  } else {
-    token word(canonize(in));
-
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-
-    return word;
-  }
-}
+query wildcardQuery {querytype::sentence};
+word blank_word {""};
 
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 {
   this->maxK = maxK;
 
-  std::vector<std::string> tokens;
+  std::vector<token> tokens;
   size_t start = 0;
   int end = 0;
   std::set<std::string> thashtags;
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   {
     end = corpus.find(" ", start);
 
-    std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    if (token[token.length()-1] == '\n')
+    std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+    if (t.compare("") && t.compare("."))
     {
-      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
+      std::string tc(t), canonical;
+      std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
+      std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
+        return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
+      });
+
+      word& w = ([&] () -> word& {
+        // Hashtag freevar
+        if (canonical[0] == '#')
+        {
+          thashtags.insert(canonical);
+          canonical = "#hashtag";
+
+          return hashtags;
+        }
+
+        // Basically any other word
+        if (words.count(canonical) == 0)
+        {
+          words.emplace(canonical, canonical);
+        }
+
+        word& tw = words.at(canonical);
+        tw.forms.add(canonical);
+
+        return tw;
+      })();
+
+      token tk(w);
+      tk.raw = t;
+
+      for (char c : t)
       {
-        token.insert(token.length()-1, ".");
+        if (c == '*')
+        {
+          tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
+        } else if (c == '[')
+        {
+          tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
+        } else if (c == '(')
+        {
+          tk.delimiters[{parentype::paren, doublestatus::opening}]++;
+        } else if (c == '"')
+        {
+          tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+        } else {
+          break;
+        }
       }
-
-      token.resize(token.length()-1);
-    }
-
-    if (token.compare("") && token.compare("."))
-    {
-      mstats.addWord(token);
-      tokens.push_back(token);
 
-      if (token[0] == '#')
+      int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
+      if (backtrack != t.length())
       {
-        thashtags.insert(canonize(token));
+        std::string ending = t.substr(backtrack);
+        std::string suffix;
+
+        for (char c : ending)
+        {
+          if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+          {
+            suffix += c;
+
+            continue;
+          } else if (c == '\n')
+          {
+            // At least the end is coming
+            if (suffix.empty())
+            {
+              suffix = ".";
+            }
+
+            break;
+          }
+
+          parentype pt = ([&] {
+            switch (c)
+            {
+              case ']': return parentype::square_bracket;
+              case ')': return parentype::paren;
+              case '*': return parentype::asterisk;
+              case '"': return parentype::quote;
+            }
+          })();
+
+          if (tk.delimiters[{pt, doublestatus::opening}] > 0)
+          {
+            tk.delimiters[{pt, doublestatus::opening}]--;
+            tk.delimiters[{pt, doublestatus::both}]++;
+          } else {
+            tk.delimiters[{pt, doublestatus::closing}]++;
+          }
+        }
+
+        if (suffix == ",")
+        {
+          tk.suffix = suffixtype::comma;
+        } else if (!suffix.empty()) {
+          tk.suffix = suffixtype::terminating;
+
+          w.terms.add(suffix);
+        }
       }
+
+      tokens.push_back(tk);
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
 
-  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  // Time to condense the distribution stuff for the words
+  for (auto& it : words)
   {
-    hashtags.push_back(*it);
+    it.second.forms.compile();
+    it.second.terms.compile();
   }
 
+  // Hashtag freevar is not frequency distributed
+  for (auto& it : thashtags)
+  {
+    hashtags.forms.add(it);
+  }
+
+  hashtags.forms.compile();
+  hashtags.terms.compile();
+
+  // kgram distribution
   std::map<kgram, std::map<token, token_data> > tstats;
-  std::map<token, std::map<termstats, int> > tendings;
   for (int k=1; k<maxK; k++)
   {
     for (int i=0; i<(tokens.size() - k); i++)
     {
-      std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
-      kgram prefix;
+      kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
+      token f = tokens[i+k];
 
-      for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
-      {
-        prefix.push_back(token_from_string(*it));
-      }
-
-      std::string f = tokens[i+k];
-      std::string canonical = canonize(f);
-
-      token word(token_from_string(canonical));
-      if (f.find_first_of(".?!,") != std::string::npos)
+      if (tstats[prefix].count(f) == 0)
       {
-        word.terminating = true;
-
-        char terminator = f[f.find_last_of(".?!,")];
-        int occurrences = std::count(f.begin(), f.end(), terminator);
-
-        tendings[word][termstats(terminator, occurrences)]++;
+        tstats[prefix].emplace(f, f);
       }
 
-      token_data& td = tstats[prefix][word];
-      td.word = word;
+      token_data& td = tstats[prefix].at(f);
       td.all++;
 
-      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
       {
         td.uppercase++;
-      } else if (isupper(f[0]))
+      } else if (isupper(f.raw[0]))
       {
         td.titlecase++;
       }
 
-      if (prefix.front().word.terminating)
+      kgram term_prefix;
+      bool changed = false;
+      std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) {
+        if (q.tok.suffix == suffixtype::terminating)
+        {
+          changed = true;
+
+          return wildcardQuery;
+        } else {
+          return q;
+        }
+      });
+
+      if (changed)
       {
-        prefix.front() = wildcardQuery;
+        if (tstats[term_prefix].count(f) == 0)
+        {
+          tstats[term_prefix].emplace(f, f);
+        }
 
-        token_data& td2 = tstats[prefix][word];
-        td2.word = word;
+        token_data& td2 = tstats[term_prefix].at(f);
         td2.all++;
 
-        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
         {
           td2.uppercase++;
-        } else if (isupper(f[0]))
+        } else if (isupper(f.raw[0]))
         {
           td2.titlecase++;
         }
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
     }
   }
 
-  for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
+  // Condense the kgram distribution
+  for (auto& it : tstats)
   {
-    kgram klist = it->first;
-    std::map<token, token_data>& probtable = it->second;
-    std::map<int, token_data>& distribution = stats[klist];
+    kgram klist = it.first;
+    auto& probtable = it.second;
+    auto& distribution = stats[klist];
     int max = 0;
 
-    for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
+    for (auto& kt : probtable)
     {
-      max += kt->second.all;
+      max += kt.second.all;
 
-      distribution[max] = kt->second;
-    }
-  }
-
-  for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
-  {
-    token word = it->first;
-    std::map<termstats, int>& probtable = it->second;
-    std::map<int, termstats>& distribution = endings[word];
-    int max = 0;
-
-    for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
-    {
-      max += kt->second;
-
-      distribution[max] = kt->first;
+      distribution.emplace(max, kt.second);
     }
   }
 }
 
 void printKgram(kgram k)
 {
-  for (kgram::iterator it = k.begin(); it != k.end(); it++)
+  for (auto& q : k)
   {
-    query& q = *it;
-    if (q.type == querytype_sentence)
+    if (q.type == querytype::sentence)
     {
       std::cout << "#.# ";
-    } else if (q.type == querytype_literal)
+    } else if (q.type == querytype::literal)
     {
-      if (q.word.type == tokentype_hashtag)
+      if (q.tok.suffix == suffixtype::terminating)
       {
-        if (q.word.terminating)
-        {
-          std::cout << "#hashtag. ";
-        } else {
-          std::cout << "#hashtag ";
-        }
-      } else if (q.word.type == tokentype_literal)
+        std::cout << q.tok.w.canon << ". ";
+      } else if (q.tok.suffix == suffixtype::comma)
       {
-        if (q.word.terminating)
-        {
-          std::cout << q.word.canon << ". ";
-        } else {
-          std::cout << q.word.canon << " ";
-        }
+        std::cout << q.tok.w.canon << ", ";
+      } else {
+        std::cout << q.tok.w.canon << " ";
       }
     }
   }
 }
 
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::vector<std::string> kgramstats::randomSentence(int n)
+std::string kgramstats::randomSentence(int n)
 {
-  std::vector<std::string> result;
+  std::string result;
   kgram cur(1, wildcardQuery);
   int cuts = 0;
+  std::stack<parentype> open_delimiters;
 
   for (int i=0; i<n; i++)
   {
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
       cur = kgram(1, wildcardQuery);
     }
 
-    std::map<int, token_data>& distribution = stats[cur];
+    auto& distribution = stats[cur];
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken;
-    bool mess = false;
-
-    if (next.word.type == tokentype_literal)
+    std::string nextToken = next.tok.w.forms.next();
+
+    // Determine the casing of the next token. We randomly make the token all
+    // caps based on the markov chain. Otherwise, we check if the previous
+    // token is the end of a sentence (terminating token or a wildcard query).
+    int casing = rand() % next.all;
+    if (casing < next.uppercase)
+    {
+      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    } else if ((((cur.rbegin()->type == querytype::sentence)
+        || ((cur.rbegin()->type == querytype::literal)
+          && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
+        && (rand() % 2 > 0))
+      || (casing - next.uppercase < next.titlecase))
     {
-      nextToken = next.word.canon;
+      nextToken[0] = toupper(nextToken[0]);
+    }
 
-      mess = (rand() % 100) == 0;
-      if (mess)
+    // Delimiters
+    for (auto& dt : next.tok.delimiters)
+    {
+      if (dt.first.status == doublestatus::both)
       {
-        nextToken = mstats.alternate(nextToken);
-      }
-
-      // Determine the casing of the next token. We randomly make the token all
-      // caps based on the markov chain. Otherwise, we check if the previous
-      // token is the end of a sentence (terminating token or a wildcard query).
-      int casing = rand() % next.all;
-      if (casing < next.uppercase)
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break;
+          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break;
+          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break;
+          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break;
+        }
+      } else if (dt.first.status == doublestatus::opening)
       {
-        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-      } else if ((((cur.rbegin()->type == querytype_sentence)
-          || ((cur.rbegin()->type == querytype_literal)
-            && (cur.rbegin()->word.terminating)))
-          && (rand() % 2 > 0))
-        || (casing - next.uppercase < next.titlecase))
+        for (int i=0; i<dt.second; i++)
+        {
+          open_delimiters.push(dt.first.type);
+        }
+
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break;
+          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break;
+          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break;
+          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break;
+        }
+      } else if (dt.first.status == doublestatus::closing)
       {
-        nextToken[0] = toupper(nextToken[0]);
+        for (int i=0; i<dt.second; i++)
+        {
+          while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type))
+          {
+            switch (open_delimiters.top())
+            {
+              case parentype::paren: nextToken.append(")"); break;
+              case parentype::square_bracket: nextToken.append("]"); break;
+              case parentype::asterisk: nextToken.append("*"); break;
+              case parentype::quote: nextToken.append("\""); break;
+            }
+
+            open_delimiters.pop();
+          }
+
+          if (open_delimiters.empty())
+          {
+            switch (dt.first.type)
+            {
+              case parentype::paren: result = "(" + result; break;
+              case parentype::square_bracket: result = "[" + result; break;
+              case parentype::asterisk: result = "*" + result; break;
+              case parentype::quote: result = "\"" + result; break;
+            }
+          }
+
+          switch (dt.first.type)
+          {
+            case parentype::paren: nextToken.append(")"); break;
+            case parentype::square_bracket: nextToken.append("]"); break;
+            case parentype::asterisk: nextToken.append("*"); break;
+            case parentype::quote: nextToken.append("\""); break;
+          }
+        }
       }
-    } else if (next.word.type == tokentype_hashtag)
-    {
-      int rhash = rand() % hashtags.size();
-      nextToken = hashtags[rhash];
     }
 
-    if (next.word.terminating)
+    // Terminators
+    if (next.tok.suffix == suffixtype::terminating)
     {
-      std::map<int, termstats>& ending = endings[next.word];
-      int emax = ending.rbegin()->first;
-      int er = rand() % emax;
-      termstats& nextend = ending.upper_bound(er)->second;
-
-      nextToken.append(std::string(nextend.occurrences, nextend.terminator));
+      nextToken.append(next.tok.w.terms.next());
+    } else if (next.tok.suffix == suffixtype::comma)
+    {
+      nextToken.append(",");
    }
 
     /* DEBUG */
     printKgram(cur);
+    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
+
+    cur.push_back(next.tok);
 
-    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
+    result.append(nextToken + " ");
 
-    if (mess)
+    if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0))
     {
-      std::cout << " mala " << next.word.canon;
+      break;
     }
-
-    std::cout << std::endl;
-
-    cur.push_back(next.word);
-
-    result.push_back(nextToken);
   }
-
-  return result;
-}
-
-bool removeIf(char c)
-{
-  return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
-}
-
-std::string canonize(std::string f)
-{
-  std::string canonical(f);
-  std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
 
-  std::string result;
-  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
+  // Remove the trailing space
+  if (result.back() == ' ')
+  {
+    result.pop_back();
+  }
+
+  // Close any open delimiters
+  while (!open_delimiters.empty())
+  {
+    switch (open_delimiters.top())
+    {
+      case parentype::paren: result.append(")"); break;
+      case parentype::square_bracket: result.append("]"); break;
+      case parentype::asterisk: result.append("*"); break;
+      case parentype::quote: result.append("\""); break;
+    }
+
+    open_delimiters.pop();
+  }
 
   return result;
 }
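The closing-delimiter branch above is the subtle part of the new randomSentence: a stack of open delimiters persists across tokens, a closer pops non-matching entries (emitting their closing characters), and a closer that arrives with no recorded opener retroactively prepends the opening character to the whole result. A reduced standalone sketch of that last case (this is an illustration, not the repository's code):

    #include <iostream>
    #include <stack>
    #include <string>

    int main()
    {
      std::string result = "hello world";
      std::stack<char> open_delimiters; // nothing was opened earlier

      // A closing quote arrives; with no matching opener on the stack,
      // the opener is attached to the front of the sentence instead.
      if (open_delimiters.empty())
      {
        result = "\"" + result;
      }

      result.append("\"");

      std::cout << result << std::endl; // "hello world"
    }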
diff --git a/kgramstats.h b/kgramstats.h
index ff2fc66..a97d7bf 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -2,61 +2,89 @@
 #include <map>
 #include <list>
 #include <vector>
-#include "malaprop.h"
+#include "histogram.h"
 
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
 
-enum tokentype {
-  tokentype_literal,
-  tokentype_hashtag
+struct word {
+  std::string canon;
+  histogram<std::string> forms;
+  histogram<std::string> terms;
+
+  word(std::string canon) : canon(canon) {}
+
+  bool operator<(const word& other) const
+  {
+    return canon < other.canon;
+  }
 };
 
-struct token {
-  tokentype type;
-  std::string canon;
-  bool terminating;
+extern word blank_word;
+
+enum class suffixtype {
+  none,
+  terminating,
+  comma
+};
+
+enum class parentype {
+  paren,
+  square_bracket,
+  asterisk,
+  quote
+};
+
+enum class doublestatus {
+  opening,
+  closing,
+  both
+};
+
+struct delimiter {
+  parentype type;
+  doublestatus status;
+
+  delimiter(parentype type, doublestatus status) : type(type), status(status) {}
 
-  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
-  token(tokentype type) : type(type), canon(""), terminating(false) {}
+  bool operator<(const delimiter& other) const
+  {
+    return std::tie(type, status) < std::tie(other.type, other.status);
+  }
+};
+
+struct token {
+  const word& w;
+  std::map<delimiter, int> delimiters;
+  suffixtype suffix;
+  std::string raw;
+
+  token(const word& w) : w(w), suffix(suffixtype::none) {}
 
   bool operator<(const token& other) const
   {
-    if (type != other.type)
-    {
-      return type < other.type;
-    } else if (type == tokentype_literal)
-    {
-      if (canon == other.canon)
-      {
-        return !terminating && other.terminating;
-      } else {
-        return canon < other.canon;
-      }
-    } else {
-      return !terminating && other.terminating;
-    }
+    return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
   }
 };
 
-enum querytype {
-  querytype_literal,
-  querytype_sentence
+enum class querytype {
+  literal,
+  sentence
 };
 
 struct query {
   querytype type;
-  token word;
+  token tok;
 
-  query(token word) : word(word), type(querytype_literal) {}
+  query(token tok) : tok(tok), type(querytype::literal) {}
 
-  query(querytype type) : word(""), type(type) {}
+  query(querytype type) : tok(blank_word), type(type) {}
 
   bool operator<(const query& other) const
   {
     if (type == other.type)
     {
-      return word < other.word;
+      return tok < other.tok;
     } else {
       return type < other.type;
     }
@@ -65,34 +93,11 @@ struct query {
 
 typedef std::list<query> kgram;
 
-struct termstats {
-  char terminator;
-  int occurrences;
-
-  termstats() : terminator('.'), occurrences(1) {}
-
-  termstats(char terminator, int occurrences)
-  {
-    this->terminator = terminator;
-    this->occurrences = occurrences;
-  }
-
-  bool operator<(const termstats& other) const
-  {
-    if (terminator == other.terminator)
-    {
-      return occurrences < other.occurrences;
-    } else {
-      return terminator < other.terminator;
-    }
-  }
-};
-
 class kgramstats
 {
 public:
   kgramstats(std::string corpus, int maxK);
-  std::vector<std::string> randomSentence(int n);
+  std::string randomSentence(int n);
 
 private:
   struct token_data
@@ -100,16 +105,15 @@ private:
     int all;
     int titlecase;
     int uppercase;
-    token word;
+    token tok;
 
-    token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
+    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
   };
 
   int maxK;
   std::map<kgram, std::map<int, token_data> > stats;
-  malaprop mstats;
-  std::map<token, std::map<int, termstats> > endings;
-  std::vector<std::string> hashtags;
+  word hashtags {"#hashtag"};
+  std::map<std::string, word> words;
 };
 
 void printKgram(kgram k);
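Taken together, the new public surface stays small: construct kgramstats from a whitespace-delimited corpus, then call randomSentence(), which now returns the assembled std::string directly (suffixes applied, delimiters balanced, trailing space stripped) instead of a token vector the caller must join. A hypothetical minimal driver, mirroring what gen.cpp does minus the freevars pass (the corpus literal is just a placeholder):

    #include <cstdlib>
    #include <ctime>
    #include <iostream>
    #include "kgramstats.h"

    int main()
    {
      srand(time(NULL));

      // Any whitespace-delimited text works as a corpus.
      std::string corpus = "the cat sat on the mat. the dog sat on the cat.";

      kgramstats stats(corpus, 4);

      // One sentence of at most 20 tokens; generation may stop early at a
      // terminator, since randomSentence breaks on rand() % 4 == 0.
      std::cout << stats.randomSentence(20) << std::endl;
    }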