about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-20 23:14:06 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-20 23:15:10 -0400
commit 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch)
tree 0d9a8a12616d6ea335fdc687049b05f679e8ccc6
parent a9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff)
download rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz
rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2
rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip
Pulled the ebooks functionality out into a library
-rw-r--r-- CMakeLists.txt 13
-rw-r--r-- ebooks.cpp 38
-rw-r--r-- freevars.cpp 32
-rw-r--r-- freevars.h 22
-rw-r--r-- gen.cpp 40
-rw-r--r-- kgramstats.cpp 465
-rw-r--r-- kgramstats.h 201
-rw-r--r-- rawr.h 6
8 files changed, 443 insertions, 374 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index ab1979f..a3f51af 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -12,12 +12,17 @@ include_directories(vendor/yaml-cpp/include)
12find_package(ASPELL REQUIRED) 12find_package(ASPELL REQUIRED)
13include_directories(${ASPELL_INCLUDE_DIR}) 13include_directories(${ASPELL_INCLUDE_DIR})
14 14
15add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) 15add_library(rawr kgramstats.cpp histogram.cpp prefix_search.cpp)
16set_property(TARGET rawr PROPERTY CXX_STANDARD 11)
17set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON)
18target_link_libraries(rawr ${ASPELL_LIBRARIES})
19
20add_executable(rawr-ebooks ebooks.cpp)
16set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) 21set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11)
17set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) 22set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON)
18target_link_libraries(rawr-ebooks yaml-cpp twitter++ curlcpp curl ${ASPELL_LIBRARIES} pthread) 23target_link_libraries(rawr-ebooks rawr yaml-cpp twitter++ curlcpp curl pthread)
19 24
20add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) 25add_executable(rawr-gen gen.cpp)
21set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) 26set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11)
22set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) 27set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON)
23target_link_libraries(rawr-gen ${ASPELL_LIBRARIES}) 28target_link_libraries(rawr-gen rawr)
diff --git a/ebooks.cpp b/ebooks.cpp index aa690c2..c01cdc9 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -39,9 +39,41 @@ int main(int argc, char** args)
39 39
40 corpus += line + "\n "; 40 corpus += line + "\n ";
41 } 41 }
42
43 // Replace old-style freevars while I can't be bothered to remake the corpus yet
44 std::vector<std::string> fv_names;
45 std::ifstream namefile("names.txt");
46 if (namefile.is_open())
47 {
48 while (!namefile.eof())
49 {
50 std::string l;
51 getline(namefile, l);
52 if (l.back() == '\r')
53 {
54 l.pop_back();
55 }
56
57 fv_names.push_back(l);
58 }
59 }
60
61 namefile.close();
42 62
43 std::cout << "Preprocessing corpus..." << std::endl; 63 std::cout << "Preprocessing corpus..." << std::endl;
44 kgramstats* stats = new kgramstats(corpus, 4); 64 rawr kgramstats;
65 kgramstats.addCorpus(corpus);
66 kgramstats.compile(4);
67 kgramstats.setTransformCallback([&] (std::string canonical, std::string) {
68 size_t pos = canonical.find("$name$");
69 if (pos != std::string::npos)
70 {
71 canonical.replace(pos, 6, fv_names[rand() % fv_names.size()]);
72 }
73
74 return canonical;
75 });
76
45 std::mutex stats_mutex; 77 std::mutex stats_mutex;
46 78
47 client.setUserStreamNotifyCallback([&] (twitter::notification n) { 79 client.setUserStreamNotifyCallback([&] (twitter::notification n) {
@@ -60,7 +92,7 @@ int main(int argc, char** args)
60 std::string doc = "@" + n.getTweet().getAuthor().getScreenName() + " "; 92 std::string doc = "@" + n.getTweet().getAuthor().getScreenName() + " ";
61 { 93 {
62 std::lock_guard<std::mutex> stats_lock(stats_mutex); 94 std::lock_guard<std::mutex> stats_lock(stats_mutex);
63 doc += stats->randomSentence(140 - doc.length()); 95 doc += kgramstats.randomSentence(140 - doc.length());
64 doc.resize(140); 96 doc.resize(140);
65 } 97 }
66 98
@@ -84,7 +116,7 @@ int main(int argc, char** args)
84 std::string doc; 116 std::string doc;
85 { 117 {
86 std::lock_guard<std::mutex> stats_lock(stats_mutex); 118 std::lock_guard<std::mutex> stats_lock(stats_mutex);
87 doc = stats->randomSentence(140); 119 doc = kgramstats.randomSentence(140);
88 } 120 }
89 doc.resize(140); 121 doc.resize(140);
90 122
diff --git a/freevars.cpp b/freevars.cpp deleted file mode 100644 index 4429d00..0000000 --- a/freevars.cpp +++ /dev/null
@@ -1,32 +0,0 @@
1#include "freevars.h"
2#include <fstream>
3#include "kgramstats.h"
4
5freevar::freevar(word& w, std::string file) : w(w)
6{
7 std::ifstream infile(file);
8 if (infile)
9 {
10 std::string line;
11 while (getline(infile, line))
12 {
13 instances.insert(line);
14 w.forms.add(line);
15 }
16 }
17}
18
19bool freevar::check(std::string f) const
20{
21 return (instances.count(f) == 1);
22}
23
24void freevar::add(std::string f)
25{
26 instances.insert(f);
27}
28
29word& freevar::getWord()
30{
31 return w;
32}
diff --git a/freevars.h b/freevars.h deleted file mode 100644 index f800220..0000000 --- a/freevars.h +++ /dev/null
@@ -1,22 +0,0 @@
1#include <string>
2#include <set>
3
4#ifndef FREEVARS_H
5#define FREEVARS_H
6
7class word;
8
9class freevar
10{
11 public:
12 freevar(word& w, std::string file);
13 bool check(std::string f) const;
14 void add(std::string f);
15 word& getWord();
16
17 private:
18 word& w;
19 std::set<std::string> instances;
20};
21
22#endif \ No newline at end of file
diff --git a/gen.cpp b/gen.cpp index 0319283..eba0277 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -44,18 +44,48 @@ int main(int argc, char** args)
44 44
45 corpus += line + "\n "; 45 corpus += line + "\n ";
46 } 46 }
47
48 // Replace old-style freevars while I can't be bothered to remake the corpus yet
49 std::vector<std::string> fv_names;
50 std::ifstream namefile("names.txt");
51 if (namefile.is_open())
52 {
53 while (!namefile.eof())
54 {
55 std::string l;
56 getline(namefile, l);
57 if (l.back() == '\r')
58 {
59 l.pop_back();
60 }
61
62 fv_names.push_back(l);
63 }
64 }
65
66 namefile.close();
47 67
48 std::cout << "Preprocessing corpus..." << std::endl; 68 std::cout << "Preprocessing corpus..." << std::endl;
49 kgramstats* stats = new kgramstats(corpus, 4); 69 rawr kgramstats;
70 kgramstats.addCorpus(corpus);
71 kgramstats.compile(4);
72 kgramstats.setTransformCallback([&] (std::string canonical, std::string) {
73 size_t pos = canonical.find("$name$");
74 if (pos != std::string::npos)
75 {
76 canonical.replace(pos, 6, fv_names[rand() % fv_names.size()]);
77 }
78
79 return canonical;
80 });
50 81
51 std::cout << "Generating..." << std::endl; 82 std::cout << "Generating..." << std::endl;
52 for (;;) 83 for (;;)
53 { 84 {
54 std::string doc = stats->randomSentence(140); 85 std::string doc = kgramstats.randomSentence(140);
55 std::string hi = doc; 86 doc.resize(140);
56 hi.resize(140);
57 87
58 std::cout << hi << std::endl; 88 std::cout << doc << std::endl;
59 89
60 getc(stdin); 90 getc(stdin);
61 } 91 }
diff --git a/kgramstats.cpp b/kgramstats.cpp index a44bf2b..47f3bc0 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -33,32 +33,47 @@
33// 33//
34 34
35#include "kgramstats.h" 35#include "kgramstats.h"
36#include <vector>
37#include <iostream> 36#include <iostream>
38#include <cstdlib>
39#include <cstring> 37#include <cstring>
40#include <algorithm> 38#include <algorithm>
41#include <set> 39#include <set>
42#include <stack> 40#include <stack>
43#include "freevars.h"
44#include <fstream>
45#include "prefix_search.h" 41#include "prefix_search.h"
46#include <aspell.h> 42#include <aspell.h>
43#include <fstream>
44
45const rawr::query rawr::wildcardQuery = {querytype::sentence};
46const rawr::word rawr::blank_word = {""};
47 47
48query wildcardQuery {querytype::sentence}; 48void rawr::addCorpus(std::string corpus)
49word blank_word {""}; 49{
50 _corpora.push_back(corpus);
51}
50 52
51// runs in O(t^2) time where t is the number of tokens in the input corpus 53// runs in O(t^2) time where t is the number of tokens in the input corpus
52// We consider maxK to be fairly constant 54// We consider maxK to be fairly constant
53kgramstats::kgramstats(std::string corpus, int maxK) 55void rawr::compile(int maxK)
54{ 56{
55 this->maxK = maxK; 57 _maxK = maxK;
56 58
57 std::vector<token> tokens; 59 std::vector<token> tokens;
58 size_t start = 0; 60 size_t start = 0;
59 int end = 0;
60 std::set<std::string> thashtags; 61 std::set<std::string> thashtags;
61 freevar fv_emoticons {emoticons, "emoticons.txt"}; 62 std::set<std::string> fv_emoticons;
63
64 std::ifstream fvefile("emoticons.txt");
65 if (fvefile)
66 {
67 std::string line;
68 while (getline(fvefile, line))
69 {
70 fv_emoticons.insert(line);
71 emoticons.forms.add(line);
72 }
73 }
74
75 fvefile.close();
76
62 std::map<std::string, std::string> canonical_form; 77 std::map<std::string, std::string> canonical_form;
63 78
64 AspellConfig* spell_config = new_aspell_config(); 79 AspellConfig* spell_config = new_aspell_config();
@@ -92,216 +107,229 @@ kgramstats::kgramstats(std::string corpus, int maxK)
92 } 107 }
93 108
94 std::cout << "Tokenizing corpus... 0%" << std::flush; 109 std::cout << "Tokenizing corpus... 0%" << std::flush;
95 int len = corpus.length(); 110 int len = 0;
111 for (auto c : _corpora)
112 {
113 len += c.length();
114 }
115
116 int startper = 0;
96 int per = 0; 117 int per = 0;
97 int perprime = 0; 118 int perprime = 0;
98 std::cout.fill(' '); 119 std::cout.fill(' ');
99 while (end != std::string::npos) 120 for (int i = 0; i < _corpora.size(); i++)
100 { 121 {
101 perprime = end * 100 / len; 122 int end = 0;
102 if (perprime != per) 123
124 while (end != std::string::npos)
103 { 125 {
104 per = perprime; 126 perprime = (startper + end) * 100 / len;
127 if (perprime != per)
128 {
129 per = perprime;
105 130
106 std::cout << "\b\b\b\b" << std::right; 131 std::cout << "\b\b\b\b" << std::right;
107 std::cout.width(3); 132 std::cout.width(3);
108 std::cout << per << "%" << std::flush; 133 std::cout << per << "%" << std::flush;
109 } 134 }
110 135
111 end = corpus.find(" ", start); 136 end = _corpora[i].find(" ", start);
112 137
113 bool emoji = false; 138 bool emoji = false;
114 std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 139 std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start);
115 std::string t = ""; 140 std::string t = "";
116 141
117 if (te.compare("") && te.compare(".")) 142 if (te.compare("") && te.compare("."))
118 {
119 // Extract strings of emojis into their own tokens even if they're not space delimited
120 int m = emojis.match(te);
121 emoji = m > 0;
122 if (m == 0) m = 1;
123 t = te.substr(0,m);
124 te = te.substr(m);
125
126 while (!te.empty())
127 { 143 {
128 m = emojis.match(te); 144 // Extract strings of emojis into their own tokens even if they're not space delimited
129 if (emoji == (m > 0)) 145 int m = emojis.match(te);
146 emoji = m > 0;
147 if (m == 0) m = 1;
148 t = te.substr(0,m);
149 te = te.substr(m);
150
151 while (!te.empty())
130 { 152 {
131 if (m == 0) m = 1; 153 m = emojis.match(te);
132 t += te.substr(0,m); 154 if (emoji == (m > 0))
133 te = te.substr(m); 155 {
134 } else { 156 if (m == 0) m = 1;
135 end = start + t.length() - 1; 157 t += te.substr(0,m);
136 break; 158 te = te.substr(m);
159 } else {
160 end = start + t.length() - 1;
161 break;
162 }
137 } 163 }
138 }
139 164
140 std::string tc(t); 165 std::string tc(t);
141 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); 166 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
142 167
143 int pst = tc.find_first_not_of("\"([*"); 168 int pst = tc.find_first_not_of("\"([*");
144 int dst = tc.find_last_not_of("\")]*.,?!\n"); 169 int dst = tc.find_last_not_of("\")]*.,?!\n");
145 std::string canonical(""); 170 std::string canonical("");
146 if ((pst != std::string::npos) && (dst != std::string::npos)) 171 if ((pst != std::string::npos) && (dst != std::string::npos))
147 {
148 canonical = std::string(tc, pst, dst - pst + 1);
149 }
150
151 word& w = ([&] () -> word& {
152 // Hashtag freevar
153 if (canonical[0] == '#')
154 { 172 {
155 thashtags.insert(canonical); 173 canonical = std::string(tc, pst, dst - pst + 1);
156
157 return hashtags;
158 } 174 }
159 175
160 // Emoticon freevar 176 word& w = ([&] () -> word& {
161 if (emoji) 177 // Hashtag freevar
162 { 178 if (canonical[0] == '#')
163 emoticons.forms.add(canonical); 179 {
180 thashtags.insert(canonical);
164 181
165 return emoticons; 182 return hashtags;
166 } 183 }
167 184
168 if ((pst != std::string::npos) && (dst != std::string::npos)) 185 // Emoticon freevar
169 { 186 if (emoji)
170 std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
171 if (fv_emoticons.check(emoticon_canon))
172 { 187 {
173 emoticons.forms.add(emoticon_canon); 188 emoticons.forms.add(canonical);
174 189
175 return emoticons; 190 return emoticons;
176 } 191 }
177 }
178 192
179 // Basically any other word 193 if ((pst != std::string::npos) && (dst != std::string::npos))
180 if (canonical_form.count(canonical) == 0)
181 {
182 if (
183 // Legacy freevars should be distinct from tokens containing similar words
184 (canonical.find("$name$") != std::string::npos)
185 // Words with no letters will be mangled by the spell checker
186 || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
187 )
188 { 194 {
189 canonical_form[canonical] = canonical; 195 std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
190 words.emplace(canonical, canonical); 196 if (fv_emoticons.count(emoticon_canon) == 1)
191 } else { 197 {
192 int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size()); 198 emoticons.forms.add(emoticon_canon);
193 if (correct) 199
200 return emoticons;
201 }
202 }
203
204 // Basically any other word
205 if (canonical_form.count(canonical) == 0)
206 {
207 if (
208 // Legacy freevars should be distinct from tokens containing similar words
209 (canonical.find("$name$") != std::string::npos)
210 // Words with no letters will be mangled by the spell checker
211 || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
212 )
194 { 213 {
195 words.emplace(canonical, canonical);
196 canonical_form[canonical] = canonical; 214 canonical_form[canonical] = canonical;
215 words.emplace(canonical, canonical);
197 } else { 216 } else {
198 const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size()); 217 int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
199 AspellStringEnumeration* elements = aspell_word_list_elements(suggestions); 218 if (correct)
200 const char* replacement = aspell_string_enumeration_next(elements);
201 if (replacement != NULL)
202 { 219 {
203 std::string sugrep(replacement);
204 canonical_form[canonical] = sugrep;
205
206 if (words.count(sugrep) == 0)
207 {
208 words.emplace(sugrep, sugrep);
209 }
210 } else {
211 words.emplace(canonical, canonical); 220 words.emplace(canonical, canonical);
212 canonical_form[canonical] = canonical; 221 canonical_form[canonical] = canonical;
213 } 222 } else {
223 const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
224 AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
225 const char* replacement = aspell_string_enumeration_next(elements);
226 if (replacement != NULL)
227 {
228 std::string sugrep(replacement);
229 canonical_form[canonical] = sugrep;
214 230
215 delete_aspell_string_enumeration(elements); 231 if (words.count(sugrep) == 0)
232 {
233 words.emplace(sugrep, sugrep);
234 }
235 } else {
236 words.emplace(canonical, canonical);
237 canonical_form[canonical] = canonical;
238 }
239
240 delete_aspell_string_enumeration(elements);
241 }
216 } 242 }
217 } 243 }
218 }
219 244
220 word& tw = words.at(canonical_form.at(canonical)); 245 word& tw = words.at(canonical_form.at(canonical));
221 tw.forms.add(canonical); 246 tw.forms.add(canonical);
222 247
223 return tw; 248 return tw;
224 })(); 249 })();
225 250
226 token tk(w); 251 token tk(w);
227 tk.raw = t; 252 tk.raw = t;
228 253
229 for (char c : t) 254 for (char c : t)
230 {
231 if (c == '*')
232 { 255 {
233 tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; 256 if (c == '*')
234 } else if (c == '[') 257 {
235 { 258 tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
236 tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; 259 } else if (c == '[')
237 } else if (c == '(') 260 {
238 { 261 tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
239 tk.delimiters[{parentype::paren, doublestatus::opening}]++; 262 } else if (c == '(')
240 } else if (c == '"') 263 {
241 { 264 tk.delimiters[{parentype::paren, doublestatus::opening}]++;
242 tk.delimiters[{parentype::quote, doublestatus::opening}]++; 265 } else if (c == '"')
243 } else { 266 {
244 break; 267 tk.delimiters[{parentype::quote, doublestatus::opening}]++;
268 } else {
269 break;
270 }
245 } 271 }
246 }
247 272
248 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; 273 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
249 if (backtrack != t.length()) 274 if (backtrack != t.length())
250 {
251 std::string ending = t.substr(backtrack);
252 std::string suffix;
253
254 for (char c : ending)
255 { 275 {
256 if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) 276 std::string ending = t.substr(backtrack);
277 std::string suffix;
278
279 for (char c : ending)
257 { 280 {
258 suffix += c; 281 if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
282 {
283 suffix += c;
259 284
260 continue; 285 continue;
261 } else if (c == '\n') 286 } else if (c == '\n')
262 {
263 // At least the end is coming
264 if (suffix.empty())
265 { 287 {
266 suffix = "."; 288 // At least the end is coming
267 } 289 if (suffix.empty())
290 {
291 suffix = ".";
292 }
268 293
269 break; 294 break;
270 } 295 }
296
297 parentype pt = ([&] {
298 switch (c)
299 {
300 case ']': return parentype::square_bracket;
301 case ')': return parentype::paren;
302 case '*': return parentype::asterisk;
303 case '"': return parentype::quote;
304 }
305 })();
271 306
272 parentype pt = ([&] { 307 if (tk.delimiters[{pt, doublestatus::opening}] > 0)
273 switch (c)
274 { 308 {
275 case ']': return parentype::square_bracket; 309 tk.delimiters[{pt, doublestatus::opening}]--;
276 case ')': return parentype::paren; 310 tk.delimiters[{pt, doublestatus::both}]++;
277 case '*': return parentype::asterisk; 311 } else {
278 case '"': return parentype::quote; 312 tk.delimiters[{pt, doublestatus::closing}]++;
279 } 313 }
280 })();
281
282 if (tk.delimiters[{pt, doublestatus::opening}] > 0)
283 {
284 tk.delimiters[{pt, doublestatus::opening}]--;
285 tk.delimiters[{pt, doublestatus::both}]++;
286 } else {
287 tk.delimiters[{pt, doublestatus::closing}]++;
288 } 314 }
289 }
290 315
291 if (suffix == ",") 316 if (suffix == ",")
292 { 317 {
293 tk.suffix = suffixtype::comma; 318 tk.suffix = suffixtype::comma;
294 } else if (!suffix.empty()) { 319 } else if (!suffix.empty()) {
295 tk.suffix = suffixtype::terminating; 320 tk.suffix = suffixtype::terminating;
296 321
297 w.terms.add(suffix); 322 w.terms.add(suffix);
323 }
298 } 324 }
299 }
300 325
301 tokens.push_back(tk); 326 tokens.push_back(tk);
302 } 327 }
303 328
304 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 329 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
330 }
331
332 startper += _corpora[i].length();
305 } 333 }
306 334
307 std::cout << "\b\b\b\b100%" << std::endl; 335 std::cout << "\b\b\b\b100%" << std::endl;
@@ -420,7 +448,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
420 448
421 kgram klist = it.first; 449 kgram klist = it.first;
422 auto& probtable = it.second; 450 auto& probtable = it.second;
423 auto& distribution = stats[klist]; 451 auto& distribution = _stats[klist];
424 int max = 0; 452 int max = 0;
425 453
426 for (auto& kt : probtable) 454 for (auto& kt : probtable)
@@ -432,33 +460,61 @@ kgramstats::kgramstats(std::string corpus, int maxK)
432 } 460 }
433 461
434 std::cout << "\b\b\b\b100%" << std::endl; 462 std::cout << "\b\b\b\b100%" << std::endl;
463
464 _compiled = true;
435} 465}
436 466
437void printKgram(kgram k) 467std::ostream& operator<<(std::ostream& os, rawr::kgram k)
438{ 468{
439 for (auto& q : k) 469 for (auto& q : k)
440 { 470 {
441 if (q.type == querytype::sentence) 471 os << q << " ";
442 {
443 std::cout << "#.# ";
444 } else if (q.type == querytype::literal)
445 {
446 if (q.tok.suffix == suffixtype::terminating)
447 {
448 std::cout << q.tok.w.canon << ". ";
449 } else if (q.tok.suffix == suffixtype::comma)
450 {
451 std::cout << q.tok.w.canon << ", ";
452 } else {
453 std::cout << q.tok.w.canon << " ";
454 }
455 }
456 } 472 }
473
474 return os;
475}
476
477std::ostream& operator<<(std::ostream& os, rawr::query q)
478{
479 if (q.type == rawr::querytype::sentence)
480 {
481 return os << "#.#";
482 } else if (q.type == rawr::querytype::literal)
483 {
484 return os << q.tok;
485 }
486
487 return os;
488}
489
490std::ostream& operator<<(std::ostream& os, rawr::token t)
491{
492 os << t.w.canon;
493
494 if (t.suffix == rawr::suffixtype::terminating)
495 {
496 return os << ".";
497 } else if (t.suffix == rawr::suffixtype::comma)
498 {
499 return os << ",";
500 } else {
501 return os;
502 }
503}
504
505void rawr::setTransformCallback(transform_callback _arg)
506{
507 _transform = _arg;
457} 508}
458 509
459// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 510// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
460std::string kgramstats::randomSentence(int maxL) 511std::string rawr::randomSentence(int maxL)
461{ 512{
513 if (!_compiled)
514 {
515 return "";
516 }
517
462 std::string result; 518 std::string result;
463 kgram cur(1, wildcardQuery); 519 kgram cur(1, wildcardQuery);
464 int cuts = 0; 520 int cuts = 0;
@@ -466,14 +522,14 @@ std::string kgramstats::randomSentence(int maxL)
466 522
467 for (;;) 523 for (;;)
468 { 524 {
469 if (cur.size() == maxK) 525 if (cur.size() == _maxK)
470 { 526 {
471 cur.pop_front(); 527 cur.pop_front();
472 } 528 }
473 529
474 if (cur.size() > 0) 530 if (cur.size() > 0)
475 { 531 {
476 if (rand() % (maxK - cur.size() + 1) == 0) 532 if (rand() % (_maxK - cur.size() + 1) == 0)
477 { 533 {
478 while ((cur.size() > 2) && (cuts > 0)) 534 while ((cur.size() > 2) && (cuts > 0))
479 { 535 {
@@ -490,16 +546,22 @@ std::string kgramstats::randomSentence(int maxL)
490 546
491 // Gotta circumvent the last line of the input corpus 547 // Gotta circumvent the last line of the input corpus
492 // https://twitter.com/starla4444/status/684222271339237376 548 // https://twitter.com/starla4444/status/684222271339237376
493 if (stats.count(cur) == 0) 549 if (_stats.count(cur) == 0)
494 { 550 {
495 cur = kgram(1, wildcardQuery); 551 cur = kgram(1, wildcardQuery);
496 } 552 }
497 553
498 auto& distribution = stats[cur]; 554 auto& distribution = _stats[cur];
499 int max = distribution.rbegin()->first; 555 int max = distribution.rbegin()->first;
500 int r = rand() % max; 556 int r = rand() % max;
501 token_data& next = distribution.upper_bound(r)->second; 557 token_data& next = distribution.upper_bound(r)->second;
502 std::string nextToken = next.tok.w.forms.next(); 558 std::string nextToken = next.tok.w.forms.next();
559
560 // Apply user-specified transforms
561 if (_transform)
562 {
563 nextToken = _transform(next.tok.w.canon, nextToken);
564 }
503 565
504 // Determine the casing of the next token. We randomly make the token all 566 // Determine the casing of the next token. We randomly make the token all
505 // caps based on the markov chain. Otherwise, we check if the previous 567 // caps based on the markov chain. Otherwise, we check if the previous
@@ -600,8 +662,7 @@ std::string kgramstats::randomSentence(int maxL)
600 } 662 }
601 663
602 /* DEBUG */ 664 /* DEBUG */
603 printKgram(cur); 665 std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
604 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
605 666
606 cur.push_back(next.tok); 667 cur.push_back(next.tok);
607 668
@@ -633,29 +694,7 @@ std::string kgramstats::randomSentence(int maxL)
633 open_delimiters.pop(); 694 open_delimiters.pop();
634 } 695 }
635 696
636 // Replace old-style freevars while I can't be bothered to remake the corpus yet 697 result.resize(maxL);
637 std::vector<std::string> fv_names;
638 std::ifstream namefile("names.txt");
639 if (namefile.is_open())
640 {
641 while (!namefile.eof())
642 {
643 std::string l;
644 getline(namefile, l);
645 if (l.back() == '\r')
646 {
647 l.pop_back();
648 }
649
650 fv_names.push_back(l);
651 }
652
653 int cpos;
654 while ((cpos = result.find("$name$")) != std::string::npos)
655 {
656 result.replace(cpos, 6, fv_names[rand() % fv_names.size()]);
657 }
658 }
659 698
660 return result; 699 return result;
661} 700}
diff --git a/kgramstats.h b/kgramstats.h index 5fad37d..ee75ada 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -1,124 +1,135 @@
1#ifndef KGRAMSTATS_H
2#define KGRAMSTATS_H
3
1#include <string> 4#include <string>
2#include <map> 5#include <map>
3#include <list> 6#include <list>
4#include <vector> 7#include <vector>
5#include "histogram.h" 8#include "histogram.h"
9#include <functional>
6 10
7#ifndef KGRAMSTATS_H 11class rawr {
8#define KGRAMSTATS_H 12 public:
9 13 typedef std::function<std::string(std::string, std::string)> transform_callback;
10struct word { 14
11 std::string canon; 15 void addCorpus(std::string corpus);
12 histogram<std::string> forms; 16 void compile(int maxK);
13 histogram<std::string> terms; 17
18 void setTransformCallback(transform_callback _arg);
19 std::string randomSentence(int maxL);
20
21 private:
22 struct word {
23 std::string canon;
24 histogram<std::string> forms;
25 histogram<std::string> terms;
14 26
15 word(std::string canon) : canon(canon) {} 27 word(std::string canon) : canon(canon) {}
16 28
17 bool operator<(const word& other) const 29 bool operator<(const word& other) const
18 { 30 {
19 return canon < other.canon; 31 return canon < other.canon;
20 } 32 }
21}; 33 };
22
23extern word blank_word;
24 34
25enum class suffixtype { 35 enum class suffixtype {
26 none, 36 none,
27 terminating, 37 terminating,
28 comma 38 comma
29}; 39 };
30 40
31enum class parentype { 41 enum class parentype {
32 paren, 42 paren,
33 square_bracket, 43 square_bracket,
34 asterisk, 44 asterisk,
35 quote 45 quote
36}; 46 };
37 47
38enum class doublestatus { 48 enum class doublestatus {
39 opening, 49 opening,
40 closing, 50 closing,
41 both 51 both
42}; 52 };
43 53
44struct delimiter { 54 struct delimiter {
45 parentype type; 55 parentype type;
46 doublestatus status; 56 doublestatus status;
47 57
48 delimiter(parentype type, doublestatus status) : type(type), status(status) {} 58 delimiter(parentype type, doublestatus status) : type(type), status(status) {}
49 59
50 bool operator<(const delimiter& other) const 60 bool operator<(const delimiter& other) const
51 { 61 {
52 return std::tie(type, status) < std::tie(other.type, other.status); 62 return std::tie(type, status) < std::tie(other.type, other.status);
53 } 63 }
54}; 64 };
55 65
56struct token { 66 struct token {
57 const word& w; 67 const word& w;
58 std::map<delimiter, int> delimiters; 68 std::map<delimiter, int> delimiters;
59 suffixtype suffix; 69 suffixtype suffix;
60 std::string raw; 70 std::string raw;
61 71
62 token(const word& w) : w(w), suffix(suffixtype::none) {} 72 token(const word& w) : w(w), suffix(suffixtype::none) {}
63 73
64 bool operator<(const token& other) const 74 bool operator<(const token& other) const
65 { 75 {
66 return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); 76 return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
67 } 77 }
68}; 78 };
69 79
70enum class querytype { 80 enum class querytype {
71 literal, 81 literal,
72 sentence 82 sentence
73}; 83 };
74 84
75struct query { 85 struct query {
76 querytype type; 86 querytype type;
77 token tok; 87 token tok;
78 88
79 query(token tok) : tok(tok), type(querytype::literal) {} 89 query(token tok) : tok(tok), type(querytype::literal) {}
80 90
81 query(querytype type) : tok(blank_word), type(type) {} 91 query(querytype type) : tok(blank_word), type(type) {}
82 92
83 bool operator<(const query& other) const 93 bool operator<(const query& other) const
84 { 94 {
85 if (type == other.type) 95 if (type == other.type)
86 { 96 {
87 return tok < other.tok; 97 return tok < other.tok;
88 } else { 98 } else {
89 return type < other.type; 99 return type < other.type;
90 } 100 }
91 } 101 }
92}; 102 };
93 103
94typedef std::list<query> kgram; 104 static const query wildcardQuery;
105 static const word blank_word;
95 106
96class kgramstats 107 typedef std::list<query> kgram;
97{
98public:
99 kgramstats(std::string corpus, int maxK);
100 std::string randomSentence(int maxL);
101
102private:
103 struct token_data
104 {
105 int all;
106 int titlecase;
107 int uppercase;
108 token tok;
109 108
110 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} 109 struct token_data
111 }; 110 {
111 int all;
112 int titlecase;
113 int uppercase;
114 token tok;
115
116 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
117 };
118
119 friend std::ostream& operator<<(std::ostream& os, kgram k);
120 friend std::ostream& operator<<(std::ostream& os, query q);
121 friend std::ostream& operator<<(std::ostream& os, token t);
112 122
113 int maxK; 123 int _maxK;
114 std::map<kgram, std::map<int, token_data> > stats; 124 bool _compiled = false;
125 std::vector<std::string> _corpora;
126 std::map<kgram, std::map<int, token_data>> _stats;
127 transform_callback _transform;
115 128
116 // Words 129 // Words
117 std::map<std::string, word> words; 130 std::map<std::string, word> words;
118 word hashtags {"#hashtag"}; 131 word hashtags {"#hashtag"};
119 word emoticons {"👌"}; 132 word emoticons {"👌"};
120}; 133};
121 134
122void printKgram(kgram k);
123
124#endif \ No newline at end of file 135#endif \ No newline at end of file
diff --git a/rawr.h b/rawr.h new file mode 100644 index 0000000..2b5daf7 --- /dev/null +++ b/rawr.h
@@ -0,0 +1,6 @@
1#ifndef RAWR_H_E903544C
2#define RAWR_H_E903544C
3
4#include "kgramstats.h"
5
6#endif /* end of include guard: RAWR_H_E903544C */