diff options
-rw-r--r-- | Makefile.am | 3 | ||||
-rw-r--r-- | kgramstats.cpp | 187 | ||||
-rw-r--r-- | kgramstats.h | 5 |
3 files changed, 173 insertions, 22 deletions
diff --git a/Makefile.am b/Makefile.am index 299dc10..150ede2 100644 --- a/Makefile.am +++ b/Makefile.am | |||
@@ -4,5 +4,6 @@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} | |||
4 | bin_PROGRAMS = rawr-ebooks rawr-gen | 4 | bin_PROGRAMS = rawr-ebooks rawr-gen |
5 | rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp | 5 | rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp |
6 | rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp | 6 | rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp |
7 | rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) | 7 | rawr_gen_CPPFLAGS = -std=c++11 |
8 | rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) -std=c++11 | ||
8 | rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file | 9 | rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file |
diff --git a/kgramstats.cpp b/kgramstats.cpp index 2321b11..1f3dd3c 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -28,6 +28,8 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
28 | } | 28 | } |
29 | 29 | ||
30 | map<kgram, map<string, token_data*>* > tstats; | 30 | map<kgram, map<string, token_data*>* > tstats; |
31 | bool newSentence = true; | ||
32 | bool newClause = false; | ||
31 | for (int k=0; k<=maxK; k++) | 33 | for (int k=0; k<=maxK; k++) |
32 | { | 34 | { |
33 | for (int i=0; i<(tokens.size() - k); i++) | 35 | for (int i=0; i<(tokens.size() - k); i++) |
@@ -50,11 +52,83 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
50 | token_data* td = tstats[seq]->at(canonical); | 52 | token_data* td = tstats[seq]->at(canonical); |
51 | td->token = new string(canonical); | 53 | td->token = new string(canonical); |
52 | td->all++; | 54 | td->all++; |
55 | |||
56 | if (newSentence) | ||
57 | { | ||
58 | kgram newKgram(1, "."); | ||
59 | if (tstats[newKgram] == NULL) | ||
60 | { | ||
61 | tstats[newKgram] = new map<string, token_data*>(); | ||
62 | } | ||
63 | |||
64 | (*tstats[newKgram])[canonical] = td; | ||
65 | |||
66 | newSentence = false; | ||
67 | } | ||
68 | |||
69 | if (newClause) | ||
70 | { | ||
71 | kgram commaKgram(1, ","); | ||
72 | if (tstats[commaKgram] == NULL) | ||
73 | { | ||
74 | tstats[commaKgram] = new map<string, token_data*>(); | ||
75 | } | ||
76 | |||
77 | (*tstats[commaKgram])[canonical] = td; | ||
78 | |||
79 | newClause = false; | ||
80 | } | ||
53 | 81 | ||
54 | if ((f.length() > 0) && (f[f.length()-1] == '.')) | 82 | if ((f.length() > 0) && (f[f.length()-1] == '.')) |
55 | { | 83 | { |
56 | td->period++; | 84 | td->period++; |
85 | newSentence = true; | ||
57 | } | 86 | } |
87 | |||
88 | if (f.length() > 0) | ||
89 | { | ||
90 | if (f[0] == '"') | ||
91 | { | ||
92 | td->startquote++; | ||
93 | } | ||
94 | |||
95 | if (f[f.length()-1] == '"') | ||
96 | { | ||
97 | td->endquote++; | ||
98 | |||
99 | if ((f.length() > 1) && (f[f.length()-2] == ',')) | ||
100 | { | ||
101 | td->comma++; | ||
102 | newClause = true; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | if (f[f.length()-1] == ',') | ||
107 | { | ||
108 | td->comma++; | ||
109 | newClause = true; | ||
110 | |||
111 | if ((f.length() > 1) && (f[f.length()-2] == '"')) | ||
112 | { | ||
113 | td->endquote++; | ||
114 | } | ||
115 | |||
116 | if ((f.length() > 1) && (f[f.length()-2] == ')')) | ||
117 | { | ||
118 | td->endparen++; | ||
119 | } | ||
120 | } | ||
121 | |||
122 | if (f[0] == '(') | ||
123 | { | ||
124 | td->startparen++; | ||
125 | } | ||
126 | |||
127 | if (f[f.length()-1] == ')') | ||
128 | { | ||
129 | td->endparen++; | ||
130 | } | ||
131 | } | ||
58 | 132 | ||
59 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 133 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) |
60 | { | 134 | { |
@@ -97,22 +171,31 @@ void printKgram(kgram k) | |||
97 | vector<string> kgramstats::randomSentence(int n) | 171 | vector<string> kgramstats::randomSentence(int n) |
98 | { | 172 | { |
99 | vector<string> result; | 173 | vector<string> result; |
100 | list<string> cur; | 174 | kgram newKgram(1, "."); |
175 | kgram commaKgram(1, ","); | ||
176 | list<string> cur = newKgram; | ||
177 | int cuts = 0; | ||
101 | 178 | ||
102 | for (int i=0; i<n; i++) | 179 | for (int i=0; i<n; i++) |
103 | { | 180 | { |
104 | if ((rand() % (maxK - cur.size() + 1)) == 0) | 181 | if ((cur.size() > 0) && (cur != newKgram)) |
105 | { | 182 | { |
106 | for (int i=0; i<cur.size(); i++) | 183 | if (rand() % (maxK - cur.size() + 1) == 0) |
107 | { | 184 | { |
108 | if ((rand() % 3) == 0) | 185 | while (cur.size() > 0) |
109 | { | 186 | { |
110 | cur.pop_front(); | 187 | if ((rand() % (n)) < cuts) |
111 | } else { | 188 | { |
112 | break; | 189 | cur.pop_front(); |
113 | } | 190 | cuts--; |
114 | } | 191 | } else { |
115 | } | 192 | break; |
193 | } | ||
194 | } | ||
195 | } | ||
196 | |||
197 | cuts++; | ||
198 | } | ||
116 | 199 | ||
117 | map<int, token_data*> distribution = *(*stats)[cur]; | 200 | map<int, token_data*> distribution = *(*stats)[cur]; |
118 | int max = distribution.rbegin()->first; | 201 | int max = distribution.rbegin()->first; |
@@ -122,6 +205,11 @@ vector<string> kgramstats::randomSentence(int n) | |||
122 | string nextToken(*(next->token)); | 205 | string nextToken(*(next->token)); |
123 | int casing = rand() % next->all; | 206 | int casing = rand() % next->all; |
124 | int period = rand() % next->all; | 207 | int period = rand() % next->all; |
208 | int startparen = rand() % next->all; | ||
209 | int endparen = rand() % next->all; | ||
210 | int startquote = rand() % next->all; | ||
211 | int endquote = rand() % next->all; | ||
212 | int comma = rand() % next->all; | ||
125 | if (casing < next->uppercase) | 213 | if (casing < next->uppercase) |
126 | { | 214 | { |
127 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 215 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); |
@@ -129,11 +217,53 @@ vector<string> kgramstats::randomSentence(int n) | |||
129 | { | 217 | { |
130 | nextToken[0] = toupper(nextToken[0]); | 218 | nextToken[0] = toupper(nextToken[0]); |
131 | } | 219 | } |
220 | |||
221 | if ((cur == newKgram) && (rand() % 3 < 2)) | ||
222 | { | ||
223 | nextToken[0] = toupper(nextToken[0]); | ||
224 | } | ||
225 | |||
226 | if (startquote < next->startquote) | ||
227 | { | ||
228 | nextToken = "\"" + nextToken; | ||
229 | } else if (startparen < next->startparen) | ||
230 | { | ||
231 | nextToken = "(" + nextToken; | ||
232 | } | ||
132 | 233 | ||
133 | if (period < next->period) | 234 | if (period < next->period) |
134 | { | 235 | { |
135 | nextToken += "."; | 236 | if (endquote < next->endquote) |
136 | } | 237 | { |
238 | nextToken += "\""; | ||
239 | } else if (endparen < next->endparen) | ||
240 | { | ||
241 | nextToken += ")"; | ||
242 | } | ||
243 | |||
244 | int type = rand() % 6; | ||
245 | |||
246 | if (type < 3) | ||
247 | { | ||
248 | nextToken += "."; | ||
249 | } else if (type < 5) | ||
250 | { | ||
251 | nextToken += "!"; | ||
252 | } else { | ||
253 | nextToken += "?"; | ||
254 | } | ||
255 | } else if (comma < next->comma) | ||
256 | { | ||
257 | if (endquote < next->endquote) | ||
258 | { | ||
259 | nextToken += "\""; | ||
260 | } else if (endparen < next->endparen) | ||
261 | { | ||
262 | nextToken += ")"; | ||
263 | } | ||
264 | |||
265 | nextToken += ","; | ||
266 | } | ||
137 | 267 | ||
138 | if (cur.size() == maxK) | 268 | if (cur.size() == maxK) |
139 | { | 269 | { |
@@ -147,8 +277,22 @@ vector<string> kgramstats::randomSentence(int n) | |||
147 | } | 277 | } |
148 | 278 | ||
149 | cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; | 279 | cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; |
280 | |||
281 | if ((cur == newKgram) || (cur == commaKgram)) | ||
282 | { | ||
283 | cur.pop_front(); | ||
284 | } | ||
285 | |||
286 | if ((period < next->period) && ((rand() % 2) == 0)) | ||
287 | { | ||
288 | cur = newKgram; | ||
289 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) | ||
290 | { | ||
291 | cur = commaKgram; | ||
292 | } else { | ||
293 | cur.push_back(*(next->token)); | ||
294 | } | ||
150 | 295 | ||
151 | cur.push_back(*(next->token)); | ||
152 | result.push_back(nextToken); | 296 | result.push_back(nextToken); |
153 | } | 297 | } |
154 | 298 | ||
@@ -159,10 +303,11 @@ std::string canonize(std::string f) | |||
159 | { | 303 | { |
160 | string canonical(f); | 304 | string canonical(f); |
161 | transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | 305 | transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); |
162 | if (canonical[canonical.length()-1] == '.') | 306 | |
163 | { | 307 | string result; |
164 | canonical.resize(canonical.find('.')); | 308 | remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), [] (char c) { |
165 | } | 309 | return !((c != '.') && (c != '"') && (c != '(') && (c != ')') && (c != ',')); |
310 | }); | ||
166 | 311 | ||
167 | return canonical; | 312 | return result; |
168 | } \ No newline at end of file | 313 | } \ No newline at end of file |
diff --git a/kgramstats.h b/kgramstats.h index b40e1ab..059eb05 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -23,6 +23,11 @@ private: | |||
23 | int titlecase; | 23 | int titlecase; |
24 | int uppercase; | 24 | int uppercase; |
25 | int period; | 25 | int period; |
26 | int startquote; | ||
27 | int endquote; | ||
28 | int startparen; | ||
29 | int endparen; | ||
30 | int comma; | ||
26 | string* token; | 31 | string* token; |
27 | } token_data; | 32 | } token_data; |
28 | int maxK; | 33 | int maxK; |