diff options
| -rw-r--r-- | Makefile.am | 3 | ||||
| -rw-r--r-- | kgramstats.cpp | 187 | ||||
| -rw-r--r-- | kgramstats.h | 5 | 
3 files changed, 173 insertions, 22 deletions
| diff --git a/Makefile.am b/Makefile.am index 299dc10..150ede2 100644 --- a/Makefile.am +++ b/Makefile.am | |||
| @@ -4,5 +4,6 @@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} | |||
| 4 | bin_PROGRAMS = rawr-ebooks rawr-gen | 4 | bin_PROGRAMS = rawr-ebooks rawr-gen | 
| 5 | rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp | 5 | rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp | 
| 6 | rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp | 6 | rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp | 
| 7 | rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) | 7 | rawr_gen_CPPFLAGS = -std=c++11 | 
| 8 | rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) -std=c++11 | ||
| 8 | rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file | 9 | rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file | 
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 2321b11..1f3dd3c 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -28,6 +28,8 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 28 | } | 28 | } | 
| 29 | 29 | ||
| 30 | map<kgram, map<string, token_data*>* > tstats; | 30 | map<kgram, map<string, token_data*>* > tstats; | 
| 31 | bool newSentence = true; | ||
| 32 | bool newClause = false; | ||
| 31 | for (int k=0; k<=maxK; k++) | 33 | for (int k=0; k<=maxK; k++) | 
| 32 | { | 34 | { | 
| 33 | for (int i=0; i<(tokens.size() - k); i++) | 35 | for (int i=0; i<(tokens.size() - k); i++) | 
| @@ -50,11 +52,83 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 50 | token_data* td = tstats[seq]->at(canonical); | 52 | token_data* td = tstats[seq]->at(canonical); | 
| 51 | td->token = new string(canonical); | 53 | td->token = new string(canonical); | 
| 52 | td->all++; | 54 | td->all++; | 
| 55 | |||
| 56 | if (newSentence) | ||
| 57 | { | ||
| 58 | kgram newKgram(1, "."); | ||
| 59 | if (tstats[newKgram] == NULL) | ||
| 60 | { | ||
| 61 | tstats[newKgram] = new map<string, token_data*>(); | ||
| 62 | } | ||
| 63 | |||
| 64 | (*tstats[newKgram])[canonical] = td; | ||
| 65 | |||
| 66 | newSentence = false; | ||
| 67 | } | ||
| 68 | |||
| 69 | if (newClause) | ||
| 70 | { | ||
| 71 | kgram commaKgram(1, ","); | ||
| 72 | if (tstats[commaKgram] == NULL) | ||
| 73 | { | ||
| 74 | tstats[commaKgram] = new map<string, token_data*>(); | ||
| 75 | } | ||
| 76 | |||
| 77 | (*tstats[commaKgram])[canonical] = td; | ||
| 78 | |||
| 79 | newClause = false; | ||
| 80 | } | ||
| 53 | 81 | ||
| 54 | if ((f.length() > 0) && (f[f.length()-1] == '.')) | 82 | if ((f.length() > 0) && (f[f.length()-1] == '.')) | 
| 55 | { | 83 | { | 
| 56 | td->period++; | 84 | td->period++; | 
| 85 | newSentence = true; | ||
| 57 | } | 86 | } | 
| 87 | |||
| 88 | if (f.length() > 0) | ||
| 89 | { | ||
| 90 | if (f[0] == '"') | ||
| 91 | { | ||
| 92 | td->startquote++; | ||
| 93 | } | ||
| 94 | |||
| 95 | if (f[f.length()-1] == '"') | ||
| 96 | { | ||
| 97 | td->endquote++; | ||
| 98 | |||
| 99 | if ((f.length() > 1) && (f[f.length()-2] == ',')) | ||
| 100 | { | ||
| 101 | td->comma++; | ||
| 102 | newClause = true; | ||
| 103 | } | ||
| 104 | } | ||
| 105 | |||
| 106 | if (f[f.length()-1] == ',') | ||
| 107 | { | ||
| 108 | td->comma++; | ||
| 109 | newClause = true; | ||
| 110 | |||
| 111 | if ((f.length() > 1) && (f[f.length()-2] == '"')) | ||
| 112 | { | ||
| 113 | td->endquote++; | ||
| 114 | } | ||
| 115 | |||
| 116 | if ((f.length() > 1) && (f[f.length()-2] == ')')) | ||
| 117 | { | ||
| 118 | td->endparen++; | ||
| 119 | } | ||
| 120 | } | ||
| 121 | |||
| 122 | if (f[0] == '(') | ||
| 123 | { | ||
| 124 | td->startparen++; | ||
| 125 | } | ||
| 126 | |||
| 127 | if (f[f.length()-1] == ')') | ||
| 128 | { | ||
| 129 | td->endparen++; | ||
| 130 | } | ||
| 131 | } | ||
| 58 | 132 | ||
| 59 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 133 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 
| 60 | { | 134 | { | 
| @@ -97,22 +171,31 @@ void printKgram(kgram k) | |||
| 97 | vector<string> kgramstats::randomSentence(int n) | 171 | vector<string> kgramstats::randomSentence(int n) | 
| 98 | { | 172 | { | 
| 99 | vector<string> result; | 173 | vector<string> result; | 
| 100 | list<string> cur; | 174 | kgram newKgram(1, "."); | 
| 175 | kgram commaKgram(1, ","); | ||
| 176 | list<string> cur = newKgram; | ||
| 177 | int cuts = 0; | ||
| 101 | 178 | ||
| 102 | for (int i=0; i<n; i++) | 179 | for (int i=0; i<n; i++) | 
| 103 | { | 180 | { | 
| 104 | if ((rand() % (maxK - cur.size() + 1)) == 0) | 181 | if ((cur.size() > 0) && (cur != newKgram)) | 
| 105 | { | 182 | { | 
| 106 | for (int i=0; i<cur.size(); i++) | 183 | if (rand() % (maxK - cur.size() + 1) == 0) | 
| 107 | { | 184 | { | 
| 108 | if ((rand() % 3) == 0) | 185 | while (cur.size() > 0) | 
| 109 | { | 186 | { | 
| 110 | cur.pop_front(); | 187 | if ((rand() % (n)) < cuts) | 
| 111 | } else { | 188 | { | 
| 112 | break; | 189 | cur.pop_front(); | 
| 113 | } | 190 | cuts--; | 
| 114 | } | 191 | } else { | 
| 115 | } | 192 | break; | 
| 193 | } | ||
| 194 | } | ||
| 195 | } | ||
| 196 | |||
| 197 | cuts++; | ||
| 198 | } | ||
| 116 | 199 | ||
| 117 | map<int, token_data*> distribution = *(*stats)[cur]; | 200 | map<int, token_data*> distribution = *(*stats)[cur]; | 
| 118 | int max = distribution.rbegin()->first; | 201 | int max = distribution.rbegin()->first; | 
| @@ -122,6 +205,11 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 122 | string nextToken(*(next->token)); | 205 | string nextToken(*(next->token)); | 
| 123 | int casing = rand() % next->all; | 206 | int casing = rand() % next->all; | 
| 124 | int period = rand() % next->all; | 207 | int period = rand() % next->all; | 
| 208 | int startparen = rand() % next->all; | ||
| 209 | int endparen = rand() % next->all; | ||
| 210 | int startquote = rand() % next->all; | ||
| 211 | int endquote = rand() % next->all; | ||
| 212 | int comma = rand() % next->all; | ||
| 125 | if (casing < next->uppercase) | 213 | if (casing < next->uppercase) | 
| 126 | { | 214 | { | 
| 127 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 215 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 
| @@ -129,11 +217,53 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 129 | { | 217 | { | 
| 130 | nextToken[0] = toupper(nextToken[0]); | 218 | nextToken[0] = toupper(nextToken[0]); | 
| 131 | } | 219 | } | 
| 220 | |||
| 221 | if ((cur == newKgram) && (rand() % 3 < 2)) | ||
| 222 | { | ||
| 223 | nextToken[0] = toupper(nextToken[0]); | ||
| 224 | } | ||
| 225 | |||
| 226 | if (startquote < next->startquote) | ||
| 227 | { | ||
| 228 | nextToken = "\"" + nextToken; | ||
| 229 | } else if (startparen < next->startparen) | ||
| 230 | { | ||
| 231 | nextToken = "(" + nextToken; | ||
| 232 | } | ||
| 132 | 233 | ||
| 133 | if (period < next->period) | 234 | if (period < next->period) | 
| 134 | { | 235 | { | 
| 135 | nextToken += "."; | 236 | if (endquote < next->endquote) | 
| 136 | } | 237 | { | 
| 238 | nextToken += "\""; | ||
| 239 | } else if (endparen < next->endparen) | ||
| 240 | { | ||
| 241 | nextToken += ")"; | ||
| 242 | } | ||
| 243 | |||
| 244 | int type = rand() % 6; | ||
| 245 | |||
| 246 | if (type < 3) | ||
| 247 | { | ||
| 248 | nextToken += "."; | ||
| 249 | } else if (type < 5) | ||
| 250 | { | ||
| 251 | nextToken += "!"; | ||
| 252 | } else { | ||
| 253 | nextToken += "?"; | ||
| 254 | } | ||
| 255 | } else if (comma < next->comma) | ||
| 256 | { | ||
| 257 | if (endquote < next->endquote) | ||
| 258 | { | ||
| 259 | nextToken += "\""; | ||
| 260 | } else if (endparen < next->endparen) | ||
| 261 | { | ||
| 262 | nextToken += ")"; | ||
| 263 | } | ||
| 264 | |||
| 265 | nextToken += ","; | ||
| 266 | } | ||
| 137 | 267 | ||
| 138 | if (cur.size() == maxK) | 268 | if (cur.size() == maxK) | 
| 139 | { | 269 | { | 
| @@ -147,8 +277,22 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 147 | } | 277 | } | 
| 148 | 278 | ||
| 149 | cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; | 279 | cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; | 
| 280 | |||
| 281 | if ((cur == newKgram) || (cur == commaKgram)) | ||
| 282 | { | ||
| 283 | cur.pop_front(); | ||
| 284 | } | ||
| 285 | |||
| 286 | if ((period < next->period) && ((rand() % 2) == 0)) | ||
| 287 | { | ||
| 288 | cur = newKgram; | ||
| 289 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) | ||
| 290 | { | ||
| 291 | cur = commaKgram; | ||
| 292 | } else { | ||
| 293 | cur.push_back(*(next->token)); | ||
| 294 | } | ||
| 150 | 295 | ||
| 151 | cur.push_back(*(next->token)); | ||
| 152 | result.push_back(nextToken); | 296 | result.push_back(nextToken); | 
| 153 | } | 297 | } | 
| 154 | 298 | ||
| @@ -159,10 +303,11 @@ std::string canonize(std::string f) | |||
| 159 | { | 303 | { | 
| 160 | string canonical(f); | 304 | string canonical(f); | 
| 161 | transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | 305 | transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | 
| 162 | if (canonical[canonical.length()-1] == '.') | 306 | |
| 163 | { | 307 | string result; | 
| 164 | canonical.resize(canonical.find('.')); | 308 | remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), [] (char c) { | 
| 165 | } | 309 | return !((c != '.') && (c != '"') && (c != '(') && (c != ')') && (c != ',')); | 
| 310 | }); | ||
| 166 | 311 | ||
| 167 | return canonical; | 312 | return result; | 
| 168 | } \ No newline at end of file | 313 | } \ No newline at end of file | 
| diff --git a/kgramstats.h b/kgramstats.h index b40e1ab..059eb05 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -23,6 +23,11 @@ private: | |||
| 23 | int titlecase; | 23 | int titlecase; | 
| 24 | int uppercase; | 24 | int uppercase; | 
| 25 | int period; | 25 | int period; | 
| 26 | int startquote; | ||
| 27 | int endquote; | ||
| 28 | int startparen; | ||
| 29 | int endparen; | ||
| 30 | int comma; | ||
| 26 | string* token; | 31 | string* token; | 
| 27 | } token_data; | 32 | } token_data; | 
| 28 | int maxK; | 33 | int maxK; | 
