diff options
| -rw-r--r-- | kgramstats.cpp | 85 |
1 files changed, 41 insertions, 44 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 17598de..41517ca 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -206,7 +206,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 206 | 206 | ||
| 207 | for (int i=0; i<n; i++) | 207 | for (int i=0; i<n; i++) |
| 208 | { | 208 | { |
| 209 | if ((cur.size() > 0) && (cur != newKgram)) | 209 | /*if ((cur.size() > 0) && (cur != newKgram)) |
| 210 | { | 210 | { |
| 211 | if (rand() % (maxK - cur.size() + 1) == 0) | 211 | if (rand() % (maxK - cur.size() + 1) == 0) |
| 212 | { | 212 | { |
| @@ -223,7 +223,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 223 | } | 223 | } |
| 224 | 224 | ||
| 225 | cuts++; | 225 | cuts++; |
| 226 | } | 226 | }*/ |
| 227 | 227 | ||
| 228 | std::map<int, token_data*> distribution = *(*stats)[cur]; | 228 | std::map<int, token_data*> distribution = *(*stats)[cur]; |
| 229 | int max = distribution.rbegin()->first; | 229 | int max = distribution.rbegin()->first; |
| @@ -241,12 +241,9 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 241 | if (casing < next->uppercase) | 241 | if (casing < next->uppercase) |
| 242 | { | 242 | { |
| 243 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 243 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); |
| 244 | } else if ((casing - next->uppercase) < next->titlecase) | ||
| 245 | { | ||
| 246 | nextToken[0] = toupper(nextToken[0]); | ||
| 247 | } | 244 | } |
| 248 | 245 | ||
| 249 | if ((cur == newKgram) && (rand() % 3 < 2)) | 246 | if ((cur == newKgram) && (rand() % 15 > 0)) |
| 250 | { | 247 | { |
| 251 | nextToken[0] = toupper(nextToken[0]); | 248 | nextToken[0] = toupper(nextToken[0]); |
| 252 | } | 249 | } |
| @@ -255,48 +252,48 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 255 | if (mess) | 252 | if (mess) |
| 256 | { | 253 | { |
| 257 | nextToken = mstats.alternate(nextToken); | 254 | nextToken = mstats.alternate(nextToken); |
| 255 | } | ||
| 258 | 256 | ||
| 259 | if (startquote < next->startquote) | 257 | if (startquote < next->startquote) |
| 258 | { | ||
| 259 | nextToken = "\"" + nextToken; | ||
| 260 | } else if (startparen < next->startparen) | ||
| 261 | { | ||
| 262 | nextToken = "(" + nextToken; | ||
| 263 | } | ||
| 264 | |||
| 265 | if (period < next->period) | ||
| 266 | { | ||
| 267 | if (endquote < next->endquote) | ||
| 260 | { | 268 | { |
| 261 | nextToken = "\"" + nextToken; | 269 | nextToken += "\""; |
| 262 | } else if (startparen < next->startparen) | 270 | } else if (endparen < next->endparen) |
| 263 | { | 271 | { |
| 264 | nextToken = "(" + nextToken; | 272 | nextToken += ")"; |
| 265 | } | 273 | } |
| 266 | 274 | ||
| 267 | if (period < next->period) | 275 | int type = rand() % 6; |
| 268 | { | 276 | |
| 269 | if (endquote < next->endquote) | 277 | if (type < 3) |
| 270 | { | ||
| 271 | nextToken += "\""; | ||
| 272 | } else if (endparen < next->endparen) | ||
| 273 | { | ||
| 274 | nextToken += ")"; | ||
| 275 | } | ||
| 276 | |||
| 277 | int type = rand() % 6; | ||
| 278 | |||
| 279 | if (type < 3) | ||
| 280 | { | ||
| 281 | nextToken += "."; | ||
| 282 | } else if (type < 5) | ||
| 283 | { | ||
| 284 | nextToken += "!"; | ||
| 285 | } else { | ||
| 286 | nextToken += "?"; | ||
| 287 | } | ||
| 288 | } else if (comma < next->comma) | ||
| 289 | { | 278 | { |
| 290 | if (endquote < next->endquote) | 279 | nextToken += "."; |
| 291 | { | 280 | } else if (type < 5) |
| 292 | nextToken += "\""; | 281 | { |
| 293 | } else if (endparen < next->endparen) | 282 | nextToken += "!"; |
| 294 | { | 283 | } else { |
| 295 | nextToken += ")"; | 284 | nextToken += "?"; |
| 296 | } | ||
| 297 | |||
| 298 | nextToken += ","; | ||
| 299 | } | 285 | } |
| 286 | } else if (comma < next->comma) | ||
| 287 | { | ||
| 288 | if (endquote < next->endquote) | ||
| 289 | { | ||
| 290 | nextToken += "\""; | ||
| 291 | } else if (endparen < next->endparen) | ||
| 292 | { | ||
| 293 | nextToken += ")"; | ||
| 294 | } | ||
| 295 | |||
| 296 | nextToken += ","; | ||
| 300 | } | 297 | } |
| 301 | 298 | ||
| 302 | if (cur.size() == maxK) | 299 | if (cur.size() == maxK) |
| @@ -324,7 +321,7 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 324 | cur.pop_front(); | 321 | cur.pop_front(); |
| 325 | } | 322 | } |
| 326 | 323 | ||
| 327 | if ((period < next->period) && ((rand() % 2) == 0)) | 324 | if ((period < next->period) && ((rand() % 3) == 0)) |
| 328 | { | 325 | { |
| 329 | cur = newKgram; | 326 | cur = newKgram; |
| 330 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) | 327 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) |
| @@ -361,5 +358,5 @@ std::string canonize(std::string f) | |||
| 361 | std::string result; | 358 | std::string result; |
| 362 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 359 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); |
| 363 | 360 | ||
| 364 | return canonical; | 361 | return result; |
| 365 | } | 362 | } |
