diff options
| -rw-r--r-- | kgramstats.cpp | 27 |
1 files changed, 27 insertions, 0 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 37830e4..7ece80f 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -396,6 +396,33 @@ void rawr::compile(int maxK) | |||
| 396 | int corpid = 0; | 396 | int corpid = 0; |
| 397 | for (auto corpus : tokens) | 397 | for (auto corpus : tokens) |
| 398 | { | 398 | { |
| 399 | for (int k=0; k<maxK && k<corpus.size(); k++) | ||
| 400 | { | ||
| 401 | // The zeroth token should be a terminator. | ||
| 402 | token_id fid = corpus[k]; | ||
| 403 | const token& f = _tokenstore.get(fid); | ||
| 404 | |||
| 405 | kgram term_prefix(corpus.begin(), corpus.begin()+k); | ||
| 406 | term_prefix.push_front(wildcardQuery); | ||
| 407 | |||
| 408 | if (tstats[term_prefix].count(fid) == 0) | ||
| 409 | { | ||
| 410 | tstats[term_prefix].emplace(fid, fid); | ||
| 411 | } | ||
| 412 | |||
| 413 | token_data& td2 = tstats[term_prefix].at(fid); | ||
| 414 | td2.all++; | ||
| 415 | td2.corpora.insert(corpid); | ||
| 416 | |||
| 417 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | ||
| 418 | { | ||
| 419 | td2.uppercase++; | ||
| 420 | } else if (isupper(f.raw[0])) | ||
| 421 | { | ||
| 422 | td2.titlecase++; | ||
| 423 | } | ||
| 424 | } | ||
| 425 | |||
| 399 | for (int k=1; k<maxK && k<corpus.size(); k++) | 426 | for (int k=1; k<maxK && k<corpus.size(); k++) |
| 400 | { | 427 | { |
| 401 | for (int i=0; i<(corpus.size() - k); i++) | 428 | for (int i=0; i<(corpus.size() - k); i++) |
