diff options
-rw-r--r-- | kgramstats.cpp | 27 |
1 files changed, 27 insertions, 0 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 37830e4..7ece80f 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -396,6 +396,33 @@ void rawr::compile(int maxK) | |||
396 | int corpid = 0; | 396 | int corpid = 0; |
397 | for (auto corpus : tokens) | 397 | for (auto corpus : tokens) |
398 | { | 398 | { |
399 | for (int k=0; k<maxK && k<corpus.size(); k++) | ||
400 | { | ||
401 | // The zeroth token should be a terminator. | ||
402 | token_id fid = corpus[k]; | ||
403 | const token& f = _tokenstore.get(fid); | ||
404 | |||
405 | kgram term_prefix(corpus.begin(), corpus.begin()+k); | ||
406 | term_prefix.push_front(wildcardQuery); | ||
407 | |||
408 | if (tstats[term_prefix].count(fid) == 0) | ||
409 | { | ||
410 | tstats[term_prefix].emplace(fid, fid); | ||
411 | } | ||
412 | |||
413 | token_data& td2 = tstats[term_prefix].at(fid); | ||
414 | td2.all++; | ||
415 | td2.corpora.insert(corpid); | ||
416 | |||
417 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | ||
418 | { | ||
419 | td2.uppercase++; | ||
420 | } else if (isupper(f.raw[0])) | ||
421 | { | ||
422 | td2.titlecase++; | ||
423 | } | ||
424 | } | ||
425 | |||
399 | for (int k=1; k<maxK && k<corpus.size(); k++) | 426 | for (int k=1; k<maxK && k<corpus.size(); k++) |
400 | { | 427 | { |
401 | for (int i=0; i<(corpus.size() - k); i++) | 428 | for (int i=0; i<(corpus.size() - k); i++) |