diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2019-02-27 21:52:13 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2019-02-27 21:52:13 -0500 |
commit | 47d9ba41accf6410bb9cf26b49a1c76129a49975 (patch) | |
tree | 069ba764a56561070df3ee7990f83a2935a96b9e | |
parent | d7addd0adc6b42a1e10d92b96d27ab484049dd8e (diff) | |
download | rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.tar.gz rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.tar.bz2 rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.zip |
The beginning of a corpus should be treated as a new sentence
-rw-r--r-- | kgramstats.cpp | 27 |
1 files changed, 27 insertions, 0 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 37830e4..7ece80f 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp | |||
@@ -396,6 +396,33 @@ void rawr::compile(int maxK) | |||
396 | int corpid = 0; | 396 | int corpid = 0; |
397 | for (auto corpus : tokens) | 397 | for (auto corpus : tokens) |
398 | { | 398 | { |
399 | for (int k=0; k<maxK && k<corpus.size(); k++) | ||
400 | { | ||
401 | // The zero'th token should be a terminator. | ||
402 | token_id fid = corpus[k]; | ||
403 | const token& f = _tokenstore.get(fid); | ||
404 | |||
405 | kgram term_prefix(corpus.begin(), corpus.begin()+k); | ||
406 | term_prefix.push_front(wildcardQuery); | ||
407 | |||
408 | if (tstats[term_prefix].count(fid) == 0) | ||
409 | { | ||
410 | tstats[term_prefix].emplace(fid, fid); | ||
411 | } | ||
412 | |||
413 | token_data& td2 = tstats[term_prefix].at(fid); | ||
414 | td2.all++; | ||
415 | td2.corpora.insert(corpid); | ||
416 | |||
417 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | ||
418 | { | ||
419 | td2.uppercase++; | ||
420 | } else if (isupper(f.raw[0])) | ||
421 | { | ||
422 | td2.titlecase++; | ||
423 | } | ||
424 | } | ||
425 | |||
399 | for (int k=1; k<maxK && k<corpus.size(); k++) | 426 | for (int k=1; k<maxK && k<corpus.size(); k++) |
400 | { | 427 | { |
401 | for (int i=0; i<(corpus.size() - k); i++) | 428 | for (int i=0; i<(corpus.size() - k); i++) |