diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2019-02-27 21:52:13 -0500 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2019-02-27 21:52:13 -0500 |
| commit | 47d9ba41accf6410bb9cf26b49a1c76129a49975 (patch) | |
| tree | 069ba764a56561070df3ee7990f83a2935a96b9e | |
| parent | d7addd0adc6b42a1e10d92b96d27ab484049dd8e (diff) | |
| download | rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.tar.gz rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.tar.bz2 rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.zip | |
The beginning of a corpus should be treated as a new sentence
| -rw-r--r-- | kgramstats.cpp | 27 |
1 file changed, 27 insertions, 0 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 37830e4..7ece80f 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -396,6 +396,33 @@ void rawr::compile(int maxK) | |||
| 396 | int corpid = 0; | 396 | int corpid = 0; |
| 397 | for (auto corpus : tokens) | 397 | for (auto corpus : tokens) |
| 398 | { | 398 | { |
| 399 | for (int k=0; k<maxK && k<corpus.size(); k++) | ||
| 400 | { | ||
| 401 | // The zero'th token should be a terminator. | ||
| 402 | token_id fid = corpus[k]; | ||
| 403 | const token& f = _tokenstore.get(fid); | ||
| 404 | |||
| 405 | kgram term_prefix(corpus.begin(), corpus.begin()+k); | ||
| 406 | term_prefix.push_front(wildcardQuery); | ||
| 407 | |||
| 408 | if (tstats[term_prefix].count(fid) == 0) | ||
| 409 | { | ||
| 410 | tstats[term_prefix].emplace(fid, fid); | ||
| 411 | } | ||
| 412 | |||
| 413 | token_data& td2 = tstats[term_prefix].at(fid); | ||
| 414 | td2.all++; | ||
| 415 | td2.corpora.insert(corpid); | ||
| 416 | |||
| 417 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | ||
| 418 | { | ||
| 419 | td2.uppercase++; | ||
| 420 | } else if (isupper(f.raw[0])) | ||
| 421 | { | ||
| 422 | td2.titlecase++; | ||
| 423 | } | ||
| 424 | } | ||
| 425 | |||
| 399 | for (int k=1; k<maxK && k<corpus.size(); k++) | 426 | for (int k=1; k<maxK && k<corpus.size(); k++) |
| 400 | { | 427 | { |
| 401 | for (int i=0; i<(corpus.size() - k); i++) | 428 | for (int i=0; i<(corpus.size() - k); i++) |
