about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2019-02-27 21:52:13 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2019-02-27 21:52:13 -0500
commit 47d9ba41accf6410bb9cf26b49a1c76129a49975 (patch)
tree 069ba764a56561070df3ee7990f83a2935a96b9e /kgramstats.cpp
parent d7addd0adc6b42a1e10d92b96d27ab484049dd8e (diff)
download rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.tar.gz
rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.tar.bz2
rawr-ebooks-47d9ba41accf6410bb9cf26b49a1c76129a49975.zip
The beginning of a corpus should be treated as a new sentence
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- kgramstats.cpp 27
1 files changed, 27 insertions, 0 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 37830e4..7ece80f 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -396,6 +396,33 @@ void rawr::compile(int maxK)
396 int corpid = 0; 396 int corpid = 0;
397 for (auto corpus : tokens) 397 for (auto corpus : tokens)
398 { 398 {
399 for (int k=0; k<maxK && k<corpus.size(); k++)
400 {
401 // The zero'th token should be a terminator.
402 token_id fid = corpus[k];
403 const token& f = _tokenstore.get(fid);
404
405 kgram term_prefix(corpus.begin(), corpus.begin()+k);
406 term_prefix.push_front(wildcardQuery);
407
408 if (tstats[term_prefix].count(fid) == 0)
409 {
410 tstats[term_prefix].emplace(fid, fid);
411 }
412
413 token_data& td2 = tstats[term_prefix].at(fid);
414 td2.all++;
415 td2.corpora.insert(corpid);
416
417 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
418 {
419 td2.uppercase++;
420 } else if (isupper(f.raw[0]))
421 {
422 td2.titlecase++;
423 }
424 }
425
399 for (int k=1; k<maxK && k<corpus.size(); k++) 426 for (int k=1; k<maxK && k<corpus.size(); k++)
400 { 427 {
401 for (int i=0; i<(corpus.size() - k); i++) 428 for (int i=0; i<(corpus.size() - k); i++)