about summary refs log tree commit diff stats
path: root/kgramstats.h
diff options
context:
space:
mode:
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-31 21:39:39 -0400
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-31 21:39:39 -0400
commit: 5ce05b81520d06a78165c5c5039007c9f29d4b23 (patch)
tree: 118eb3d689a0dfe2581b4a941efa242708dda31c /kgramstats.h
parent: ae60f5f679da06b3824accdd14482189fec9dc85 (diff)
download: rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.tar.gz
rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.tar.bz2
rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.zip
Added ability to require a minimum number of corpora in generated output
Also fixed a bug with tokenizing multiple corpora.
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- kgramstats.h 4
1 files changed, 4 insertions, 0 deletions
diff --git a/kgramstats.h b/kgramstats.h
index fc01101..d939ade 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -7,6 +7,7 @@
7#include <vector> 7#include <vector>
8#include "histogram.h" 8#include "histogram.h"
9#include <functional> 9#include <functional>
10#include <set>
10 11
11class rawr { 12class rawr {
12 public: 13 public:
@@ -16,6 +17,7 @@ class rawr {
16 void compile(int maxK); 17 void compile(int maxK);
17 18
18 void setTransformCallback(transform_callback _arg); 19 void setTransformCallback(transform_callback _arg);
20 void setMinCorpora(int _arg);
19 std::string randomSentence(int maxL); 21 std::string randomSentence(int maxL);
20 22
21 private: 23 private:
@@ -125,6 +127,7 @@ class rawr {
125 int titlecase; 127 int titlecase;
126 int uppercase; 128 int uppercase;
127 token tok; 129 token tok;
130 std::set<int> corpora;
128 131
129 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} 132 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
130 }; 133 };
@@ -139,6 +142,7 @@ class rawr {
139 std::vector<std::string> _corpora; 142 std::vector<std::string> _corpora;
140 std::map<kgram, std::map<int, token_data>> _stats; 143 std::map<kgram, std::map<int, token_data>> _stats;
141 transform_callback _transform; 144 transform_callback _transform;
145 int _min_corpora = 1;
142 146
143 // Words 147 // Words
144 std::map<std::string, word> words; 148 std::map<std::string, word> words;