diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-31 21:39:39 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-31 21:39:39 -0400 |
commit | 5ce05b81520d06a78165c5c5039007c9f29d4b23 (patch) | |
tree | 118eb3d689a0dfe2581b4a941efa242708dda31c /kgramstats.h | |
parent | ae60f5f679da06b3824accdd14482189fec9dc85 (diff) | |
download | rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.tar.gz rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.tar.bz2 rawr-ebooks-5ce05b81520d06a78165c5c5039007c9f29d4b23.zip |
Added ability to require a minimum number of corpora in generated output.
Also fixed a bug with tokenizing multiple corpora.
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- | kgramstats.h | 4 |
1 file changed, 4 insertions, 0 deletions
diff --git a/kgramstats.h b/kgramstats.h index fc01101..d939ade 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <vector> | 7 | #include <vector> |
8 | #include "histogram.h" | 8 | #include "histogram.h" |
9 | #include <functional> | 9 | #include <functional> |
10 | #include <set> | ||
10 | 11 | ||
11 | class rawr { | 12 | class rawr { |
12 | public: | 13 | public: |
@@ -16,6 +17,7 @@ class rawr { | |||
16 | void compile(int maxK); | 17 | void compile(int maxK); |
17 | 18 | ||
18 | void setTransformCallback(transform_callback _arg); | 19 | void setTransformCallback(transform_callback _arg); |
20 | void setMinCorpora(int _arg); | ||
19 | std::string randomSentence(int maxL); | 21 | std::string randomSentence(int maxL); |
20 | 22 | ||
21 | private: | 23 | private: |
@@ -125,6 +127,7 @@ class rawr { | |||
125 | int titlecase; | 127 | int titlecase; |
126 | int uppercase; | 128 | int uppercase; |
127 | token tok; | 129 | token tok; |
130 | std::set<int> corpora; | ||
128 | 131 | ||
129 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | 132 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
130 | }; | 133 | }; |
@@ -139,6 +142,7 @@ class rawr { | |||
139 | std::vector<std::string> _corpora; | 142 | std::vector<std::string> _corpora; |
140 | std::map<kgram, std::map<int, token_data>> _stats; | 143 | std::map<kgram, std::map<int, token_data>> _stats; |
141 | transform_callback _transform; | 144 | transform_callback _transform; |
145 | int _min_corpora = 1; | ||
142 | 146 | ||
143 | // Words | 147 | // Words |
144 | std::map<std::string, word> words; | 148 | std::map<std::string, word> words; |