about summary refs log tree commit diff stats
path: root/kgramstats.h
diff options
context:
space:
mode:
author Feffernoose <fefferburbia@gmail.com> 2013-10-01 21:29:15 -0400
committer Feffernoose <fefferburbia@gmail.com> 2013-10-01 21:29:15 -0400
commit 420a7a1e004410f1377a6d919d72d18f8ae34bdf (patch)
tree 33c0fc579e8f4e3d93757d886354309786941a13 /kgramstats.h
parent 8de3134bf2cd26ff81359df703e5fbc6280448d7 (diff)
download rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.gz
rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.bz2
rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.zip
Weighed token casing and presence of periods
Tokens that differ only in casing or in the presence of a trailing period are
now considered the same token. When tokens are generated, each one is cased
based on the prevalence of Upper/Title/Lower casing of that token in the
input corpus; similarly, a period is appended to the word based on how often
the same token ended with a period in the input corpus.
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- kgramstats.h 12
1 files changed, 9 insertions, 3 deletions
diff --git a/kgramstats.h b/kgramstats.h
index 069bb90..248b193 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -14,15 +14,21 @@ class kgramstats
14{ 14{
15public: 15public:
16 kgramstats(string corpus, int maxK); 16 kgramstats(string corpus, int maxK);
17 map<string, int>* lookupExts(kgram tk);
18 int getMaxK();
19 vector<string> randomSentence(int n); 17 vector<string> randomSentence(int n);
20 18
21private: 19private:
20 typedef struct
21 {
22 int all;
23 int titlecase;
24 int uppercase;
25 int period;
26 } token_data;
22 int maxK; 27 int maxK;
23 map<kgram, map<string, int>* >* stats; 28 map<kgram, map<string, token_data*>* >* stats;
24}; 29};
25 30
26void printKgram(kgram k); 31void printKgram(kgram k);
32std::string canonize(std::string f);
27 33
28#endif \ No newline at end of file 34#endif \ No newline at end of file