Weighed token casing and presence of periods

Tokens that differ only in casing or in the presence of a trailing period are now considered the same token. When a token is generated, its casing is chosen based on the prevalence of Upper/Title/Lower casing of that token in the input corpus; similarly, a period is appended to the token based on how often the same token ended with a period in the input corpus.
author: Feffernoose <fefferburbia@gmail.com> 2013-10-01 21:29:15 -0400
committer: Feffernoose <fefferburbia@gmail.com> 2013-10-01 21:29:15 -0400
commit: 420a7a1e004410f1377a6d919d72d18f8ae34bdf (patch)
tree: 33c0fc579e8f4e3d93757d886354309786941a13 /kgramstats.h
parent: 8de3134bf2cd26ff81359df703e5fbc6280448d7 (diff)
download: rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.gz
rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.bz2
rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.zip
1 file changed, 9 insertions, 3 deletions
diff --git a/kgramstats.h b/kgramstats.h
index 069bb90..248b193 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -14,15 +14,21 @@ class kgramstats
 {
 public:
        kgramstats(string corpus, int maxK);
-        map<string, int>* lookupExts(kgram tk);
-        int getMaxK();
        vector<string> randomSentence(int n);
        
 private:
+        typedef struct
+        {
+                int all;
+                int titlecase;
+                int uppercase;
+                int period;
+        } token_data;
        int maxK;
-        map<kgram, map<string, int>* >* stats;
+        map<kgram, map<string, token_data*>* >* stats;
 };
 void printKgram(kgram k);
+std::string canonize(std::string f);
 #endif
 \ No newline at end of file
author	Feffernoose <fefferburbia@gmail.com>	2013-10-01 21:29:15 -0400
committer	Feffernoose <fefferburbia@gmail.com>	2013-10-01 21:29:15 -0400
commit	420a7a1e004410f1377a6d919d72d18f8ae34bdf (patch)
tree	33c0fc579e8f4e3d93757d886354309786941a13 /kgramstats.h
parent	8de3134bf2cd26ff81359df703e5fbc6280448d7 (diff)
download	rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.gz rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.bz2 rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.zip