From 5ce05b81520d06a78165c5c5039007c9f29d4b23 Mon Sep 17 00:00:00 2001
From: Kelly Rauchenberger
Date: Tue, 31 May 2016 21:39:39 -0400
Subject: Added ability to require a minimum number of corpora in generated output

Also fixed a bug with tokenizing multiple corpora.
---
 kgramstats.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kgramstats.h')

diff --git a/kgramstats.h b/kgramstats.h
index fc01101..d939ade 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -7,6 +7,7 @@
 #include
 #include "histogram.h"
 #include
+#include
 
 class rawr {
   public:
@@ -16,6 +17,7 @@ class rawr {
     void compile(int maxK);
 
     void setTransformCallback(transform_callback _arg);
+    void setMinCorpora(int _arg);
     std::string randomSentence(int maxL);
 
   private:
@@ -125,6 +127,7 @@ class rawr {
       int titlecase;
       int uppercase;
       token tok;
+      std::set corpora;
 
       token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
     };
@@ -139,6 +142,7 @@ class rawr {
     std::vector _corpora;
     std::map> _stats;
     transform_callback _transform;
+    int _min_corpora = 1;
 
     // Words
     std::map words;
-- 
cgit 1.4.1