From 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Fri, 20 May 2016 23:14:06 -0400 Subject: Pulled the ebooks functionality out into a library --- kgramstats.h | 201 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 106 insertions(+), 95 deletions(-) (limited to 'kgramstats.h') diff --git a/kgramstats.h b/kgramstats.h index 5fad37d..ee75ada 100644 --- a/kgramstats.h +++ b/kgramstats.h @@ -1,124 +1,135 @@ +#ifndef KGRAMSTATS_H +#define KGRAMSTATS_H + #include #include #include #include #include "histogram.h" +#include -#ifndef KGRAMSTATS_H -#define KGRAMSTATS_H - -struct word { - std::string canon; - histogram forms; - histogram terms; +class rawr { + public: + typedef std::function transform_callback; + + void addCorpus(std::string corpus); + void compile(int maxK); + + void setTransformCallback(transform_callback _arg); + std::string randomSentence(int maxL); + + private: + struct word { + std::string canon; + histogram forms; + histogram terms; - word(std::string canon) : canon(canon) {} + word(std::string canon) : canon(canon) {} - bool operator<(const word& other) const - { - return canon < other.canon; - } -}; - -extern word blank_word; + bool operator<(const word& other) const + { + return canon < other.canon; + } + }; -enum class suffixtype { - none, - terminating, - comma -}; + enum class suffixtype { + none, + terminating, + comma + }; -enum class parentype { - paren, - square_bracket, - asterisk, - quote -}; + enum class parentype { + paren, + square_bracket, + asterisk, + quote + }; -enum class doublestatus { - opening, - closing, - both -}; + enum class doublestatus { + opening, + closing, + both + }; -struct delimiter { - parentype type; - doublestatus status; + struct delimiter { + parentype type; + doublestatus status; - delimiter(parentype type, doublestatus status) : type(type), status(status) {} + delimiter(parentype type, doublestatus status) : type(type), status(status) {} - bool operator<(const delimiter& other) const - { - return std::tie(type, status) < std::tie(other.type, other.status); - } -}; + bool operator<(const delimiter& other) const + { + return std::tie(type, status) < std::tie(other.type, other.status); + } + }; -struct token { - const word& w; - std::map delimiters; - suffixtype suffix; - std::string raw; + struct token { + const word& w; + std::map delimiters; + suffixtype suffix; + std::string raw; - token(const word& w) : w(w), suffix(suffixtype::none) {} + token(const word& w) : w(w), suffix(suffixtype::none) {} - bool operator<(const token& other) const - { - return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); - } -}; + bool operator<(const token& other) const + { + return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); + } + }; -enum class querytype { - literal, - sentence -}; + enum class querytype { + literal, + sentence + }; -struct query { - querytype type; - token tok; + struct query { + querytype type; + token tok; - query(token tok) : tok(tok), type(querytype::literal) {} + query(token tok) : tok(tok), type(querytype::literal) {} - query(querytype type) : tok(blank_word), type(type) {} + query(querytype type) : tok(blank_word), type(type) {} - bool operator<(const query& other) const - { - if (type == other.type) - { - return tok < other.tok; - } else { - return type < other.type; - } - } -}; - -typedef std::list kgram; + bool operator<(const query& other) const + { + if (type == other.type) + { + return tok < other.tok; + } else { + return type < other.type; + } + } + }; + + static const query wildcardQuery; + static const word blank_word; -class kgramstats -{ -public: - kgramstats(std::string corpus, int maxK); - std::string randomSentence(int maxL); - -private: - struct token_data - { - int all; - int titlecase; - int uppercase; - token tok; + typedef std::list kgram; - token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} - }; + struct token_data + { + int all; + int titlecase; + int uppercase; + token tok; + + token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} + }; + + friend std::ostream& operator<<(std::ostream& os, kgram k); + friend std::ostream& operator<<(std::ostream& os, query q); + friend std::ostream& operator<<(std::ostream& os, token t); - int maxK; - std::map > stats; + int _maxK; + bool _compiled = false; + std::vector _corpora; + std::map> _stats; + transform_callback _transform; - // Words - std::map words; - word hashtags {"#hashtag"}; - word emoticons {"👌"}; + // Words + std::map words; + word hashtags {"#hashtag"}; + word emoticons {"👌"}; }; -void printKgram(kgram k); - #endif \ No newline at end of file -- cgit 1.4.1