From b316e309559d7176af6cf0bb7dcd6dbaa83c01cd Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Fri, 29 Jan 2016 12:43:00 -0500 Subject: Rewrote how tokens are handled A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags. Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace it. --- kgramstats.h | 124 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 60 deletions(-) (limited to 'kgramstats.h') diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h @@ -2,61 +2,89 @@ #include #include #include -#include "malaprop.h" +#include "histogram.h" #ifndef KGRAMSTATS_H #define KGRAMSTATS_H -enum tokentype { - tokentype_literal, - tokentype_hashtag +struct word { + std::string canon; + histogram forms; + histogram terms; + + word(std::string canon) : canon(canon) {} + + bool operator<(const word& other) const + { + return canon < other.canon; + } }; -struct token { - tokentype type; - std::string canon; - bool terminating; +extern word blank_word; + +enum class suffixtype { + none, + terminating, + comma +}; + +enum class parentype { + paren, + square_bracket, + asterisk, + quote +}; + +enum class doublestatus { + opening, + closing, + both +}; + +struct delimiter { + parentype type; + doublestatus status; + + delimiter(parentype type, doublestatus status) : type(type), status(status) {} - token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} - token(tokentype type) : type(type), canon(""), terminating(false) {} + bool operator<(const delimiter& other) const + { + return std::tie(type, status) < std::tie(other.type, other.status); + } +}; + +struct token { + const word& w; + std::map delimiters; + suffixtype suffix; + std::string raw; + + 
token(const word& w) : w(w), suffix(suffixtype::none) {} bool operator<(const token& other) const { - if (type != other.type) - { - return type < other.type; - } else if (type == tokentype_literal) - { - if (canon == other.canon) - { - return !terminating && other.terminating; - } else { - return canon < other.canon; - } - } else { - return !terminating && other.terminating; - } + return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); } }; -enum querytype { - querytype_literal, - querytype_sentence +enum class querytype { + literal, + sentence }; struct query { querytype type; - token word; + token tok; - query(token word) : word(word), type(querytype_literal) {} + query(token tok) : tok(tok), type(querytype::literal) {} - query(querytype type) : word(""), type(type) {} + query(querytype type) : tok(blank_word), type(type) {} bool operator<(const query& other) const { if (type == other.type) { - return word < other.word; + return tok < other.tok; } else { return type < other.type; } @@ -65,34 +93,11 @@ struct query { typedef std::list kgram; -struct termstats { - char terminator; - int occurrences; - - termstats() : terminator('.'), occurrences(1) {} - - termstats(char terminator, int occurrences) - { - this->terminator = terminator; - this->occurrences = occurrences; - } - - bool operator<(const termstats& other) const - { - if (terminator == other.terminator) - { - return occurrences < other.occurrences; - } else { - return terminator < other.terminator; - } - } -}; - class kgramstats { public: kgramstats(std::string corpus, int maxK); - std::vector randomSentence(int n); + std::string randomSentence(int n); private: struct token_data @@ -100,16 +105,15 @@ private: int all; int titlecase; int uppercase; - token word; + token tok; - token_data() : word(""), all(0), titlecase(0), uppercase(0) {} + token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} }; int maxK; std::map > stats; - malaprop mstats; - std::map 
> endings; - std::vector hashtags; + word hashtags {"#hashtag"}; + std::map words; }; void printKgram(kgram k); -- cgit 1.4.1