diff options
Diffstat (limited to 'kgramstats.h')
| -rw-r--r-- | kgramstats.h | 124 |
1 files changed, 64 insertions, 60 deletions
| diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -2,61 +2,89 @@ | |||
| 2 | #include <map> | 2 | #include <map> |
| 3 | #include <list> | 3 | #include <list> |
| 4 | #include <vector> | 4 | #include <vector> |
| 5 | #include "malaprop.h" | 5 | #include "histogram.h" |
| 6 | 6 | ||
| 7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
| 8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
| 9 | 9 | ||
| 10 | enum tokentype { | 10 | struct word { |
| 11 | tokentype_literal, | 11 | std::string canon; |
| 12 | tokentype_hashtag | 12 | histogram<std::string> forms; |
| 13 | histogram<std::string> terms; | ||
| 14 | |||
| 15 | word(std::string canon) : canon(canon) {} | ||
| 16 | |||
| 17 | bool operator<(const word& other) const | ||
| 18 | { | ||
| 19 | return canon < other.canon; | ||
| 20 | } | ||
| 13 | }; | 21 | }; |
| 14 | 22 | ||
| 15 | struct token { | 23 | extern word blank_word; |
| 16 | tokentype type; | 24 | |
| 17 | std::string canon; | 25 | enum class suffixtype { |
| 18 | bool terminating; | 26 | none, |
| 27 | terminating, | ||
| 28 | comma | ||
| 29 | }; | ||
| 30 | |||
| 31 | enum class parentype { | ||
| 32 | paren, | ||
| 33 | square_bracket, | ||
| 34 | asterisk, | ||
| 35 | quote | ||
| 36 | }; | ||
| 37 | |||
| 38 | enum class doublestatus { | ||
| 39 | opening, | ||
| 40 | closing, | ||
| 41 | both | ||
| 42 | }; | ||
| 43 | |||
| 44 | struct delimiter { | ||
| 45 | parentype type; | ||
| 46 | doublestatus status; | ||
| 47 | |||
| 48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | ||
| 19 | 49 | ||
| 20 | token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} | 50 | bool operator<(const delimiter& other) const |
| 21 | token(tokentype type) : type(type), canon(""), terminating(false) {} | 51 | { |
| 52 | return std::tie(type, status) < std::tie(other.type, other.status); | ||
| 53 | } | ||
| 54 | }; | ||
| 55 | |||
| 56 | struct token { | ||
| 57 | const word& w; | ||
| 58 | std::map<delimiter, int> delimiters; | ||
| 59 | suffixtype suffix; | ||
| 60 | std::string raw; | ||
| 61 | |||
| 62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | ||
| 22 | 63 | ||
| 23 | bool operator<(const token& other) const | 64 | bool operator<(const token& other) const |
| 24 | { | 65 | { |
| 25 | if (type != other.type) | 66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
| 26 | { | ||
| 27 | return type < other.type; | ||
| 28 | } else if (type == tokentype_literal) | ||
| 29 | { | ||
| 30 | if (canon == other.canon) | ||
| 31 | { | ||
| 32 | return !terminating && other.terminating; | ||
| 33 | } else { | ||
| 34 | return canon < other.canon; | ||
| 35 | } | ||
| 36 | } else { | ||
| 37 | return !terminating && other.terminating; | ||
| 38 | } | ||
| 39 | } | 67 | } |
| 40 | }; | 68 | }; |
| 41 | 69 | ||
| 42 | enum querytype { | 70 | enum class querytype { |
| 43 | querytype_literal, | 71 | literal, |
| 44 | querytype_sentence | 72 | sentence |
| 45 | }; | 73 | }; |
| 46 | 74 | ||
| 47 | struct query { | 75 | struct query { |
| 48 | querytype type; | 76 | querytype type; |
| 49 | token word; | 77 | token tok; |
| 50 | 78 | ||
| 51 | query(token word) : word(word), type(querytype_literal) {} | 79 | query(token tok) : tok(tok), type(querytype::literal) {} |
| 52 | 80 | ||
| 53 | query(querytype type) : word(""), type(type) {} | 81 | query(querytype type) : tok(blank_word), type(type) {} |
| 54 | 82 | ||
| 55 | bool operator<(const query& other) const | 83 | bool operator<(const query& other) const |
| 56 | { | 84 | { |
| 57 | if (type == other.type) | 85 | if (type == other.type) |
| 58 | { | 86 | { |
| 59 | return word < other.word; | 87 | return tok < other.tok; |
| 60 | } else { | 88 | } else { |
| 61 | return type < other.type; | 89 | return type < other.type; |
| 62 | } | 90 | } |
| @@ -65,34 +93,11 @@ struct query { | |||
| 65 | 93 | ||
| 66 | typedef std::list<query> kgram; | 94 | typedef std::list<query> kgram; |
| 67 | 95 | ||
| 68 | struct termstats { | ||
| 69 | char terminator; | ||
| 70 | int occurrences; | ||
| 71 | |||
| 72 | termstats() : terminator('.'), occurrences(1) {} | ||
| 73 | |||
| 74 | termstats(char terminator, int occurrences) | ||
| 75 | { | ||
| 76 | this->terminator = terminator; | ||
| 77 | this->occurrences = occurrences; | ||
| 78 | } | ||
| 79 | |||
| 80 | bool operator<(const termstats& other) const | ||
| 81 | { | ||
| 82 | if (terminator == other.terminator) | ||
| 83 | { | ||
| 84 | return occurrences < other.occurrences; | ||
| 85 | } else { | ||
| 86 | return terminator < other.terminator; | ||
| 87 | } | ||
| 88 | } | ||
| 89 | }; | ||
| 90 | |||
| 91 | class kgramstats | 96 | class kgramstats |
| 92 | { | 97 | { |
| 93 | public: | 98 | public: |
| 94 | kgramstats(std::string corpus, int maxK); | 99 | kgramstats(std::string corpus, int maxK); |
| 95 | std::vector<std::string> randomSentence(int n); | 100 | std::string randomSentence(int n); |
| 96 | 101 | ||
| 97 | private: | 102 | private: |
| 98 | struct token_data | 103 | struct token_data |
| @@ -100,16 +105,15 @@ private: | |||
| 100 | int all; | 105 | int all; |
| 101 | int titlecase; | 106 | int titlecase; |
| 102 | int uppercase; | 107 | int uppercase; |
| 103 | token word; | 108 | token tok; |
| 104 | 109 | ||
| 105 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} | 110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
| 106 | }; | 111 | }; |
| 107 | 112 | ||
| 108 | int maxK; | 113 | int maxK; |
| 109 | std::map<kgram, std::map<int, token_data> > stats; | 114 | std::map<kgram, std::map<int, token_data> > stats; |
| 110 | malaprop mstats; | 115 | word hashtags {"#hashtag"}; |
| 111 | std::map<token, std::map<int, termstats> > endings; | 116 | std::map<std::string, word> words; |
| 112 | std::vector<std::string> hashtags; | ||
| 113 | }; | 117 | }; |
| 114 | 118 | ||
| 115 | void printKgram(kgram k); | 119 | void printKgram(kgram k); |
