diff options
Diffstat (limited to 'kgramstats.h')
| -rw-r--r-- | kgramstats.h | 84 |
1 files changed, 73 insertions, 11 deletions
| diff --git a/kgramstats.h b/kgramstats.h index b01dece..ca61df7 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -7,7 +7,71 @@ | |||
| 7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
| 8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
| 9 | 9 | ||
// A word token: a string (`canon`) paired with a `terminating` flag.
//
// operator< gives the ordering needed to use tokens as keys in ordered
// containers; kgramstats keeps a std::map keyed by token.
struct token {
  // Text that identifies the token. Presumably a canonicalized form of the
  // word (the name suggests so) -- confirm against the tokenizer.
  std::string canon;

  // Always false right after construction; any change is made by the caller.
  bool terminating;

  // Builds a non-terminating token from `canon`.
  //
  // Takes the string by const reference so it is copied once (into the
  // member) rather than twice. Left non-explicit so code relying on the
  // implicit std::string -> token conversion keeps compiling.
  token(const std::string& canon) : canon(canon), terminating(false) {}

  // Orders by `canon` first. When both strings are equal, the
  // non-terminating token sorts before the terminating one, so two tokens
  // are equivalent only if both fields match.
  bool operator<(const token& other) const
  {
    if (canon != other.canon)
    {
      return canon < other.canon;
    }

    return !terminating && other.terminating;
  }
};
| 26 | |||
// Tag stored in `query::type`.
//
// The numeric order matters: query::operator< compares the tag before the
// word, so every literal query sorts ahead of every sentence query.
enum querytype {
  // The query carries a concrete token; this is the tag chosen when a query
  // is constructed from a token.
  querytype_literal = 0,

  // The query is constructed from the tag alone and carries an empty token.
  // NOTE(review): presumably marks a sentence boundary -- confirm in the
  // implementation file.
  querytype_sentence = 1
};
| 31 | |||
| 32 | struct query { | ||
| 33 | querytype type; | ||
| 34 | token word; | ||
| 35 | |||
| 36 | query(token word) : word(word), type(querytype_literal) {} | ||
| 37 | |||
| 38 | query(querytype type) : word(""), type(type) {} | ||
| 39 | |||
| 40 | bool operator<(const query& other) const | ||
| 41 | { | ||
| 42 | if (type == other.type) | ||
| 43 | { | ||
| 44 | return word < other.word; | ||
| 45 | } else { | ||
| 46 | return type < other.type; | ||
| 47 | } | ||
| 48 | } | ||
| 49 | }; | ||
| 50 | |||
| 51 | typedef std::list<query> kgram; | ||
| 52 | |||
// A terminator character together with an occurrence count.
// NOTE(review): presumably records how often a token was followed by a given
// sentence-ending punctuation mark -- confirm in the implementation file.
struct termstats {
  char terminator;
  int occurrences;

  // Defaults to a period with a count of one.
  termstats() : terminator('.'), occurrences(1) {}

  // Stores both values as given. Uses a member-initializer list, matching
  // the default constructor, instead of assigning inside the body.
  termstats(char terminator, int occurrences)
    : terminator(terminator), occurrences(occurrences) {}

  // Orders by terminator character first, then by occurrence count.
  bool operator<(const termstats& other) const
  {
    if (terminator != other.terminator)
    {
      return terminator < other.terminator;
    }

    return occurrences < other.occurrences;
  }
};
| 11 | 75 | ||
| 12 | class kgramstats | 76 | class kgramstats |
| 13 | { | 77 | { |
| @@ -16,22 +80,20 @@ public: | |||
| 16 | std::vector<std::string> randomSentence(int n); | 80 | std::vector<std::string> randomSentence(int n); |
| 17 | 81 | ||
| 18 | private: | 82 | private: |
| 19 | typedef struct | 83 | struct token_data |
| 20 | { | 84 | { |
| 21 | int all; | 85 | int all; |
| 22 | int titlecase; | 86 | int titlecase; |
| 23 | int uppercase; | 87 | int uppercase; |
| 24 | int period; | 88 | token word; |
| 25 | int startquote; | 89 | |
| 26 | int endquote; | 90 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} |
| 27 | int startparen; | 91 | }; |
| 28 | int endparen; | 92 | |
| 29 | int comma; | ||
| 30 | std::string* token; | ||
| 31 | } token_data; | ||
| 32 | int maxK; | 93 | int maxK; |
| 33 | std::map<kgram, std::map<int, token_data*>* >* stats; | 94 | std::map<kgram, std::map<int, token_data> > stats; |
| 34 | malaprop mstats; | 95 | malaprop mstats; |
| 96 | std::map<token, std::map<int, termstats> > endings; | ||
| 35 | }; | 97 | }; |
| 36 | 98 | ||
| 37 | void printKgram(kgram k); | 99 | void printKgram(kgram k); |
