diff options
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- | kgramstats.h | 124 |
1 files changed, 64 insertions, 60 deletions
diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -2,61 +2,89 @@ | |||
2 | #include <map> | 2 | #include <map> |
3 | #include <list> | 3 | #include <list> |
4 | #include <vector> | 4 | #include <vector> |
5 | #include "malaprop.h" | 5 | #include "histogram.h" |
6 | 6 | ||
7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
9 | 9 | ||
10 | enum tokentype { | 10 | struct word { |
11 | tokentype_literal, | 11 | std::string canon; |
12 | tokentype_hashtag | 12 | histogram<std::string> forms; |
13 | histogram<std::string> terms; | ||
14 | |||
15 | word(std::string canon) : canon(canon) {} | ||
16 | |||
17 | bool operator<(const word& other) const | ||
18 | { | ||
19 | return canon < other.canon; | ||
20 | } | ||
13 | }; | 21 | }; |
14 | 22 | ||
15 | struct token { | 23 | extern word blank_word; |
16 | tokentype type; | 24 | |
17 | std::string canon; | 25 | enum class suffixtype { |
18 | bool terminating; | 26 | none, |
27 | terminating, | ||
28 | comma | ||
29 | }; | ||
30 | |||
31 | enum class parentype { | ||
32 | paren, | ||
33 | square_bracket, | ||
34 | asterisk, | ||
35 | quote | ||
36 | }; | ||
37 | |||
38 | enum class doublestatus { | ||
39 | opening, | ||
40 | closing, | ||
41 | both | ||
42 | }; | ||
43 | |||
44 | struct delimiter { | ||
45 | parentype type; | ||
46 | doublestatus status; | ||
47 | |||
48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | ||
19 | 49 | ||
20 | token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} | 50 | bool operator<(const delimiter& other) const |
21 | token(tokentype type) : type(type), canon(""), terminating(false) {} | 51 | { |
52 | return std::tie(type, status) < std::tie(other.type, other.status); | ||
53 | } | ||
54 | }; | ||
55 | |||
56 | struct token { | ||
57 | const word& w; | ||
58 | std::map<delimiter, int> delimiters; | ||
59 | suffixtype suffix; | ||
60 | std::string raw; | ||
61 | |||
62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | ||
22 | 63 | ||
23 | bool operator<(const token& other) const | 64 | bool operator<(const token& other) const |
24 | { | 65 | { |
25 | if (type != other.type) | 66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
26 | { | ||
27 | return type < other.type; | ||
28 | } else if (type == tokentype_literal) | ||
29 | { | ||
30 | if (canon == other.canon) | ||
31 | { | ||
32 | return !terminating && other.terminating; | ||
33 | } else { | ||
34 | return canon < other.canon; | ||
35 | } | ||
36 | } else { | ||
37 | return !terminating && other.terminating; | ||
38 | } | ||
39 | } | 67 | } |
40 | }; | 68 | }; |
41 | 69 | ||
42 | enum querytype { | 70 | enum class querytype { |
43 | querytype_literal, | 71 | literal, |
44 | querytype_sentence | 72 | sentence |
45 | }; | 73 | }; |
46 | 74 | ||
47 | struct query { | 75 | struct query { |
48 | querytype type; | 76 | querytype type; |
49 | token word; | 77 | token tok; |
50 | 78 | ||
51 | query(token word) : word(word), type(querytype_literal) {} | 79 | query(token tok) : tok(tok), type(querytype::literal) {} |
52 | 80 | ||
53 | query(querytype type) : word(""), type(type) {} | 81 | query(querytype type) : tok(blank_word), type(type) {} |
54 | 82 | ||
55 | bool operator<(const query& other) const | 83 | bool operator<(const query& other) const |
56 | { | 84 | { |
57 | if (type == other.type) | 85 | if (type == other.type) |
58 | { | 86 | { |
59 | return word < other.word; | 87 | return tok < other.tok; |
60 | } else { | 88 | } else { |
61 | return type < other.type; | 89 | return type < other.type; |
62 | } | 90 | } |
@@ -65,34 +93,11 @@ struct query { | |||
65 | 93 | ||
66 | typedef std::list<query> kgram; | 94 | typedef std::list<query> kgram; |
67 | 95 | ||
68 | struct termstats { | ||
69 | char terminator; | ||
70 | int occurrences; | ||
71 | |||
72 | termstats() : terminator('.'), occurrences(1) {} | ||
73 | |||
74 | termstats(char terminator, int occurrences) | ||
75 | { | ||
76 | this->terminator = terminator; | ||
77 | this->occurrences = occurrences; | ||
78 | } | ||
79 | |||
80 | bool operator<(const termstats& other) const | ||
81 | { | ||
82 | if (terminator == other.terminator) | ||
83 | { | ||
84 | return occurrences < other.occurrences; | ||
85 | } else { | ||
86 | return terminator < other.terminator; | ||
87 | } | ||
88 | } | ||
89 | }; | ||
90 | |||
91 | class kgramstats | 96 | class kgramstats |
92 | { | 97 | { |
93 | public: | 98 | public: |
94 | kgramstats(std::string corpus, int maxK); | 99 | kgramstats(std::string corpus, int maxK); |
95 | std::vector<std::string> randomSentence(int n); | 100 | std::string randomSentence(int n); |
96 | 101 | ||
97 | private: | 102 | private: |
98 | struct token_data | 103 | struct token_data |
@@ -100,16 +105,15 @@ private: | |||
100 | int all; | 105 | int all; |
101 | int titlecase; | 106 | int titlecase; |
102 | int uppercase; | 107 | int uppercase; |
103 | token word; | 108 | token tok; |
104 | 109 | ||
105 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} | 110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
106 | }; | 111 | }; |
107 | 112 | ||
108 | int maxK; | 113 | int maxK; |
109 | std::map<kgram, std::map<int, token_data> > stats; | 114 | std::map<kgram, std::map<int, token_data> > stats; |
110 | malaprop mstats; | 115 | word hashtags {"#hashtag"}; |
111 | std::map<token, std::map<int, termstats> > endings; | 116 | std::map<std::string, word> words; |
112 | std::vector<std::string> hashtags; | ||
113 | }; | 117 | }; |
114 | 118 | ||
115 | void printKgram(kgram k); | 119 | void printKgram(kgram k); |