diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-29 12:43:00 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-29 12:43:00 -0500 |
commit | b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch) | |
tree | f21bd883ef7c4255a91d096ea105feaad135ee52 /kgramstats.h | |
parent | fd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff) | |
download | rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2 rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip |
Rewrote how tokens are handled
A 'word' is now an object that contains a distribution of forms that the word can take. For now, most words contain just one form, the canonical one. The only special use is currently hashtags. Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace them.
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- | kgramstats.h | 124 |
1 files changed, 64 insertions, 60 deletions
diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -2,61 +2,89 @@ | |||
2 | #include <map> | 2 | #include <map> |
3 | #include <list> | 3 | #include <list> |
4 | #include <vector> | 4 | #include <vector> |
5 | #include "malaprop.h" | 5 | #include "histogram.h" |
6 | 6 | ||
7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
9 | 9 | ||
10 | enum tokentype { | 10 | struct word { |
11 | tokentype_literal, | 11 | std::string canon; |
12 | tokentype_hashtag | 12 | histogram<std::string> forms; |
13 | histogram<std::string> terms; | ||
14 | |||
15 | word(std::string canon) : canon(canon) {} | ||
16 | |||
17 | bool operator<(const word& other) const | ||
18 | { | ||
19 | return canon < other.canon; | ||
20 | } | ||
13 | }; | 21 | }; |
14 | 22 | ||
15 | struct token { | 23 | extern word blank_word; |
16 | tokentype type; | 24 | |
17 | std::string canon; | 25 | enum class suffixtype { |
18 | bool terminating; | 26 | none, |
27 | terminating, | ||
28 | comma | ||
29 | }; | ||
30 | |||
31 | enum class parentype { | ||
32 | paren, | ||
33 | square_bracket, | ||
34 | asterisk, | ||
35 | quote | ||
36 | }; | ||
37 | |||
38 | enum class doublestatus { | ||
39 | opening, | ||
40 | closing, | ||
41 | both | ||
42 | }; | ||
43 | |||
44 | struct delimiter { | ||
45 | parentype type; | ||
46 | doublestatus status; | ||
47 | |||
48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | ||
19 | 49 | ||
20 | token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} | 50 | bool operator<(const delimiter& other) const |
21 | token(tokentype type) : type(type), canon(""), terminating(false) {} | 51 | { |
52 | return std::tie(type, status) < std::tie(other.type, other.status); | ||
53 | } | ||
54 | }; | ||
55 | |||
56 | struct token { | ||
57 | const word& w; | ||
58 | std::map<delimiter, int> delimiters; | ||
59 | suffixtype suffix; | ||
60 | std::string raw; | ||
61 | |||
62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | ||
22 | 63 | ||
23 | bool operator<(const token& other) const | 64 | bool operator<(const token& other) const |
24 | { | 65 | { |
25 | if (type != other.type) | 66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
26 | { | ||
27 | return type < other.type; | ||
28 | } else if (type == tokentype_literal) | ||
29 | { | ||
30 | if (canon == other.canon) | ||
31 | { | ||
32 | return !terminating && other.terminating; | ||
33 | } else { | ||
34 | return canon < other.canon; | ||
35 | } | ||
36 | } else { | ||
37 | return !terminating && other.terminating; | ||
38 | } | ||
39 | } | 67 | } |
40 | }; | 68 | }; |
41 | 69 | ||
42 | enum querytype { | 70 | enum class querytype { |
43 | querytype_literal, | 71 | literal, |
44 | querytype_sentence | 72 | sentence |
45 | }; | 73 | }; |
46 | 74 | ||
47 | struct query { | 75 | struct query { |
48 | querytype type; | 76 | querytype type; |
49 | token word; | 77 | token tok; |
50 | 78 | ||
51 | query(token word) : word(word), type(querytype_literal) {} | 79 | query(token tok) : tok(tok), type(querytype::literal) {} |
52 | 80 | ||
53 | query(querytype type) : word(""), type(type) {} | 81 | query(querytype type) : tok(blank_word), type(type) {} |
54 | 82 | ||
55 | bool operator<(const query& other) const | 83 | bool operator<(const query& other) const |
56 | { | 84 | { |
57 | if (type == other.type) | 85 | if (type == other.type) |
58 | { | 86 | { |
59 | return word < other.word; | 87 | return tok < other.tok; |
60 | } else { | 88 | } else { |
61 | return type < other.type; | 89 | return type < other.type; |
62 | } | 90 | } |
@@ -65,34 +93,11 @@ struct query { | |||
65 | 93 | ||
66 | typedef std::list<query> kgram; | 94 | typedef std::list<query> kgram; |
67 | 95 | ||
68 | struct termstats { | ||
69 | char terminator; | ||
70 | int occurrences; | ||
71 | |||
72 | termstats() : terminator('.'), occurrences(1) {} | ||
73 | |||
74 | termstats(char terminator, int occurrences) | ||
75 | { | ||
76 | this->terminator = terminator; | ||
77 | this->occurrences = occurrences; | ||
78 | } | ||
79 | |||
80 | bool operator<(const termstats& other) const | ||
81 | { | ||
82 | if (terminator == other.terminator) | ||
83 | { | ||
84 | return occurrences < other.occurrences; | ||
85 | } else { | ||
86 | return terminator < other.terminator; | ||
87 | } | ||
88 | } | ||
89 | }; | ||
90 | |||
91 | class kgramstats | 96 | class kgramstats |
92 | { | 97 | { |
93 | public: | 98 | public: |
94 | kgramstats(std::string corpus, int maxK); | 99 | kgramstats(std::string corpus, int maxK); |
95 | std::vector<std::string> randomSentence(int n); | 100 | std::string randomSentence(int n); |
96 | 101 | ||
97 | private: | 102 | private: |
98 | struct token_data | 103 | struct token_data |
@@ -100,16 +105,15 @@ private: | |||
100 | int all; | 105 | int all; |
101 | int titlecase; | 106 | int titlecase; |
102 | int uppercase; | 107 | int uppercase; |
103 | token word; | 108 | token tok; |
104 | 109 | ||
105 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} | 110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
106 | }; | 111 | }; |
107 | 112 | ||
108 | int maxK; | 113 | int maxK; |
109 | std::map<kgram, std::map<int, token_data> > stats; | 114 | std::map<kgram, std::map<int, token_data> > stats; |
110 | malaprop mstats; | 115 | word hashtags {"#hashtag"}; |
111 | std::map<token, std::map<int, termstats> > endings; | 116 | std::map<std::string, word> words; |
112 | std::vector<std::string> hashtags; | ||
113 | }; | 117 | }; |
114 | 118 | ||
115 | void printKgram(kgram k); | 119 | void printKgram(kgram k); |