diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-20 23:14:06 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-20 23:15:10 -0400 |
commit | 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch) | |
tree | 0d9a8a12616d6ea335fdc687049b05f679e8ccc6 /kgramstats.h | |
parent | a9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff) | |
download | rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2 rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip |
Pulled the ebooks functionality out into a library
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- | kgramstats.h | 201 |
1 files changed, 106 insertions, 95 deletions
diff --git a/kgramstats.h b/kgramstats.h index 5fad37d..ee75ada 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -1,124 +1,135 @@ | |||
1 | #ifndef KGRAMSTATS_H | ||
2 | #define KGRAMSTATS_H | ||
3 | |||
1 | #include <string> | 4 | #include <string> |
2 | #include <map> | 5 | #include <map> |
3 | #include <list> | 6 | #include <list> |
4 | #include <vector> | 7 | #include <vector> |
5 | #include "histogram.h" | 8 | #include "histogram.h" |
9 | #include <functional> | ||
6 | 10 | ||
7 | #ifndef KGRAMSTATS_H | 11 | class rawr { |
8 | #define KGRAMSTATS_H | 12 | public: |
9 | 13 | typedef std::function<std::string(std::string, std::string)> transform_callback; | |
10 | struct word { | 14 | |
11 | std::string canon; | 15 | void addCorpus(std::string corpus); |
12 | histogram<std::string> forms; | 16 | void compile(int maxK); |
13 | histogram<std::string> terms; | 17 | |
18 | void setTransformCallback(transform_callback _arg); | ||
19 | std::string randomSentence(int maxL); | ||
20 | |||
21 | private: | ||
22 | struct word { | ||
23 | std::string canon; | ||
24 | histogram<std::string> forms; | ||
25 | histogram<std::string> terms; | ||
14 | 26 | ||
15 | word(std::string canon) : canon(canon) {} | 27 | word(std::string canon) : canon(canon) {} |
16 | 28 | ||
17 | bool operator<(const word& other) const | 29 | bool operator<(const word& other) const |
18 | { | 30 | { |
19 | return canon < other.canon; | 31 | return canon < other.canon; |
20 | } | 32 | } |
21 | }; | 33 | }; |
22 | |||
23 | extern word blank_word; | ||
24 | 34 | ||
25 | enum class suffixtype { | 35 | enum class suffixtype { |
26 | none, | 36 | none, |
27 | terminating, | 37 | terminating, |
28 | comma | 38 | comma |
29 | }; | 39 | }; |
30 | 40 | ||
31 | enum class parentype { | 41 | enum class parentype { |
32 | paren, | 42 | paren, |
33 | square_bracket, | 43 | square_bracket, |
34 | asterisk, | 44 | asterisk, |
35 | quote | 45 | quote |
36 | }; | 46 | }; |
37 | 47 | ||
38 | enum class doublestatus { | 48 | enum class doublestatus { |
39 | opening, | 49 | opening, |
40 | closing, | 50 | closing, |
41 | both | 51 | both |
42 | }; | 52 | }; |
43 | 53 | ||
44 | struct delimiter { | 54 | struct delimiter { |
45 | parentype type; | 55 | parentype type; |
46 | doublestatus status; | 56 | doublestatus status; |
47 | 57 | ||
48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | 58 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} |
49 | 59 | ||
50 | bool operator<(const delimiter& other) const | 60 | bool operator<(const delimiter& other) const |
51 | { | 61 | { |
52 | return std::tie(type, status) < std::tie(other.type, other.status); | 62 | return std::tie(type, status) < std::tie(other.type, other.status); |
53 | } | 63 | } |
54 | }; | 64 | }; |
55 | 65 | ||
56 | struct token { | 66 | struct token { |
57 | const word& w; | 67 | const word& w; |
58 | std::map<delimiter, int> delimiters; | 68 | std::map<delimiter, int> delimiters; |
59 | suffixtype suffix; | 69 | suffixtype suffix; |
60 | std::string raw; | 70 | std::string raw; |
61 | 71 | ||
62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | 72 | token(const word& w) : w(w), suffix(suffixtype::none) {} |
63 | 73 | ||
64 | bool operator<(const token& other) const | 74 | bool operator<(const token& other) const |
65 | { | 75 | { |
66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); | 76 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
67 | } | 77 | } |
68 | }; | 78 | }; |
69 | 79 | ||
70 | enum class querytype { | 80 | enum class querytype { |
71 | literal, | 81 | literal, |
72 | sentence | 82 | sentence |
73 | }; | 83 | }; |
74 | 84 | ||
75 | struct query { | 85 | struct query { |
76 | querytype type; | 86 | querytype type; |
77 | token tok; | 87 | token tok; |
78 | 88 | ||
79 | query(token tok) : tok(tok), type(querytype::literal) {} | 89 | query(token tok) : tok(tok), type(querytype::literal) {} |
80 | 90 | ||
81 | query(querytype type) : tok(blank_word), type(type) {} | 91 | query(querytype type) : tok(blank_word), type(type) {} |
82 | 92 | ||
83 | bool operator<(const query& other) const | 93 | bool operator<(const query& other) const |
84 | { | 94 | { |
85 | if (type == other.type) | 95 | if (type == other.type) |
86 | { | 96 | { |
87 | return tok < other.tok; | 97 | return tok < other.tok; |
88 | } else { | 98 | } else { |
89 | return type < other.type; | 99 | return type < other.type; |
90 | } | 100 | } |
91 | } | 101 | } |
92 | }; | 102 | }; |
93 | 103 | ||
94 | typedef std::list<query> kgram; | 104 | static const query wildcardQuery; |
105 | static const word blank_word; | ||
95 | 106 | ||
96 | class kgramstats | 107 | typedef std::list<query> kgram; |
97 | { | ||
98 | public: | ||
99 | kgramstats(std::string corpus, int maxK); | ||
100 | std::string randomSentence(int maxL); | ||
101 | |||
102 | private: | ||
103 | struct token_data | ||
104 | { | ||
105 | int all; | ||
106 | int titlecase; | ||
107 | int uppercase; | ||
108 | token tok; | ||
109 | 108 | ||
110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | 109 | struct token_data |
111 | }; | 110 | { |
111 | int all; | ||
112 | int titlecase; | ||
113 | int uppercase; | ||
114 | token tok; | ||
115 | |||
116 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | ||
117 | }; | ||
118 | |||
119 | friend std::ostream& operator<<(std::ostream& os, kgram k); | ||
120 | friend std::ostream& operator<<(std::ostream& os, query q); | ||
121 | friend std::ostream& operator<<(std::ostream& os, token t); | ||
112 | 122 | ||
113 | int maxK; | 123 | int _maxK; |
114 | std::map<kgram, std::map<int, token_data> > stats; | 124 | bool _compiled = false; |
125 | std::vector<std::string> _corpora; | ||
126 | std::map<kgram, std::map<int, token_data>> _stats; | ||
127 | transform_callback _transform; | ||
115 | 128 | ||
116 | // Words | 129 | // Words |
117 | std::map<std::string, word> words; | 130 | std::map<std::string, word> words; |
118 | word hashtags {"#hashtag"}; | 131 | word hashtags {"#hashtag"}; |
119 | word emoticons {"👌"}; | 132 | word emoticons {"👌"}; |
120 | }; | 133 | }; |
121 | 134 | ||
122 | void printKgram(kgram k); | ||
123 | |||
124 | #endif \ No newline at end of file | 135 | #endif \ No newline at end of file |