diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-20 23:14:06 -0400 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-05-20 23:15:10 -0400 |
| commit | 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch) | |
| tree | 0d9a8a12616d6ea335fdc687049b05f679e8ccc6 /kgramstats.h | |
| parent | a9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff) | |
| download | rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2 rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip | |
Pulled the ebooks functionality out into a library
Diffstat (limited to 'kgramstats.h')
| -rw-r--r-- | kgramstats.h | 201 |
1 file changed, 106 insertions, 95 deletions
| diff --git a/kgramstats.h b/kgramstats.h index 5fad37d..ee75ada 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -1,124 +1,135 @@ | |||
| 1 | #ifndef KGRAMSTATS_H | ||
| 2 | #define KGRAMSTATS_H | ||
| 3 | |||
| 1 | #include <string> | 4 | #include <string> |
| 2 | #include <map> | 5 | #include <map> |
| 3 | #include <list> | 6 | #include <list> |
| 4 | #include <vector> | 7 | #include <vector> |
| 5 | #include "histogram.h" | 8 | #include "histogram.h" |
| 9 | #include <functional> | ||
| 6 | 10 | ||
| 7 | #ifndef KGRAMSTATS_H | 11 | class rawr { |
| 8 | #define KGRAMSTATS_H | 12 | public: |
| 9 | 13 | typedef std::function<std::string(std::string, std::string)> transform_callback; | |
| 10 | struct word { | 14 | |
| 11 | std::string canon; | 15 | void addCorpus(std::string corpus); |
| 12 | histogram<std::string> forms; | 16 | void compile(int maxK); |
| 13 | histogram<std::string> terms; | 17 | |
| 18 | void setTransformCallback(transform_callback _arg); | ||
| 19 | std::string randomSentence(int maxL); | ||
| 20 | |||
| 21 | private: | ||
| 22 | struct word { | ||
| 23 | std::string canon; | ||
| 24 | histogram<std::string> forms; | ||
| 25 | histogram<std::string> terms; | ||
| 14 | 26 | ||
| 15 | word(std::string canon) : canon(canon) {} | 27 | word(std::string canon) : canon(canon) {} |
| 16 | 28 | ||
| 17 | bool operator<(const word& other) const | 29 | bool operator<(const word& other) const |
| 18 | { | 30 | { |
| 19 | return canon < other.canon; | 31 | return canon < other.canon; |
| 20 | } | 32 | } |
| 21 | }; | 33 | }; |
| 22 | |||
| 23 | extern word blank_word; | ||
| 24 | 34 | ||
| 25 | enum class suffixtype { | 35 | enum class suffixtype { |
| 26 | none, | 36 | none, |
| 27 | terminating, | 37 | terminating, |
| 28 | comma | 38 | comma |
| 29 | }; | 39 | }; |
| 30 | 40 | ||
| 31 | enum class parentype { | 41 | enum class parentype { |
| 32 | paren, | 42 | paren, |
| 33 | square_bracket, | 43 | square_bracket, |
| 34 | asterisk, | 44 | asterisk, |
| 35 | quote | 45 | quote |
| 36 | }; | 46 | }; |
| 37 | 47 | ||
| 38 | enum class doublestatus { | 48 | enum class doublestatus { |
| 39 | opening, | 49 | opening, |
| 40 | closing, | 50 | closing, |
| 41 | both | 51 | both |
| 42 | }; | 52 | }; |
| 43 | 53 | ||
| 44 | struct delimiter { | 54 | struct delimiter { |
| 45 | parentype type; | 55 | parentype type; |
| 46 | doublestatus status; | 56 | doublestatus status; |
| 47 | 57 | ||
| 48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | 58 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} |
| 49 | 59 | ||
| 50 | bool operator<(const delimiter& other) const | 60 | bool operator<(const delimiter& other) const |
| 51 | { | 61 | { |
| 52 | return std::tie(type, status) < std::tie(other.type, other.status); | 62 | return std::tie(type, status) < std::tie(other.type, other.status); |
| 53 | } | 63 | } |
| 54 | }; | 64 | }; |
| 55 | 65 | ||
| 56 | struct token { | 66 | struct token { |
| 57 | const word& w; | 67 | const word& w; |
| 58 | std::map<delimiter, int> delimiters; | 68 | std::map<delimiter, int> delimiters; |
| 59 | suffixtype suffix; | 69 | suffixtype suffix; |
| 60 | std::string raw; | 70 | std::string raw; |
| 61 | 71 | ||
| 62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | 72 | token(const word& w) : w(w), suffix(suffixtype::none) {} |
| 63 | 73 | ||
| 64 | bool operator<(const token& other) const | 74 | bool operator<(const token& other) const |
| 65 | { | 75 | { |
| 66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); | 76 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
| 67 | } | 77 | } |
| 68 | }; | 78 | }; |
| 69 | 79 | ||
| 70 | enum class querytype { | 80 | enum class querytype { |
| 71 | literal, | 81 | literal, |
| 72 | sentence | 82 | sentence |
| 73 | }; | 83 | }; |
| 74 | 84 | ||
| 75 | struct query { | 85 | struct query { |
| 76 | querytype type; | 86 | querytype type; |
| 77 | token tok; | 87 | token tok; |
| 78 | 88 | ||
| 79 | query(token tok) : tok(tok), type(querytype::literal) {} | 89 | query(token tok) : tok(tok), type(querytype::literal) {} |
| 80 | 90 | ||
| 81 | query(querytype type) : tok(blank_word), type(type) {} | 91 | query(querytype type) : tok(blank_word), type(type) {} |
| 82 | 92 | ||
| 83 | bool operator<(const query& other) const | 93 | bool operator<(const query& other) const |
| 84 | { | 94 | { |
| 85 | if (type == other.type) | 95 | if (type == other.type) |
| 86 | { | 96 | { |
| 87 | return tok < other.tok; | 97 | return tok < other.tok; |
| 88 | } else { | 98 | } else { |
| 89 | return type < other.type; | 99 | return type < other.type; |
| 90 | } | 100 | } |
| 91 | } | 101 | } |
| 92 | }; | 102 | }; |
| 93 | 103 | ||
| 94 | typedef std::list<query> kgram; | 104 | static const query wildcardQuery; |
| 105 | static const word blank_word; | ||
| 95 | 106 | ||
| 96 | class kgramstats | 107 | typedef std::list<query> kgram; |
| 97 | { | ||
| 98 | public: | ||
| 99 | kgramstats(std::string corpus, int maxK); | ||
| 100 | std::string randomSentence(int maxL); | ||
| 101 | |||
| 102 | private: | ||
| 103 | struct token_data | ||
| 104 | { | ||
| 105 | int all; | ||
| 106 | int titlecase; | ||
| 107 | int uppercase; | ||
| 108 | token tok; | ||
| 109 | 108 | ||
| 110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | 109 | struct token_data |
| 111 | }; | 110 | { |
| 111 | int all; | ||
| 112 | int titlecase; | ||
| 113 | int uppercase; | ||
| 114 | token tok; | ||
| 115 | |||
| 116 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | ||
| 117 | }; | ||
| 118 | |||
| 119 | friend std::ostream& operator<<(std::ostream& os, kgram k); | ||
| 120 | friend std::ostream& operator<<(std::ostream& os, query q); | ||
| 121 | friend std::ostream& operator<<(std::ostream& os, token t); | ||
| 112 | 122 | ||
| 113 | int maxK; | 123 | int _maxK; |
| 114 | std::map<kgram, std::map<int, token_data> > stats; | 124 | bool _compiled = false; |
| 125 | std::vector<std::string> _corpora; | ||
| 126 | std::map<kgram, std::map<int, token_data>> _stats; | ||
| 127 | transform_callback _transform; | ||
| 115 | 128 | ||
| 116 | // Words | 129 | // Words |
| 117 | std::map<std::string, word> words; | 130 | std::map<std::string, word> words; |
| 118 | word hashtags {"#hashtag"}; | 131 | word hashtags {"#hashtag"}; |
| 119 | word emoticons {"👌"}; | 132 | word emoticons {"👌"}; |
| 120 | }; | 133 | }; |
| 121 | 134 | ||
| 122 | void printKgram(kgram k); | ||
| 123 | |||
| 124 | #endif \ No newline at end of file | 135 | #endif \ No newline at end of file |
