about summary refs log tree commit diff stats
path: root/kgramstats.h
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-20 23:14:06 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-20 23:15:10 -0400
commit 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch)
tree 0d9a8a12616d6ea335fdc687049b05f679e8ccc6 /kgramstats.h
parent a9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff)
download rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz
rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2
rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip
Pulled the ebooks functionality out into a library
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- kgramstats.h 201
1 files changed, 106 insertions, 95 deletions
diff --git a/kgramstats.h b/kgramstats.h
index 5fad37d..ee75ada 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -1,124 +1,135 @@
1#ifndef KGRAMSTATS_H
2#define KGRAMSTATS_H
3
1#include <string> 4#include <string>
2#include <map> 5#include <map>
3#include <list> 6#include <list>
4#include <vector> 7#include <vector>
5#include "histogram.h" 8#include "histogram.h"
9#include <functional>
6 10
7#ifndef KGRAMSTATS_H 11class rawr {
8#define KGRAMSTATS_H 12 public:
9 13 typedef std::function<std::string(std::string, std::string)> transform_callback;
10struct word { 14
11 std::string canon; 15 void addCorpus(std::string corpus);
12 histogram<std::string> forms; 16 void compile(int maxK);
13 histogram<std::string> terms; 17
18 void setTransformCallback(transform_callback _arg);
19 std::string randomSentence(int maxL);
20
21 private:
22 struct word {
23 std::string canon;
24 histogram<std::string> forms;
25 histogram<std::string> terms;
14 26
15 word(std::string canon) : canon(canon) {} 27 word(std::string canon) : canon(canon) {}
16 28
17 bool operator<(const word& other) const 29 bool operator<(const word& other) const
18 { 30 {
19 return canon < other.canon; 31 return canon < other.canon;
20 } 32 }
21}; 33 };
22
23extern word blank_word;
24 34
25enum class suffixtype { 35 enum class suffixtype {
26 none, 36 none,
27 terminating, 37 terminating,
28 comma 38 comma
29}; 39 };
30 40
31enum class parentype { 41 enum class parentype {
32 paren, 42 paren,
33 square_bracket, 43 square_bracket,
34 asterisk, 44 asterisk,
35 quote 45 quote
36}; 46 };
37 47
38enum class doublestatus { 48 enum class doublestatus {
39 opening, 49 opening,
40 closing, 50 closing,
41 both 51 both
42}; 52 };
43 53
44struct delimiter { 54 struct delimiter {
45 parentype type; 55 parentype type;
46 doublestatus status; 56 doublestatus status;
47 57
48 delimiter(parentype type, doublestatus status) : type(type), status(status) {} 58 delimiter(parentype type, doublestatus status) : type(type), status(status) {}
49 59
50 bool operator<(const delimiter& other) const 60 bool operator<(const delimiter& other) const
51 { 61 {
52 return std::tie(type, status) < std::tie(other.type, other.status); 62 return std::tie(type, status) < std::tie(other.type, other.status);
53 } 63 }
54}; 64 };
55 65
56struct token { 66 struct token {
57 const word& w; 67 const word& w;
58 std::map<delimiter, int> delimiters; 68 std::map<delimiter, int> delimiters;
59 suffixtype suffix; 69 suffixtype suffix;
60 std::string raw; 70 std::string raw;
61 71
62 token(const word& w) : w(w), suffix(suffixtype::none) {} 72 token(const word& w) : w(w), suffix(suffixtype::none) {}
63 73
64 bool operator<(const token& other) const 74 bool operator<(const token& other) const
65 { 75 {
66 return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); 76 return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
67 } 77 }
68}; 78 };
69 79
70enum class querytype { 80 enum class querytype {
71 literal, 81 literal,
72 sentence 82 sentence
73}; 83 };
74 84
75struct query { 85 struct query {
76 querytype type; 86 querytype type;
77 token tok; 87 token tok;
78 88
79 query(token tok) : tok(tok), type(querytype::literal) {} 89 query(token tok) : tok(tok), type(querytype::literal) {}
80 90
81 query(querytype type) : tok(blank_word), type(type) {} 91 query(querytype type) : tok(blank_word), type(type) {}
82 92
83 bool operator<(const query& other) const 93 bool operator<(const query& other) const
84 { 94 {
85 if (type == other.type) 95 if (type == other.type)
86 { 96 {
87 return tok < other.tok; 97 return tok < other.tok;
88 } else { 98 } else {
89 return type < other.type; 99 return type < other.type;
90 } 100 }
91 } 101 }
92}; 102 };
93 103
94typedef std::list<query> kgram; 104 static const query wildcardQuery;
105 static const word blank_word;
95 106
96class kgramstats 107 typedef std::list<query> kgram;
97{
98public:
99 kgramstats(std::string corpus, int maxK);
100 std::string randomSentence(int maxL);
101
102private:
103 struct token_data
104 {
105 int all;
106 int titlecase;
107 int uppercase;
108 token tok;
109 108
110 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} 109 struct token_data
111 }; 110 {
111 int all;
112 int titlecase;
113 int uppercase;
114 token tok;
115
116 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
117 };
118
119 friend std::ostream& operator<<(std::ostream& os, kgram k);
120 friend std::ostream& operator<<(std::ostream& os, query q);
121 friend std::ostream& operator<<(std::ostream& os, token t);
112 122
113 int maxK; 123 int _maxK;
114 std::map<kgram, std::map<int, token_data> > stats; 124 bool _compiled = false;
125 std::vector<std::string> _corpora;
126 std::map<kgram, std::map<int, token_data>> _stats;
127 transform_callback _transform;
115 128
116 // Words 129 // Words
117 std::map<std::string, word> words; 130 std::map<std::string, word> words;
118 word hashtags {"#hashtag"}; 131 word hashtags {"#hashtag"};
119 word emoticons {"👌"}; 132 word emoticons {"👌"};
120}; 133};
121 134
122void printKgram(kgram k);
123
124#endif \ No newline at end of file 135#endif \ No newline at end of file