diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-08-26 22:13:50 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-08-26 22:13:50 -0400 |
commit | d75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch) | |
tree | 013285ad6ff9c7d2c2c3174eef99b89485917756 /kgramstats.h | |
parent | 26d75f744913a8856e46f5fccbfda8f8336924a0 (diff) | |
download | rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2 rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip |
Interned tokens to reduce memory footprint
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- | kgramstats.h | 23 |
1 file changed, 14 insertions, 9 deletions
diff --git a/kgramstats.h b/kgramstats.h index 2ee0e35..49fe04e 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <list> | 6 | #include <list> |
7 | #include <vector> | 7 | #include <vector> |
8 | #include "histogram.h" | 8 | #include "histogram.h" |
9 | #include "identifier.h" | ||
9 | #include <functional> | 10 | #include <functional> |
10 | #include <set> | 11 | #include <set> |
11 | 12 | ||
@@ -92,6 +93,9 @@ class rawr { | |||
92 | } | 93 | } |
93 | }; | 94 | }; |
94 | 95 | ||
96 | using tokenstore = identifier<token>; | ||
97 | using token_id = tokenstore::key_type; | ||
98 | |||
95 | enum class querytype { | 99 | enum class querytype { |
96 | literal, | 100 | literal, |
97 | sentence | 101 | sentence |
@@ -99,12 +103,12 @@ class rawr { | |||
99 | 103 | ||
100 | struct query { | 104 | struct query { |
101 | querytype type; | 105 | querytype type; |
102 | token tok; | 106 | token_id tok; |
103 | 107 | ||
104 | query(token tok) : tok(tok), type(querytype::literal) {} | 108 | query(token_id tok) : tok(tok), type(querytype::literal) {} |
105 | 109 | ||
106 | query(querytype type) : tok(blank_word), type(type) {} | 110 | query(querytype type) : tok(0), type(type) {} |
107 | 111 | ||
108 | bool operator<(const query& other) const | 112 | bool operator<(const query& other) const |
109 | { | 113 | { |
110 | if (type == other.type) | 114 | if (type == other.type) |
@@ -126,10 +130,10 @@ class rawr { | |||
126 | int all; | 130 | int all; |
127 | int titlecase; | 131 | int titlecase; |
128 | int uppercase; | 132 | int uppercase; |
129 | token tok; | 133 | token_id tok; |
130 | std::set<int> corpora; | 134 | std::set<int> corpora; |
131 | 135 | ||
132 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | 136 | token_data(token_id tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
133 | }; | 137 | }; |
134 | 138 | ||
135 | friend std::ostream& operator<<(std::ostream& os, kgram k); | 139 | friend std::ostream& operator<<(std::ostream& os, kgram k); |
@@ -140,6 +144,7 @@ class rawr { | |||
140 | int _maxK; | 144 | int _maxK; |
141 | bool _compiled = false; | 145 | bool _compiled = false; |
142 | std::vector<std::string> _corpora; | 146 | std::vector<std::string> _corpora; |
147 | tokenstore _tokenstore; | ||
143 | std::map<kgram, std::map<int, token_data>> _stats; | 148 | std::map<kgram, std::map<int, token_data>> _stats; |
144 | transform_callback _transform; | 149 | transform_callback _transform; |
145 | int _min_corpora = 1; | 150 | int _min_corpora = 1; |