diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-08-26 22:13:50 -0400 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-08-26 22:13:50 -0400 |
| commit | d75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch) | |
| tree | 013285ad6ff9c7d2c2c3174eef99b89485917756 /kgramstats.h | |
| parent | 26d75f744913a8856e46f5fccbfda8f8336924a0 (diff) | |
| download | rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2 rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip | |
Interned tokens to reduce memory footprint
Diffstat (limited to 'kgramstats.h')
| -rw-r--r-- | kgramstats.h | 23 |
1 file changed, 14 insertions, 9 deletions
| diff --git a/kgramstats.h b/kgramstats.h index 2ee0e35..49fe04e 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <list> | 6 | #include <list> |
| 7 | #include <vector> | 7 | #include <vector> |
| 8 | #include "histogram.h" | 8 | #include "histogram.h" |
| 9 | #include "identifier.h" | ||
| 9 | #include <functional> | 10 | #include <functional> |
| 10 | #include <set> | 11 | #include <set> |
| 11 | 12 | ||
| @@ -92,6 +93,9 @@ class rawr { | |||
| 92 | } | 93 | } |
| 93 | }; | 94 | }; |
| 94 | 95 | ||
| 96 | using tokenstore = identifier<token>; | ||
| 97 | using token_id = tokenstore::key_type; | ||
| 98 | |||
| 95 | enum class querytype { | 99 | enum class querytype { |
| 96 | literal, | 100 | literal, |
| 97 | sentence | 101 | sentence |
| @@ -99,12 +103,12 @@ class rawr { | |||
| 99 | 103 | ||
| 100 | struct query { | 104 | struct query { |
| 101 | querytype type; | 105 | querytype type; |
| 102 | token tok; | 106 | token_id tok; |
| 103 | 107 | ||
| 104 | query(token tok) : tok(tok), type(querytype::literal) {} | 108 | query(token_id tok) : tok(tok), type(querytype::literal) {} |
| 105 | 109 | ||
| 106 | query(querytype type) : tok(blank_word), type(type) {} | 110 | query(querytype type) : tok(0), type(type) {} |
| 107 | 111 | ||
| 108 | bool operator<(const query& other) const | 112 | bool operator<(const query& other) const |
| 109 | { | 113 | { |
| 110 | if (type == other.type) | 114 | if (type == other.type) |
| @@ -126,10 +130,10 @@ class rawr { | |||
| 126 | int all; | 130 | int all; |
| 127 | int titlecase; | 131 | int titlecase; |
| 128 | int uppercase; | 132 | int uppercase; |
| 129 | token tok; | 133 | token_id tok; |
| 130 | std::set<int> corpora; | 134 | std::set<int> corpora; |
| 131 | 135 | ||
| 132 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | 136 | token_data(token_id tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
| 133 | }; | 137 | }; |
| 134 | 138 | ||
| 135 | friend std::ostream& operator<<(std::ostream& os, kgram k); | 139 | friend std::ostream& operator<<(std::ostream& os, kgram k); |
| @@ -140,6 +144,7 @@ class rawr { | |||
| 140 | int _maxK; | 144 | int _maxK; |
| 141 | bool _compiled = false; | 145 | bool _compiled = false; |
| 142 | std::vector<std::string> _corpora; | 146 | std::vector<std::string> _corpora; |
| 147 | tokenstore _tokenstore; | ||
| 143 | std::map<kgram, std::map<int, token_data>> _stats; | 148 | std::map<kgram, std::map<int, token_data>> _stats; |
| 144 | transform_callback _transform; | 149 | transform_callback _transform; |
| 145 | int _min_corpora = 1; | 150 | int _min_corpora = 1; |
