about summary refs log tree commit diff stats
path: root/kgramstats.h
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2018-08-26 22:13:50 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2018-08-26 22:13:50 -0400
commit d75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch)
tree 013285ad6ff9c7d2c2c3174eef99b89485917756 /kgramstats.h
parent 26d75f744913a8856e46f5fccbfda8f8336924a0 (diff)
download rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz
rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2
rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip
Interned tokens to reduce memory footprint
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- kgramstats.h 23
1 files changed, 14 insertions, 9 deletions
diff --git a/kgramstats.h b/kgramstats.h
index 2ee0e35..49fe04e 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -6,6 +6,7 @@
6#include <list> 6#include <list>
7#include <vector> 7#include <vector>
8#include "histogram.h" 8#include "histogram.h"
9#include "identifier.h"
9#include <functional> 10#include <functional>
10#include <set> 11#include <set>
11 12
@@ -92,6 +93,9 @@ class rawr {
92 } 93 }
93 }; 94 };
94 95
96 using tokenstore = identifier<token>;
97 using token_id = tokenstore::key_type;
98
95 enum class querytype { 99 enum class querytype {
96 literal, 100 literal,
97 sentence 101 sentence
@@ -99,12 +103,12 @@ class rawr {
99 103
100 struct query { 104 struct query {
101 querytype type; 105 querytype type;
102 token tok; 106 token_id tok;
103 107
104 query(token tok) : tok(tok), type(querytype::literal) {} 108 query(token_id tok) : tok(tok), type(querytype::literal) {}
105 109
106 query(querytype type) : tok(blank_word), type(type) {} 110 query(querytype type) : tok(0), type(type) {}
107 111
108 bool operator<(const query& other) const 112 bool operator<(const query& other) const
109 { 113 {
110 if (type == other.type) 114 if (type == other.type)
@@ -126,10 +130,10 @@ class rawr {
126 int all; 130 int all;
127 int titlecase; 131 int titlecase;
128 int uppercase; 132 int uppercase;
129 token tok; 133 token_id tok;
130 std::set<int> corpora; 134 std::set<int> corpora;
131 135
132 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} 136 token_data(token_id tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
133 }; 137 };
134 138
135 friend std::ostream& operator<<(std::ostream& os, kgram k); 139 friend std::ostream& operator<<(std::ostream& os, kgram k);
@@ -140,6 +144,7 @@ class rawr {
140 int _maxK; 144 int _maxK;
141 bool _compiled = false; 145 bool _compiled = false;
142 std::vector<std::string> _corpora; 146 std::vector<std::string> _corpora;
147 tokenstore _tokenstore;
143 std::map<kgram, std::map<int, token_data>> _stats; 148 std::map<kgram, std::map<int, token_data>> _stats;
144 transform_callback _transform; 149 transform_callback _transform;
145 int _min_corpora = 1; 150 int _min_corpora = 1;