From d75685e69f9a5d3cfc255aa921005fc40ae6e585 Mon Sep 17 00:00:00 2001
From: Kelly Rauchenberger
Date: Sun, 26 Aug 2018 22:13:50 -0400
Subject: Interned tokens to reduce memory footprint

---
 kgramstats.h | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'kgramstats.h')

diff --git a/kgramstats.h b/kgramstats.h
index 2ee0e35..49fe04e 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -6,6 +6,7 @@
 #include
 #include
 #include "histogram.h"
+#include "identifier.h"
 #include
 #include
 
@@ -92,6 +93,9 @@ class rawr {
     }
   };
 
+  using tokenstore = identifier;
+  using token_id = tokenstore::key_type;
+
   enum class querytype {
     literal,
     sentence
@@ -99,12 +103,12 @@ class rawr {
 
   struct query {
     querytype type;
-    token tok;
-
-    query(token tok) : tok(tok), type(querytype::literal) {}
-
-    query(querytype type) : tok(blank_word), type(type) {}
-
+    token_id tok;
+
+    query(token_id tok) : tok(tok), type(querytype::literal) {}
+
+    query(querytype type) : tok(0), type(type) {}
+
     bool operator<(const query& other) const
     {
       if (type == other.type)
@@ -126,10 +130,10 @@ class rawr {
     int all;
     int titlecase;
     int uppercase;
-    token tok;
+    token_id tok;
     std::set corpora;
-
-    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
+
+    token_data(token_id tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
   };
 
   friend std::ostream& operator<<(std::ostream& os, kgram k);
@@ -140,6 +144,7 @@ class rawr {
   int _maxK;
   bool _compiled = false;
   std::vector _corpora;
+  tokenstore _tokenstore;
   std::map> _stats;
   transform_callback _transform;
   int _min_corpora = 1;
-- 
cgit 1.4.1