about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorKelly Rauchenberger <fefferburbia@gmail.com>2018-08-26 22:13:50 -0400
committerKelly Rauchenberger <fefferburbia@gmail.com>2018-08-26 22:13:50 -0400
commitd75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch)
tree013285ad6ff9c7d2c2c3174eef99b89485917756
parent26d75f744913a8856e46f5fccbfda8f8336924a0 (diff)
downloadrawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz
rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2
rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip
Interned tokens to reduce memory footprint
-rw-r--r--identifier.h59
-rw-r--r--kgramstats.cpp95
-rw-r--r--kgramstats.h23
3 files changed, 133 insertions, 44 deletions
diff --git a/identifier.h b/identifier.h new file mode 100644 index 0000000..74d83ce --- /dev/null +++ b/identifier.h
@@ -0,0 +1,59 @@
#ifndef IDENTIFIER_H_D7EE5679
#define IDENTIFIER_H_D7EE5679

#include <map>
#include <vector>

/// Interns values of type T, mapping each distinct value to a small
/// integral key. Keys are assigned densely starting at 0 in order of
/// first insertion, so they can index directly into external arrays.
template <typename T>
class identifier {
public:

  using value_type = T;

private:

  using vector_type = std::vector<value_type>;

public:

  using key_type = typename vector_type::size_type;

  /// Returns the key for val, interning it first if it has not been
  /// seen before. Equal values always receive the same key.
  ///
  /// Must not be called after compile(): the lookup table is gone by
  /// then, so keys would restart at values that collide with existing
  /// entries.
  key_type add(const value_type& val)
  {
    // Single lookup: emplace finds the existing entry or inserts the
    // next dense key in one traversal (the original did find + []).
    auto result = ids_.emplace(val, uniq_.size());

    if (result.second)
    {
      uniq_.push_back(val);
    }

    return result.first->second;
  }

  /// Frees the value-to-key lookup table once interning is finished.
  /// get() and size() remain valid; add() does not (see above).
  void compile()
  {
    ids_.clear();
  }

  /// Returns the value interned under key i.
  /// Throws std::out_of_range if i was never assigned.
  inline const value_type& get(key_type i) const
  {
    return uniq_.at(i);
  }

  /// Number of distinct values interned so far.
  inline key_type size() const
  {
    return uniq_.size();
  }

private:

  std::map<value_type, key_type> ids_;  // value -> key (dropped by compile())
  vector_type uniq_;                    // key -> value, in insertion order
};

#endif /* end of include guard: IDENTIFIER_H_D7EE5679 */
diff --git a/kgramstats.cpp b/kgramstats.cpp index c674e80..30d4407 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus)
55void rawr::compile(int maxK) 55void rawr::compile(int maxK)
56{ 56{
57 _maxK = maxK; 57 _maxK = maxK;
58 58
59 std::vector<std::vector<token>> tokens; 59 std::vector<std::vector<token_id>> tokens;
60 std::set<std::string> thashtags; 60 std::set<std::string> thashtags;
61 std::set<std::string> fv_emoticons; 61 std::set<std::string> fv_emoticons;
62 62
@@ -120,8 +120,8 @@ void rawr::compile(int maxK)
120 { 120 {
121 size_t start = 0; 121 size_t start = 0;
122 int end = 0; 122 int end = 0;
123 std::vector<token> tkcor; 123 std::vector<token_id> tkcor;
124 124
125 while (end != std::string::npos) 125 while (end != std::string::npos)
126 { 126 {
127 perprime = (startper + end) * 100 / len; 127 perprime = (startper + end) * 100 / len;
@@ -336,8 +336,8 @@ void rawr::compile(int maxK)
336 } 336 }
337 } 337 }
338 } 338 }
339 339
340 tkcor.push_back(tk); 340 tkcor.push_back(_tokenstore.add(tk));
341 } 341 }
342 342
343 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 343 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
@@ -377,9 +377,12 @@ void rawr::compile(int maxK)
377 emoticons.forms.compile(); 377 emoticons.forms.compile();
378 emoticons.terms.compile(); 378 emoticons.terms.compile();
379 379
380 // Compile the interned tokens.
381 _tokenstore.compile();
382
380 // kgram distribution 383 // kgram distribution
381 std::cout << "Creating markov chain... 0%" << std::flush; 384 std::cout << "Creating markov chain... 0%" << std::flush;
382 std::map<kgram, std::map<token, token_data> > tstats; 385 std::map<kgram, std::map<token_id, token_data> > tstats;
383 386
384 len = 0; 387 len = 0;
385 for (auto c : tokens) 388 for (auto c : tokens)
@@ -408,14 +411,15 @@ void rawr::compile(int maxK)
408 } 411 }
409 412
410 kgram prefix(corpus.begin()+i, corpus.begin()+i+k); 413 kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
411 token f = corpus[i+k]; 414 token_id fid = corpus[i+k];
415 const token& f = _tokenstore.get(fid);
412 416
413 if (tstats[prefix].count(f) == 0) 417 if (tstats[prefix].count(fid) == 0)
414 { 418 {
415 tstats[prefix].emplace(f, f); 419 tstats[prefix].emplace(fid, fid);
416 } 420 }
417 421
418 token_data& td = tstats[prefix].at(f); 422 token_data& td = tstats[prefix].at(fid);
419 td.all++; 423 td.all++;
420 td.corpora.insert(corpid); 424 td.corpora.insert(corpid);
421 425
@@ -426,19 +430,20 @@ void rawr::compile(int maxK)
426 { 430 {
427 td.titlecase++; 431 td.titlecase++;
428 } 432 }
429 433
430 if (std::begin(prefix)->tok.suffix == suffixtype::terminating) 434 const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
435 if (startTok.suffix == suffixtype::terminating)
431 { 436 {
432 kgram term_prefix(prefix); 437 kgram term_prefix(prefix);
433 term_prefix.pop_front(); 438 term_prefix.pop_front();
434 term_prefix.push_front(wildcardQuery); 439 term_prefix.push_front(wildcardQuery);
435 440
436 if (tstats[term_prefix].count(f) == 0) 441 if (tstats[term_prefix].count(fid) == 0)
437 { 442 {
438 tstats[term_prefix].emplace(f, f); 443 tstats[term_prefix].emplace(fid, fid);
439 } 444 }
440 445
441 token_data& td2 = tstats[term_prefix].at(f); 446 token_data& td2 = tstats[term_prefix].at(fid);
442 td2.all++; 447 td2.all++;
443 td2.corpora.insert(corpid); 448 td2.corpora.insert(corpid);
444 449
@@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const
600 int max = distribution.rbegin()->first; 605 int max = distribution.rbegin()->first;
601 int r = rand() % max; 606 int r = rand() % max;
602 const token_data& next = distribution.upper_bound(r)->second; 607 const token_data& next = distribution.upper_bound(r)->second;
603 std::string nextToken = next.tok.w.forms.next(); 608 const token& interned = _tokenstore.get(next.tok);
604 609 std::string nextToken = interned.w.forms.next();
610
605 // Apply user-specified transforms 611 // Apply user-specified transforms
606 if (_transform) 612 if (_transform)
607 { 613 {
608 nextToken = _transform(next.tok.w.canon, nextToken); 614 nextToken = _transform(interned.w.canon, nextToken);
609 } 615 }
610 616
611 // Determine the casing of the next token. We randomly make the token all 617 // Determine the casing of the next token. We randomly make the token all
@@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const
615 if (casing < next.uppercase) 621 if (casing < next.uppercase)
616 { 622 {
617 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 623 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
618 } else if ((((cur.rbegin()->type == querytype::sentence) 624 } else {
619 || ((cur.rbegin()->type == querytype::literal) 625 bool capitalize = false;
620 && (cur.rbegin()->tok.suffix == suffixtype::terminating))) 626
621 && (rand() % 2 > 0)) 627 if (casing - next.uppercase < next.titlecase)
622 || (casing - next.uppercase < next.titlecase)) 628 {
623 { 629 capitalize = true;
624 nextToken[0] = toupper(nextToken[0]); 630 } else if (cur.rbegin()->type == querytype::sentence)
631 {
632 if (rand() % 2 > 0)
633 {
634 capitalize = true;
635 }
636 } else {
637 const token& lastTok = _tokenstore.get(cur.rbegin()->tok);
638
639 if (lastTok.suffix == suffixtype::terminating &&
640 rand() % 2 > 0)
641 {
642 capitalize = true;
643 }
644 }
645
646 if (capitalize)
647 {
648 nextToken[0] = toupper(nextToken[0]);
649 }
625 } 650 }
626 651
627 // Delimiters 652 // Delimiters
628 for (auto& dt : next.tok.delimiters) 653 for (auto& dt : interned.delimiters)
629 { 654 {
630 if (dt.first.status == doublestatus::both) 655 if (dt.first.status == doublestatus::both)
631 { 656 {
@@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const
692 } 717 }
693 718
694 // Terminators 719 // Terminators
695 if (next.tok.suffix == suffixtype::terminating) 720 if (interned.suffix == suffixtype::terminating)
696 { 721 {
697 auto term = next.tok.w.terms.next(); 722 auto term = interned.w.terms.next();
698 nextToken.append(term.form); 723 nextToken.append(term.form);
699 724
700 if (term.newline) 725 if (term.newline)
@@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const
703 } else { 728 } else {
704 nextToken.append(" "); 729 nextToken.append(" ");
705 } 730 }
706 } else if (next.tok.suffix == suffixtype::comma) 731 } else if (interned.suffix == suffixtype::comma)
707 { 732 {
708 nextToken.append(", "); 733 nextToken.append(", ");
709 } else { 734 } else {
@@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const
734 759
735 cur.push_back(next.tok); 760 cur.push_back(next.tok);
736 result.append(nextToken); 761 result.append(nextToken);
737 762
738 if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) 763 if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
739 { 764 {
740 break; 765 break;
741 } 766 }
diff --git a/kgramstats.h b/kgramstats.h index 2ee0e35..49fe04e 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -6,6 +6,7 @@
6#include <list> 6#include <list>
7#include <vector> 7#include <vector>
8#include "histogram.h" 8#include "histogram.h"
9#include "identifier.h"
9#include <functional> 10#include <functional>
10#include <set> 11#include <set>
11 12
@@ -92,6 +93,9 @@ class rawr {
92 } 93 }
93 }; 94 };
94 95
96 using tokenstore = identifier<token>;
97 using token_id = tokenstore::key_type;
98
95 enum class querytype { 99 enum class querytype {
96 literal, 100 literal,
97 sentence 101 sentence
@@ -99,12 +103,12 @@ class rawr {
99 103
100 struct query { 104 struct query {
101 querytype type; 105 querytype type;
102 token tok; 106 token_id tok;
103 107
104 query(token tok) : tok(tok), type(querytype::literal) {} 108 query(token_id tok) : tok(tok), type(querytype::literal) {}
105 109
106 query(querytype type) : tok(blank_word), type(type) {} 110 query(querytype type) : tok(0), type(type) {}
107 111
108 bool operator<(const query& other) const 112 bool operator<(const query& other) const
109 { 113 {
110 if (type == other.type) 114 if (type == other.type)
@@ -126,10 +130,10 @@ class rawr {
126 int all; 130 int all;
127 int titlecase; 131 int titlecase;
128 int uppercase; 132 int uppercase;
129 token tok; 133 token_id tok;
130 std::set<int> corpora; 134 std::set<int> corpora;
131 135
132 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} 136 token_data(token_id tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
133 }; 137 };
134 138
135 friend std::ostream& operator<<(std::ostream& os, kgram k); 139 friend std::ostream& operator<<(std::ostream& os, kgram k);
@@ -140,6 +144,7 @@ class rawr {
140 int _maxK; 144 int _maxK;
141 bool _compiled = false; 145 bool _compiled = false;
142 std::vector<std::string> _corpora; 146 std::vector<std::string> _corpora;
147 tokenstore _tokenstore;
143 std::map<kgram, std::map<int, token_data>> _stats; 148 std::map<kgram, std::map<int, token_data>> _stats;
144 transform_callback _transform; 149 transform_callback _transform;
145 int _min_corpora = 1; 150 int _min_corpora = 1;