Interned tokens to reduce memory footprint

author: Kelly Rauchenberger <fefferburbia@gmail.com> 2018-08-26 22:13:50 -0400
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2018-08-26 22:13:50 -0400
commit: d75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch)
tree: 013285ad6ff9c7d2c2c3174eef99b89485917756 /kgramstats.cpp
parent: 26d75f744913a8856e46f5fccbfda8f8336924a0 (diff)
download: rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz
rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2
rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip
1 files changed, 60 insertions, 35 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index c674e80..30d4407 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus)
 void rawr::compile(int maxK)
 {
  _maxK = maxK;
-  
-  std::vector<std::vector<token>> tokens;
+  std::vector<std::vector<token_id>> tokens;
  std::set<std::string> thashtags;
  std::set<std::string> fv_emoticons;
  
@@ -120,8 +120,8 @@ void rawr::compile(int maxK)
  {
    size_t start = 0;
    int end = 0;
-    std::vector<token> tkcor;
+    std::vector<token_id> tkcor;
-    
    while (end != std::string::npos)
    {
      perprime = (startper + end) * 100 / len;
@@ -336,8 +336,8 @@ void rawr::compile(int maxK)
            }
          }
        }
-      
-        tkcor.push_back(tk);
+        tkcor.push_back(_tokenstore.add(tk));
      }
      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
@@ -377,9 +377,12 @@ void rawr::compile(int maxK)
  emoticons.forms.compile();
  emoticons.terms.compile();
+  // Compile the interned tokens.
+  _tokenstore.compile();
  // kgram distribution
  std::cout << "Creating markov chain...   0%" << std::flush;
-  std::map<kgram, std::map<token, token_data> > tstats;
+  std::map<kgram, std::map<token_id, token_data> > tstats;
  len = 0;
  for (auto c : tokens)
@@ -408,14 +411,15 @@ void rawr::compile(int maxK)
        }
      
        kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
-        token f = corpus[i+k];
+        token_id fid = corpus[i+k];
+        const token& f = _tokenstore.get(fid);
-        if (tstats[prefix].count(f) == 0)
+        if (tstats[prefix].count(fid) == 0)
        {
-          tstats[prefix].emplace(f, f);
+          tstats[prefix].emplace(fid, fid);
        }
-                        
-        token_data& td = tstats[prefix].at(f);
+        token_data& td = tstats[prefix].at(fid);
        td.all++;
        td.corpora.insert(corpid);
@@ -426,19 +430,20 @@ void rawr::compile(int maxK)
        {
          td.titlecase++;
        }
-      
-        if (std::begin(prefix)->tok.suffix == suffixtype::terminating)
+        const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
+        if (startTok.suffix == suffixtype::terminating)
        {
          kgram term_prefix(prefix);
          term_prefix.pop_front();
          term_prefix.push_front(wildcardQuery);
-        
-          if (tstats[term_prefix].count(f) == 0)
+          if (tstats[term_prefix].count(fid) == 0)
          {
-            tstats[term_prefix].emplace(f, f);
+            tstats[term_prefix].emplace(fid, fid);
          }
-        
-          token_data& td2 = tstats[term_prefix].at(f);
+          token_data& td2 = tstats[term_prefix].at(fid);
          td2.all++;
          td2.corpora.insert(corpid);
@@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const
    int max = distribution.rbegin()->first;
    int r = rand() % max;
    const token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken = next.tok.w.forms.next();
+    const token& interned = _tokenstore.get(next.tok);
-    
+    std::string nextToken = interned.w.forms.next();
    // Apply user-specified transforms
    if (_transform)
    {
-      nextToken = _transform(next.tok.w.canon, nextToken);
+      nextToken = _transform(interned.w.canon, nextToken);
    }
  
    // Determine the casing of the next token. We randomly make the token all
@@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const
    if (casing < next.uppercase)
    {
      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-    } else if ((((cur.rbegin()->type == querytype::sentence)
+    } else {
-          || ((cur.rbegin()->type == querytype::literal)
+      bool capitalize = false;
-            && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
-        && (rand() % 2 > 0))
+      if (casing - next.uppercase < next.titlecase)
-      || (casing - next.uppercase < next.titlecase))
+      {
-    {
+        capitalize = true;
-      nextToken[0] = toupper(nextToken[0]);
+      } else if (cur.rbegin()->type == querytype::sentence)
+      {
+        if (rand() % 2 > 0)
+        {
+          capitalize = true;
+        }
+      } else {
+        const token& lastTok = _tokenstore.get(cur.rbegin()->tok);
+        if (lastTok.suffix == suffixtype::terminating &&
+            rand() % 2 > 0)
+        {
+          capitalize = true;
+        }
+      }
+      if (capitalize)
+      {
+        nextToken[0] = toupper(nextToken[0]);
+      }
    }
    
    // Delimiters
-    for (auto& dt : next.tok.delimiters)
+    for (auto& dt : interned.delimiters)
    {
      if (dt.first.status == doublestatus::both)
      {
@@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const
    }
    
    // Terminators
-    if (next.tok.suffix == suffixtype::terminating)
+    if (interned.suffix == suffixtype::terminating)
    {
-      auto term = next.tok.w.terms.next();
+      auto term = interned.w.terms.next();
      nextToken.append(term.form);
      
      if (term.newline)
@@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const
      } else {
        nextToken.append(" ");
      }
-    } else if (next.tok.suffix == suffixtype::comma)
+    } else if (interned.suffix == suffixtype::comma)
    {
      nextToken.append(", ");
    } else {
@@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const
    cur.push_back(next.tok);
    result.append(nextToken);
-        
-    if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
+    if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
    {
      break;
    }
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2018-08-26 22:13:50 -0400
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2018-08-26 22:13:50 -0400
commit	d75685e69f9a5d3cfc255aa921005fc40ae6e585 (patch)
tree	013285ad6ff9c7d2c2c3174eef99b89485917756 /kgramstats.cpp
parent	26d75f744913a8856e46f5fccbfda8f8336924a0 (diff)
download	rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.gz rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.tar.bz2 rawr-ebooks-d75685e69f9a5d3cfc255aa921005fc40ae6e585.zip

diff --git a/kgramstats.cpp b/kgramstats.cpp
index c674e80..30d4407 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -55,8 +55,8 @@ void rawr::addCorpus(std::string corpus)
55	void rawr::compile(int maxK)	55	void rawr::compile(int maxK)
56	{	56	{
57	_maxK = maxK;	57	_maxK = maxK;
58		58
59	std::vector<std::vector<token>> tokens;	59	std::vector<std::vector<token_id>> tokens;
60	std::set<std::string> thashtags;	60	std::set<std::string> thashtags;
61	std::set<std::string> fv_emoticons;	61	std::set<std::string> fv_emoticons;
62		62
@@ -120,8 +120,8 @@ void rawr::compile(int maxK)
120	{	120	{
121	size_t start = 0;	121	size_t start = 0;
122	int end = 0;	122	int end = 0;
123	std::vector<token> tkcor;	123	std::vector<token_id> tkcor;
124		124
125	while (end != std::string::npos)	125	while (end != std::string::npos)
126	{	126	{
127	perprime = (startper + end) * 100 / len;	127	perprime = (startper + end) * 100 / len;
@@ -336,8 +336,8 @@ void rawr::compile(int maxK)
336	}	336	}
337	}	337	}
338	}	338	}
339		339
340	tkcor.push_back(tk);	340	tkcor.push_back(_tokenstore.add(tk));
341	}	341	}
342		342
343	start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);	343	start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
@@ -377,9 +377,12 @@ void rawr::compile(int maxK)
377	emoticons.forms.compile();	377	emoticons.forms.compile();
378	emoticons.terms.compile();	378	emoticons.terms.compile();
379		379
		380	// Compile the interned tokens.
		381	_tokenstore.compile();
		382
380	// kgram distribution	383	// kgram distribution
381	std::cout << "Creating markov chain... 0%" << std::flush;	384	std::cout << "Creating markov chain... 0%" << std::flush;
382	std::map<kgram, std::map<token, token_data> > tstats;	385	std::map<kgram, std::map<token_id, token_data> > tstats;
383		386
384	len = 0;	387	len = 0;
385	for (auto c : tokens)	388	for (auto c : tokens)
@@ -408,14 +411,15 @@ void rawr::compile(int maxK)
408	}	411	}
409		412
410	kgram prefix(corpus.begin()+i, corpus.begin()+i+k);	413	kgram prefix(corpus.begin()+i, corpus.begin()+i+k);
411	token f = corpus[i+k];	414	token_id fid = corpus[i+k];
		415	const token& f = _tokenstore.get(fid);
412		416
413	if (tstats[prefix].count(f) == 0)	417	if (tstats[prefix].count(fid) == 0)
414	{	418	{
415	tstats[prefix].emplace(f, f);	419	tstats[prefix].emplace(fid, fid);
416	}	420	}
417		421
418	token_data& td = tstats[prefix].at(f);	422	token_data& td = tstats[prefix].at(fid);
419	td.all++;	423	td.all++;
420	td.corpora.insert(corpid);	424	td.corpora.insert(corpid);
421		425
@@ -426,19 +430,20 @@ void rawr::compile(int maxK)
426	{	430	{
427	td.titlecase++;	431	td.titlecase++;
428	}	432	}
429		433
430	if (std::begin(prefix)->tok.suffix == suffixtype::terminating)	434	const token& startTok = _tokenstore.get(std::begin(prefix)->tok);
		435	if (startTok.suffix == suffixtype::terminating)
431	{	436	{
432	kgram term_prefix(prefix);	437	kgram term_prefix(prefix);
433	term_prefix.pop_front();	438	term_prefix.pop_front();
434	term_prefix.push_front(wildcardQuery);	439	term_prefix.push_front(wildcardQuery);
435		440
436	if (tstats[term_prefix].count(f) == 0)	441	if (tstats[term_prefix].count(fid) == 0)
437	{	442	{
438	tstats[term_prefix].emplace(f, f);	443	tstats[term_prefix].emplace(fid, fid);
439	}	444	}
440		445
441	token_data& td2 = tstats[term_prefix].at(f);	446	token_data& td2 = tstats[term_prefix].at(fid);
442	td2.all++;	447	td2.all++;
443	td2.corpora.insert(corpid);	448	td2.corpora.insert(corpid);
444		449
@@ -600,12 +605,13 @@ std::string rawr::randomSentence(int maxL) const
600	int max = distribution.rbegin()->first;	605	int max = distribution.rbegin()->first;
601	int r = rand() % max;	606	int r = rand() % max;
602	const token_data& next = distribution.upper_bound(r)->second;	607	const token_data& next = distribution.upper_bound(r)->second;
603	std::string nextToken = next.tok.w.forms.next();	608	const token& interned = _tokenstore.get(next.tok);
604		609	std::string nextToken = interned.w.forms.next();
		610
605	// Apply user-specified transforms	611	// Apply user-specified transforms
606	if (_transform)	612	if (_transform)
607	{	613	{
608	nextToken = _transform(next.tok.w.canon, nextToken);	614	nextToken = _transform(interned.w.canon, nextToken);
609	}	615	}
610		616
611	// Determine the casing of the next token. We randomly make the token all	617	// Determine the casing of the next token. We randomly make the token all
@@ -615,17 +621,36 @@ std::string rawr::randomSentence(int maxL) const
615	if (casing < next.uppercase)	621	if (casing < next.uppercase)
616	{	622	{
617	std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);	623	std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
618	} else if ((((cur.rbegin()->type == querytype::sentence)	624	} else {
619	|| ((cur.rbegin()->type == querytype::literal)	625	bool capitalize = false;
620	&& (cur.rbegin()->tok.suffix == suffixtype::terminating)))	626
621	&& (rand() % 2 > 0))	627	if (casing - next.uppercase < next.titlecase)
622	|| (casing - next.uppercase < next.titlecase))	628	{
623	{	629	capitalize = true;
624	nextToken[0] = toupper(nextToken[0]);	630	} else if (cur.rbegin()->type == querytype::sentence)
		631	{
		632	if (rand() % 2 > 0)
		633	{
		634	capitalize = true;
		635	}
		636	} else {
		637	const token& lastTok = _tokenstore.get(cur.rbegin()->tok);
		638
		639	if (lastTok.suffix == suffixtype::terminating &&
		640	rand() % 2 > 0)
		641	{
		642	capitalize = true;
		643	}
		644	}
		645
		646	if (capitalize)
		647	{
		648	nextToken[0] = toupper(nextToken[0]);
		649	}
625	}	650	}
626		651
627	// Delimiters	652	// Delimiters
628	for (auto& dt : next.tok.delimiters)	653	for (auto& dt : interned.delimiters)
629	{	654	{
630	if (dt.first.status == doublestatus::both)	655	if (dt.first.status == doublestatus::both)
631	{	656	{
@@ -692,9 +717,9 @@ std::string rawr::randomSentence(int maxL) const
692	}	717	}
693		718
694	// Terminators	719	// Terminators
695	if (next.tok.suffix == suffixtype::terminating)	720	if (interned.suffix == suffixtype::terminating)
696	{	721	{
697	auto term = next.tok.w.terms.next();	722	auto term = interned.w.terms.next();
698	nextToken.append(term.form);	723	nextToken.append(term.form);
699		724
700	if (term.newline)	725	if (term.newline)
@@ -703,7 +728,7 @@ std::string rawr::randomSentence(int maxL) const
703	} else {	728	} else {
704	nextToken.append(" ");	729	nextToken.append(" ");
705	}	730	}
706	} else if (next.tok.suffix == suffixtype::comma)	731	} else if (interned.suffix == suffixtype::comma)
707	{	732	{
708	nextToken.append(", ");	733	nextToken.append(", ");
709	} else {	734	} else {
@@ -734,8 +759,8 @@ std::string rawr::randomSentence(int maxL) const
734		759
735	cur.push_back(next.tok);	760	cur.push_back(next.tok);
736	result.append(nextToken);	761	result.append(nextToken);
737		762
738	if ((next.tok.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))	763	if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0)))
739	{	764	{
740	break;	765	break;
741	}	766	}