Pulled the ebooks functionality out into a library

author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-20 23:14:06 -0400
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-20 23:15:10 -0400
commit: 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch)
tree: 0d9a8a12616d6ea335fdc687049b05f679e8ccc6 /kgramstats.cpp
parent: a9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff)
download: rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz
rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2
rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip
1 files changed, 252 insertions, 213 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index a44bf2b..47f3bc0 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -33,32 +33,47 @@
 //  
 #include "kgramstats.h"
-#include <vector>
 #include <iostream>
-#include <cstdlib>
 #include <cstring>
 #include <algorithm>
 #include <set>
 #include <stack>
-#include "freevars.h"
-#include <fstream>
 #include "prefix_search.h"
 #include <aspell.h>
+#include <fstream>
+const rawr::query rawr::wildcardQuery = {querytype::sentence};
+const rawr::word rawr::blank_word = {""};
-query wildcardQuery {querytype::sentence};
+void rawr::addCorpus(std::string corpus)
-word blank_word {""};
+{
+  _corpora.push_back(corpus);
+}
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
-kgramstats::kgramstats(std::string corpus, int maxK)
+void rawr::compile(int maxK)
 {
-  this->maxK = maxK;
+  _maxK = maxK;
  
  std::vector<token> tokens;
  size_t start = 0;
-  int end = 0;
  std::set<std::string> thashtags;
-  freevar fv_emoticons {emoticons, "emoticons.txt"};
+  std::set<std::string> fv_emoticons;
+  
+  std::ifstream fvefile("emoticons.txt");
+  if (fvefile)
+  {
+    std::string line;
+    while (getline(fvefile, line))
+    {
+      fv_emoticons.insert(line);
+      emoticons.forms.add(line);
+    }
+  }
+  
+  fvefile.close();
+  
  std::map<std::string, std::string> canonical_form;
  
  AspellConfig* spell_config = new_aspell_config();
@@ -92,216 +107,229 @@ kgramstats::kgramstats(std::string corpus, int maxK)
  }
  std::cout << "Tokenizing corpus...   0%" << std::flush;
-  int len = corpus.length();
+  int len = 0;
+  for (auto c : _corpora)
+  {
+    len += c.length();
+  }
+  
+  int startper = 0;
  int per = 0;
  int perprime = 0;
  std::cout.fill(' ');
-  while (end != std::string::npos)
+  for (int i = 0; i < _corpora.size(); i++)
  {
-    perprime = end * 100 / len;
+    int end = 0;
-    if (perprime != per)
+    
+    while (end != std::string::npos)
    {
-      per = perprime;
+      perprime = (startper + end) * 100 / len;
+      if (perprime != per)
+      {
+        per = perprime;
      
-      std::cout << "\b\b\b\b" << std::right;
+        std::cout << "\b\b\b\b" << std::right;
-      std::cout.width(3);
+        std::cout.width(3);
-      std::cout << per << "%" << std::flush;
+        std::cout << per << "%" << std::flush;
-    }
+      }
    
-    end = corpus.find(" ", start);
+      end = _corpora[i].find(" ", start);
-    bool emoji = false;
+      bool emoji = false;
-    std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    std::string t = "";
+      std::string t = "";
    
-    if (te.compare("") && te.compare("."))
+      if (te.compare("") && te.compare("."))
-    {
-      // Extract strings of emojis into their own tokens even if they're not space delimited
-      int m = emojis.match(te);
-      emoji = m > 0;
-      if (m == 0) m = 1;
-      t = te.substr(0,m);
-      te = te.substr(m);
-      
-      while (!te.empty())
      {
-        m = emojis.match(te);
+        // Extract strings of emojis into their own tokens even if they're not space delimited
-        if (emoji == (m > 0))
+        int m = emojis.match(te);
+        emoji = m > 0;
+        if (m == 0) m = 1;
+        t = te.substr(0,m);
+        te = te.substr(m);
+      
+        while (!te.empty())
        {
-          if (m == 0) m = 1;
+          m = emojis.match(te);
-          t += te.substr(0,m);
+          if (emoji == (m > 0))
-          te = te.substr(m);
+          {
-        } else {
+            if (m == 0) m = 1;
-          end = start + t.length() - 1;
+            t += te.substr(0,m);
-          break;
+            te = te.substr(m);
+          } else {
+            end = start + t.length() - 1;
+            break;
+          }
        }
-      }
      
-      std::string tc(t);
+        std::string tc(t);
-      std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
+        std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
-      int pst = tc.find_first_not_of("\"([*");
+        int pst = tc.find_first_not_of("\"([*");
-      int dst = tc.find_last_not_of("\")]*.,?!\n");
+        int dst = tc.find_last_not_of("\")]*.,?!\n");
-      std::string canonical("");
+        std::string canonical("");
-      if ((pst != std::string::npos) && (dst != std::string::npos))
+        if ((pst != std::string::npos) && (dst != std::string::npos))
-      {
-        canonical = std::string(tc, pst, dst - pst + 1);
-      }
-      
-      word& w = ([&] () -> word& {
-        // Hashtag freevar
-        if (canonical[0] == '#')
        {
-          thashtags.insert(canonical);
+          canonical = std::string(tc, pst, dst - pst + 1);
-          
-          return hashtags;
        }
-        
+      
-        // Emoticon freevar
+        word& w = ([&] () -> word& {
-        if (emoji)
+          // Hashtag freevar
-        {
+          if (canonical[0] == '#')
-          emoticons.forms.add(canonical);
+          {
+            thashtags.insert(canonical);
          
-          return emoticons;
+            return hashtags;
-        }
+          }
        
-        if ((pst != std::string::npos) && (dst != std::string::npos))
+          // Emoticon freevar
-        {
+          if (emoji)
-          std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
-          if (fv_emoticons.check(emoticon_canon))
          {
-            emoticons.forms.add(emoticon_canon);
+            emoticons.forms.add(canonical);
          
            return emoticons;
          }
-        }
        
-        // Basically any other word
+          if ((pst != std::string::npos) && (dst != std::string::npos))
-        if (canonical_form.count(canonical) == 0)
-        {
-          if (
-            // Legacy freevars should be distinct from tokens containing similar words
-            (canonical.find("$name$") != std::string::npos)
-            // Words with no letters will be mangled by the spell checker
-            || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
-            )
          {
-            canonical_form[canonical] = canonical;
+            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
-            words.emplace(canonical, canonical);
+            if (fv_emoticons.count(emoticon_canon) == 1)
-          } else {
+            {
-            int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
+              emoticons.forms.add(emoticon_canon);
-            if (correct)
+          
+              return emoticons;
+            }
+          }
+        
+          // Basically any other word
+          if (canonical_form.count(canonical) == 0)
+          {
+            if (
+              // Legacy freevars should be distinct from tokens containing similar words
+              (canonical.find("$name$") != std::string::npos)
+              // Words with no letters will be mangled by the spell checker
+              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
+              )
            {
-              words.emplace(canonical, canonical);
              canonical_form[canonical] = canonical;
+              words.emplace(canonical, canonical);
            } else {
-              const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
+              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
-              AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
+              if (correct)
-              const char* replacement = aspell_string_enumeration_next(elements);
-              if (replacement != NULL)
              {
-                std::string sugrep(replacement);
-                canonical_form[canonical] = sugrep;
-          
-                if (words.count(sugrep) == 0)
-                {
-                  words.emplace(sugrep, sugrep);
-                }
-              } else {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
-              }
+              } else {
+                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
+                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
+                const char* replacement = aspell_string_enumeration_next(elements);
+                if (replacement != NULL)
+                {
+                  std::string sugrep(replacement);
+                  canonical_form[canonical] = sugrep;
          
-              delete_aspell_string_enumeration(elements);
+                  if (words.count(sugrep) == 0)
+                  {
+                    words.emplace(sugrep, sugrep);
+                  }
+                } else {
+                  words.emplace(canonical, canonical);
+                  canonical_form[canonical] = canonical;
+                }
+          
+                delete_aspell_string_enumeration(elements);
+              }
            }
          }
-        }
        
-        word& tw = words.at(canonical_form.at(canonical));
+          word& tw = words.at(canonical_form.at(canonical));
-        tw.forms.add(canonical);
+          tw.forms.add(canonical);
        
-        return tw;
+          return tw;
-      })();
+        })();
      
-      token tk(w);
+        token tk(w);
-      tk.raw = t;
+        tk.raw = t;
      
-      for (char c : t)
+        for (char c : t)
-      {
-        if (c == '*')
        {
-          tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
+          if (c == '*')
-        } else if (c == '[')
+          {
-        {
+            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
-          tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
+          } else if (c == '[')
-        } else if (c == '(')
+          {
-        {
+            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
-          tk.delimiters[{parentype::paren, doublestatus::opening}]++;
+          } else if (c == '(')
-        } else if (c == '"')
+          {
-        {
+            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
-          tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+          } else if (c == '"')
-        } else {
+          {
-          break;
+            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+          } else {
+            break;
+          }
        }
-      }
      
-      int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
+        int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
-      if (backtrack != t.length())
+        if (backtrack != t.length())
-      {
-        std::string ending = t.substr(backtrack);
-        std::string suffix;
-        
-        for (char c : ending)
        {
-          if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+          std::string ending = t.substr(backtrack);
+          std::string suffix;
+        
+          for (char c : ending)
          {
-            suffix += c;
+            if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+            {
+              suffix += c;
            
-            continue;
+              continue;
-          } else if (c == '\n')
+            } else if (c == '\n')
-          {
-            // At least the end is coming
-            if (suffix.empty())
            {
-              suffix = ".";
+              // At least the end is coming
-            }
+              if (suffix.empty())
+              {
+                suffix = ".";
+              }
            
-            break;
+              break;
-          }
+            }
+          
+            parentype pt = ([&] {
+              switch (c)
+              {
+                case ']': return parentype::square_bracket;
+                case ')': return parentype::paren;
+                case '*': return parentype::asterisk;
+                case '"': return parentype::quote;
+              }
+            })();
          
-          parentype pt = ([&] {
+            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
-            switch (c)
            {
-              case ']': return parentype::square_bracket;
+              tk.delimiters[{pt, doublestatus::opening}]--;
-              case ')': return parentype::paren;
+              tk.delimiters[{pt, doublestatus::both}]++;
-              case '*': return parentype::asterisk;
+            } else {
-              case '"': return parentype::quote;
+              tk.delimiters[{pt, doublestatus::closing}]++;
            }
-          })();
-          
-          if (tk.delimiters[{pt, doublestatus::opening}] > 0)
-          {
-            tk.delimiters[{pt, doublestatus::opening}]--;
-            tk.delimiters[{pt, doublestatus::both}]++;
-          } else {
-            tk.delimiters[{pt, doublestatus::closing}]++;
          }
-        }
        
-        if (suffix == ",")
+          if (suffix == ",")
-        {
+          {
-          tk.suffix = suffixtype::comma;
+            tk.suffix = suffixtype::comma;
-        } else if (!suffix.empty()) {
+          } else if (!suffix.empty()) {
-          tk.suffix = suffixtype::terminating;
+            tk.suffix = suffixtype::terminating;
          
-          w.terms.add(suffix);
+            w.terms.add(suffix);
+          }
        }
-      }
      
-      tokens.push_back(tk);
+        tokens.push_back(tk);
-    }
+      }
-    start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
+      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
+    }
+    
+    startper += _corpora[i].length();
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
@@ -420,7 +448,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
    
    kgram klist = it.first;
    auto& probtable = it.second;
-    auto& distribution = stats[klist];
+    auto& distribution = _stats[klist];
    int max = 0;
                
    for (auto& kt : probtable)
@@ -432,33 +460,61 @@ kgramstats::kgramstats(std::string corpus, int maxK)
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
+  
+  _compiled = true;
 }
-void printKgram(kgram k)
+std::ostream& operator<<(std::ostream& os, rawr::kgram k)
 {
  for (auto& q : k)
  {
-    if (q.type == querytype::sentence)
+    os << q << " ";
-    {
-      std::cout << "#.# ";
-    } else if (q.type == querytype::literal)
-    {
-      if (q.tok.suffix == suffixtype::terminating)
-      {
-        std::cout << q.tok.w.canon << ". ";
-      } else if (q.tok.suffix == suffixtype::comma)
-      {
-        std::cout << q.tok.w.canon << ", ";
-      } else {
-        std::cout << q.tok.w.canon << " ";
-      }
-    }
  }
+  
+  return os;
+}
+std::ostream& operator<<(std::ostream& os, rawr::query q)
+{
+  if (q.type == rawr::querytype::sentence)
+  {
+    return os << "#.#";
+  } else if (q.type == rawr::querytype::literal)
+  {
+    return os << q.tok;
+  }
+  
+  return os;
+}
+std::ostream& operator<<(std::ostream& os, rawr::token t)
+{
+  os << t.w.canon;
+  
+  if (t.suffix == rawr::suffixtype::terminating)
+  {
+    return os << ".";
+  } else if (t.suffix == rawr::suffixtype::comma)
+  {
+    return os << ",";
+  } else {
+    return os;
+  }
+}
+void rawr::setTransformCallback(transform_callback _arg)
+{
+  _transform = _arg;
 }
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::string kgramstats::randomSentence(int maxL)
+std::string rawr::randomSentence(int maxL)
 {
+  if (!_compiled)
+  {
+    return "";
+  }
+  
  std::string result;
  kgram cur(1, wildcardQuery);
  int cuts = 0;
@@ -466,14 +522,14 @@ std::string kgramstats::randomSentence(int maxL)
        
  for (;;)
  {
-    if (cur.size() == maxK)
+    if (cur.size() == _maxK)
    {
      cur.pop_front();
    }
    
    if (cur.size() > 0)
    {
-      if (rand() % (maxK - cur.size() + 1) == 0)
+      if (rand() % (_maxK - cur.size() + 1) == 0)
      {
        while ((cur.size() > 2) && (cuts > 0))
        {
@@ -490,16 +546,22 @@ std::string kgramstats::randomSentence(int maxL)
    
    // Gotta circumvent the last line of the input corpus
    // https://twitter.com/starla4444/status/684222271339237376
-    if (stats.count(cur) == 0)
+    if (_stats.count(cur) == 0)
    {
      cur = kgram(1, wildcardQuery);
    }
-    auto& distribution = stats[cur];
+    auto& distribution = _stats[cur];
    int max = distribution.rbegin()->first;
    int r = rand() % max;
    token_data& next = distribution.upper_bound(r)->second;
    std::string nextToken = next.tok.w.forms.next();
+    
+    // Apply user-specified transforms
+    if (_transform)
+    {
+      nextToken = _transform(next.tok.w.canon, nextToken);
+    }
  
    // Determine the casing of the next token. We randomly make the token all
    // caps based on the markov chain. Otherwise, we check if the previous
@@ -600,8 +662,7 @@ std::string kgramstats::randomSentence(int maxL)
    }
                
    /* DEBUG */
-    printKgram(cur);
+    std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
-    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
    cur.push_back(next.tok);
                
@@ -633,29 +694,7 @@ std::string kgramstats::randomSentence(int maxL)
    open_delimiters.pop();
  }
  
-  // Replace old-style freevars while I can't be bothered to remake the corpus yet
+  result.resize(maxL);
-  std::vector<std::string> fv_names;
-  std::ifstream namefile("names.txt");
-  if (namefile.is_open())
-  {
-    while (!namefile.eof())
-    {
-      std::string l;
-      getline(namefile, l);
-      if (l.back() == '\r')
-      {
-        l.pop_back();
-      }
-      
-      fv_names.push_back(l);
-    }
-  
-    int cpos;
-    while ((cpos = result.find("$name$")) != std::string::npos)
-    {
-      result.replace(cpos, 6, fv_names[rand() % fv_names.size()]);
-    }
-  }
        
  return result;
 }
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-05-20 23:14:06 -0400
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-05-20 23:15:10 -0400
commit	8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch)
tree	0d9a8a12616d6ea335fdc687049b05f679e8ccc6 /kgramstats.cpp
parent	a9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff)
download	rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2 rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip

diff --git a/kgramstats.cpp b/kgramstats.cpp index a44bf2b..47f3bc0 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -33,32 +33,47 @@
33	//	33	//
34		34
35	#include "kgramstats.h"	35	#include "kgramstats.h"
36	#include <vector>
37	#include <iostream>	36	#include <iostream>
38	#include <cstdlib>
39	#include <cstring>	37	#include <cstring>
40	#include <algorithm>	38	#include <algorithm>
41	#include <set>	39	#include <set>
42	#include <stack>	40	#include <stack>
43	#include "freevars.h"
44	#include <fstream>
45	#include "prefix_search.h"	41	#include "prefix_search.h"
46	#include <aspell.h>	42	#include <aspell.h>
		43	#include <fstream>
		44
		45	const rawr::query rawr::wildcardQuery = {querytype::sentence};
		46	const rawr::word rawr::blank_word = {""};
47		47
48	query wildcardQuery {querytype::sentence};	48	void rawr::addCorpus(std::string corpus)
49	word blank_word {""};	49	{
		50	_corpora.push_back(corpus);
		51	}
50		52
51	// runs in O(t^2) time where t is the number of tokens in the input corpus	53	// runs in O(t^2) time where t is the number of tokens in the input corpus
52	// We consider maxK to be fairly constant	54	// We consider maxK to be fairly constant
53	kgramstats::kgramstats(std::string corpus, int maxK)	55	void rawr::compile(int maxK)
54	{	56	{
55	this->maxK = maxK;	57	_maxK = maxK;
56		58
57	std::vector<token> tokens;	59	std::vector<token> tokens;
58	size_t start = 0;	60	size_t start = 0;
59	int end = 0;
60	std::set<std::string> thashtags;	61	std::set<std::string> thashtags;
61	freevar fv_emoticons {emoticons, "emoticons.txt"};	62	std::set<std::string> fv_emoticons;
		63
		64	std::ifstream fvefile("emoticons.txt");
		65	if (fvefile)
		66	{
		67	std::string line;
		68	while (getline(fvefile, line))
		69	{
		70	fv_emoticons.insert(line);
		71	emoticons.forms.add(line);
		72	}
		73	}
		74
		75	fvefile.close();
		76
62	std::map<std::string, std::string> canonical_form;	77	std::map<std::string, std::string> canonical_form;
63		78
64	AspellConfig* spell_config = new_aspell_config();	79	AspellConfig* spell_config = new_aspell_config();
@@ -92,216 +107,229 @@ kgramstats::kgramstats(std::string corpus, int maxK)
92	}	107	}
93		108
94	std::cout << "Tokenizing corpus... 0%" << std::flush;	109	std::cout << "Tokenizing corpus... 0%" << std::flush;
95	int len = corpus.length();	110	int len = 0;
		111	for (auto c : _corpora)
		112	{
		113	len += c.length();
		114	}
		115
		116	int startper = 0;
96	int per = 0;	117	int per = 0;
97	int perprime = 0;	118	int perprime = 0;
98	std::cout.fill(' ');	119	std::cout.fill(' ');
99	while (end != std::string::npos)	120	for (int i = 0; i < _corpora.size(); i++)
100	{	121	{
101	perprime = end * 100 / len;	122	int end = 0;
102	if (perprime != per)	123
		124	while (end != std::string::npos)
103	{	125	{
104	per = perprime;	126	perprime = (startper + end) * 100 / len;
		127	if (perprime != per)
		128	{
		129	per = perprime;
105		130
106	std::cout << "\b\b\b\b" << std::right;	131	std::cout << "\b\b\b\b" << std::right;
107	std::cout.width(3);	132	std::cout.width(3);
108	std::cout << per << "%" << std::flush;	133	std::cout << per << "%" << std::flush;
109	}	134	}
110		135
111	end = corpus.find(" ", start);	136	end = _corpora[i].find(" ", start);
112		137
113	bool emoji = false;	138	bool emoji = false;
114	std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);	139	std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start);
115	std::string t = "";	140	std::string t = "";
116		141
117	if (te.compare("") && te.compare("."))	142	if (te.compare("") && te.compare("."))
118	{
119	// Extract strings of emojis into their own tokens even if they're not space delimited
120	int m = emojis.match(te);
121	emoji = m > 0;
122	if (m == 0) m = 1;
123	t = te.substr(0,m);
124	te = te.substr(m);
125
126	while (!te.empty())
127	{	143	{
128	m = emojis.match(te);	144	// Extract strings of emojis into their own tokens even if they're not space delimited
129	if (emoji == (m > 0))	145	int m = emojis.match(te);
		146	emoji = m > 0;
		147	if (m == 0) m = 1;
		148	t = te.substr(0,m);
		149	te = te.substr(m);
		150
		151	while (!te.empty())
130	{	152	{
131	if (m == 0) m = 1;	153	m = emojis.match(te);
132	t += te.substr(0,m);	154	if (emoji == (m > 0))
133	te = te.substr(m);	155	{
134	} else {	156	if (m == 0) m = 1;
135	end = start + t.length() - 1;	157	t += te.substr(0,m);
136	break;	158	te = te.substr(m);
		159	} else {
		160	end = start + t.length() - 1;
		161	break;
		162	}
137	}	163	}
138	}
139		164
140	std::string tc(t);	165	std::string tc(t);
141	std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);	166	std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
142		167
143	int pst = tc.find_first_not_of("\"([*");	168	int pst = tc.find_first_not_of("\"([*");
144	int dst = tc.find_last_not_of("\")]*.,?!\n");	169	int dst = tc.find_last_not_of("\")]*.,?!\n");
145	std::string canonical("");	170	std::string canonical("");
146	if ((pst != std::string::npos) && (dst != std::string::npos))	171	if ((pst != std::string::npos) && (dst != std::string::npos))
147	{
148	canonical = std::string(tc, pst, dst - pst + 1);
149	}
150
151	word& w = ([&] () -> word& {
152	// Hashtag freevar
153	if (canonical[0] == '#')
154	{	172	{
155	thashtags.insert(canonical);	173	canonical = std::string(tc, pst, dst - pst + 1);
156
157	return hashtags;
158	}	174	}
159		175
160	// Emoticon freevar	176	word& w = ([&] () -> word& {
161	if (emoji)	177	// Hashtag freevar
162	{	178	if (canonical[0] == '#')
163	emoticons.forms.add(canonical);	179	{
		180	thashtags.insert(canonical);
164		181
165	return emoticons;	182	return hashtags;
166	}	183	}
167		184
168	if ((pst != std::string::npos) && (dst != std::string::npos))	185	// Emoticon freevar
169	{	186	if (emoji)
170	std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
171	if (fv_emoticons.check(emoticon_canon))
172	{	187	{
173	emoticons.forms.add(emoticon_canon);	188	emoticons.forms.add(canonical);
174		189
175	return emoticons;	190	return emoticons;
176	}	191	}
177	}
178		192
179	// Basically any other word	193	if ((pst != std::string::npos) && (dst != std::string::npos))
180	if (canonical_form.count(canonical) == 0)
181	{
182	if (
183	// Legacy freevars should be distinct from tokens containing similar words
184	(canonical.find("$name$") != std::string::npos)
185	// Words with no letters will be mangled by the spell checker
186	\|\| (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
187	)
188	{	194	{
189	canonical_form[canonical] = canonical;	195	std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
190	words.emplace(canonical, canonical);	196	if (fv_emoticons.count(emoticon_canon) == 1)
191	} else {	197	{
192	int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());	198	emoticons.forms.add(emoticon_canon);
193	if (correct)	199
		200	return emoticons;
		201	}
		202	}
		203
		204	// Basically any other word
		205	if (canonical_form.count(canonical) == 0)
		206	{
		207	if (
		208	// Legacy freevars should be distinct from tokens containing similar words
		209	(canonical.find("$name$") != std::string::npos)
		210	// Words with no letters will be mangled by the spell checker
		211	\|\| (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
		212	)
194	{	213	{
195	words.emplace(canonical, canonical);
196	canonical_form[canonical] = canonical;	214	canonical_form[canonical] = canonical;
		215	words.emplace(canonical, canonical);
197	} else {	216	} else {
198	const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());	217	int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
199	AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);	218	if (correct)
200	const char* replacement = aspell_string_enumeration_next(elements);
201	if (replacement != NULL)
202	{	219	{
203	std::string sugrep(replacement);
204	canonical_form[canonical] = sugrep;
205
206	if (words.count(sugrep) == 0)
207	{
208	words.emplace(sugrep, sugrep);
209	}
210	} else {
211	words.emplace(canonical, canonical);	220	words.emplace(canonical, canonical);
212	canonical_form[canonical] = canonical;	221	canonical_form[canonical] = canonical;
213	}	222	} else {
		223	const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
		224	AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
		225	const char* replacement = aspell_string_enumeration_next(elements);
		226	if (replacement != NULL)
		227	{
		228	std::string sugrep(replacement);
		229	canonical_form[canonical] = sugrep;
214		230
215	delete_aspell_string_enumeration(elements);	231	if (words.count(sugrep) == 0)
		232	{
		233	words.emplace(sugrep, sugrep);
		234	}
		235	} else {
		236	words.emplace(canonical, canonical);
		237	canonical_form[canonical] = canonical;
		238	}
		239
		240	delete_aspell_string_enumeration(elements);
		241	}
216	}	242	}
217	}	243	}
218	}
219		244
220	word& tw = words.at(canonical_form.at(canonical));	245	word& tw = words.at(canonical_form.at(canonical));
221	tw.forms.add(canonical);	246	tw.forms.add(canonical);
222		247
223	return tw;	248	return tw;
224	})();	249	})();
225		250
226	token tk(w);	251	token tk(w);
227	tk.raw = t;	252	tk.raw = t;
228		253
229	for (char c : t)	254	for (char c : t)
230	{
231	if (c == '*')
232	{	255	{
233	tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;	256	if (c == '*')
234	} else if (c == '[')	257	{
235	{	258	tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
236	tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;	259	} else if (c == '[')
237	} else if (c == '(')	260	{
238	{	261	tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
239	tk.delimiters[{parentype::paren, doublestatus::opening}]++;	262	} else if (c == '(')
240	} else if (c == '"')	263	{
241	{	264	tk.delimiters[{parentype::paren, doublestatus::opening}]++;
242	tk.delimiters[{parentype::quote, doublestatus::opening}]++;	265	} else if (c == '"')
243	} else {	266	{
244	break;	267	tk.delimiters[{parentype::quote, doublestatus::opening}]++;
		268	} else {
		269	break;
		270	}
245	}	271	}
246	}
247		272
248	int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;	273	int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
249	if (backtrack != t.length())	274	if (backtrack != t.length())
250	{
251	std::string ending = t.substr(backtrack);
252	std::string suffix;
253
254	for (char c : ending)
255	{	275	{
256	if ((c == '.') \|\| (c == ',') \|\| (c == '?') \|\| (c == '!'))	276	std::string ending = t.substr(backtrack);
		277	std::string suffix;
		278
		279	for (char c : ending)
257	{	280	{
258	suffix += c;	281	if ((c == '.') \|\| (c == ',') \|\| (c == '?') \|\| (c == '!'))
		282	{
		283	suffix += c;
259		284
260	continue;	285	continue;
261	} else if (c == '\n')	286	} else if (c == '\n')
262	{
263	// At least the end is coming
264	if (suffix.empty())
265	{	287	{
266	suffix = ".";	288	// At least the end is coming
267	}	289	if (suffix.empty())
		290	{
		291	suffix = ".";
		292	}
268		293
269	break;	294	break;
270	}	295	}
		296
		297	parentype pt = ([&] {
		298	switch (c)
		299	{
		300	case ']': return parentype::square_bracket;
		301	case ')': return parentype::paren;
		302	case '*': return parentype::asterisk;
		303	case '"': return parentype::quote;
		304	}
		305	})();
271		306
272	parentype pt = ([&] {	307	if (tk.delimiters[{pt, doublestatus::opening}] > 0)
273	switch (c)
274	{	308	{
275	case ']': return parentype::square_bracket;	309	tk.delimiters[{pt, doublestatus::opening}]--;
276	case ')': return parentype::paren;	310	tk.delimiters[{pt, doublestatus::both}]++;
277	case '*': return parentype::asterisk;	311	} else {
278	case '"': return parentype::quote;	312	tk.delimiters[{pt, doublestatus::closing}]++;
279	}	313	}
280	})();
281
282	if (tk.delimiters[{pt, doublestatus::opening}] > 0)
283	{
284	tk.delimiters[{pt, doublestatus::opening}]--;
285	tk.delimiters[{pt, doublestatus::both}]++;
286	} else {
287	tk.delimiters[{pt, doublestatus::closing}]++;
288	}	314	}
289	}
290		315
291	if (suffix == ",")	316	if (suffix == ",")
292	{	317	{
293	tk.suffix = suffixtype::comma;	318	tk.suffix = suffixtype::comma;
294	} else if (!suffix.empty()) {	319	} else if (!suffix.empty()) {
295	tk.suffix = suffixtype::terminating;	320	tk.suffix = suffixtype::terminating;
296		321
297	w.terms.add(suffix);	322	w.terms.add(suffix);
		323	}
298	}	324	}
299	}
300		325
301	tokens.push_back(tk);	326	tokens.push_back(tk);
302	}	327	}
303		328
304	start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);	329	start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
		330	}
		331
		332	startper += _corpora[i].length();
305	}	333	}
306		334
307	std::cout << "\b\b\b\b100%" << std::endl;	335	std::cout << "\b\b\b\b100%" << std::endl;
@@ -420,7 +448,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
420		448
421	kgram klist = it.first;	449	kgram klist = it.first;
422	auto& probtable = it.second;	450	auto& probtable = it.second;
423	auto& distribution = stats[klist];	451	auto& distribution = _stats[klist];
424	int max = 0;	452	int max = 0;
425		453
426	for (auto& kt : probtable)	454	for (auto& kt : probtable)
@@ -432,33 +460,61 @@ kgramstats::kgramstats(std::string corpus, int maxK)
432	}	460	}
433		461
434	std::cout << "\b\b\b\b100%" << std::endl;	462	std::cout << "\b\b\b\b100%" << std::endl;
		463
		464	_compiled = true;
435	}	465	}
436		466
437	void printKgram(kgram k)	467	std::ostream& operator<<(std::ostream& os, rawr::kgram k)
438	{	468	{
439	for (auto& q : k)	469	for (auto& q : k)
440	{	470	{
441	if (q.type == querytype::sentence)	471	os << q << " ";
442	{
443	std::cout << "#.# ";
444	} else if (q.type == querytype::literal)
445	{
446	if (q.tok.suffix == suffixtype::terminating)
447	{
448	std::cout << q.tok.w.canon << ". ";
449	} else if (q.tok.suffix == suffixtype::comma)
450	{
451	std::cout << q.tok.w.canon << ", ";
452	} else {
453	std::cout << q.tok.w.canon << " ";
454	}
455	}
456	}	472	}
		473
		474	return os;
		475	}
		476
		477	std::ostream& operator<<(std::ostream& os, rawr::query q)
		478	{
		479	if (q.type == rawr::querytype::sentence)
		480	{
		481	return os << "#.#";
		482	} else if (q.type == rawr::querytype::literal)
		483	{
		484	return os << q.tok;
		485	}
		486
		487	return os;
		488	}
		489
		490	std::ostream& operator<<(std::ostream& os, rawr::token t)
		491	{
		492	os << t.w.canon;
		493
		494	if (t.suffix == rawr::suffixtype::terminating)
		495	{
		496	return os << ".";
		497	} else if (t.suffix == rawr::suffixtype::comma)
		498	{
		499	return os << ",";
		500	} else {
		501	return os;
		502	}
		503	}
		504
		505	void rawr::setTransformCallback(transform_callback _arg)
		506	{
		507	_transform = _arg;
457	}	508	}
458		509
459	// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus	510	// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
460	std::string kgramstats::randomSentence(int maxL)	511	std::string rawr::randomSentence(int maxL)
461	{	512	{
		513	if (!_compiled)
		514	{
		515	return "";
		516	}
		517
462	std::string result;	518	std::string result;
463	kgram cur(1, wildcardQuery);	519	kgram cur(1, wildcardQuery);
464	int cuts = 0;	520	int cuts = 0;
@@ -466,14 +522,14 @@ std::string kgramstats::randomSentence(int maxL)
466		522
467	for (;;)	523	for (;;)
468	{	524	{
469	if (cur.size() == maxK)	525	if (cur.size() == _maxK)
470	{	526	{
471	cur.pop_front();	527	cur.pop_front();
472	}	528	}
473		529
474	if (cur.size() > 0)	530	if (cur.size() > 0)
475	{	531	{
476	if (rand() % (maxK - cur.size() + 1) == 0)	532	if (rand() % (_maxK - cur.size() + 1) == 0)
477	{	533	{
478	while ((cur.size() > 2) && (cuts > 0))	534	while ((cur.size() > 2) && (cuts > 0))
479	{	535	{
@@ -490,16 +546,22 @@ std::string kgramstats::randomSentence(int maxL)
490		546
491	// Gotta circumvent the last line of the input corpus	547	// Gotta circumvent the last line of the input corpus
492	// https://twitter.com/starla4444/status/684222271339237376	548	// https://twitter.com/starla4444/status/684222271339237376
493	if (stats.count(cur) == 0)	549	if (_stats.count(cur) == 0)
494	{	550	{
495	cur = kgram(1, wildcardQuery);	551	cur = kgram(1, wildcardQuery);
496	}	552	}
497		553
498	auto& distribution = stats[cur];	554	auto& distribution = _stats[cur];
499	int max = distribution.rbegin()->first;	555	int max = distribution.rbegin()->first;
500	int r = rand() % max;	556	int r = rand() % max;
501	token_data& next = distribution.upper_bound(r)->second;	557	token_data& next = distribution.upper_bound(r)->second;
502	std::string nextToken = next.tok.w.forms.next();	558	std::string nextToken = next.tok.w.forms.next();
		559
		560	// Apply user-specified transforms
		561	if (_transform)
		562	{
		563	nextToken = _transform(next.tok.w.canon, nextToken);
		564	}
503		565
504	// Determine the casing of the next token. We randomly make the token all	566	// Determine the casing of the next token. We randomly make the token all
505	// caps based on the markov chain. Otherwise, we check if the previous	567	// caps based on the markov chain. Otherwise, we check if the previous
@@ -600,8 +662,7 @@ std::string kgramstats::randomSentence(int maxL)
600	}	662	}
601		663
602	/* DEBUG */	664	/* DEBUG */
603	printKgram(cur);	665	std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
604	std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
605		666
606	cur.push_back(next.tok);	667	cur.push_back(next.tok);
607		668
@@ -633,29 +694,7 @@ std::string kgramstats::randomSentence(int maxL)
633	open_delimiters.pop();	694	open_delimiters.pop();
634	}	695	}
635		696
636	// Replace old-style freevars while I can't be bothered to remake the corpus yet	697	result.resize(maxL);
637	std::vector<std::string> fv_names;
638	std::ifstream namefile("names.txt");
639	if (namefile.is_open())
640	{
641	while (!namefile.eof())
642	{
643	std::string l;
644	getline(namefile, l);
645	if (l.back() == '\r')
646	{
647	l.pop_back();
648	}
649
650	fv_names.push_back(l);
651	}
652
653	int cpos;
654	while ((cpos = result.find("$name$")) != std::string::npos)
655	{
656	result.replace(cpos, 6, fv_names[rand() % fv_names.size()]);
657	}
658	}
659		698
660	return result;	699	return result;
661	}	700	}