about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-29 12:43:00 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-29 12:43:00 -0500
commit	b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch)
tree	f21bd883ef7c4255a91d096ea105feaad135ee52 /kgramstats.cpp
parent	fd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff)
downloadrawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip
Rewrote how tokens are handled
A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags.

Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace it.
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r--kgramstats.cpp453
1 files changed, 277 insertions, 176 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 4bb7f15..0ab0c99 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -37,35 +37,11 @@
37#include <iostream> 37#include <iostream>
38#include <cstdlib> 38#include <cstdlib>
39#include <algorithm> 39#include <algorithm>
40#include "malaprop.h" 40#include <set>
41#include <stack>
41 42
42query wildcardQuery(querytype_sentence); 43query wildcardQuery {querytype::sentence};
43 44word blank_word {""};
44std::string canonize(std::string f);
45
46token token_from_string(std::string in)
47{
48 if (in[0] == '#')
49 {
50 token word(tokentype_hashtag);
51
52 if (in.find_first_of(".?!,") != std::string::npos)
53 {
54 word.terminating = true;
55 }
56
57 return word;
58 } else {
59 token word(canonize(in));
60
61 if (in.find_first_of(".?!,") != std::string::npos)
62 {
63 word.terminating = true;
64 }
65
66 return word;
67 }
68}
69 45
70// runs in O(t^2) time where t is the number of tokens in the input corpus 46// runs in O(t^2) time where t is the number of tokens in the input corpus
71// We consider maxK to be fairly constant 47// We consider maxK to be fairly constant
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
73{ 49{
74 this->maxK = maxK; 50 this->maxK = maxK;
75 51
76 std::vector<std::string> tokens; 52 std::vector<token> tokens;
77 size_t start = 0; 53 size_t start = 0;
78 int end = 0; 54 int end = 0;
79 std::set<std::string> thashtags; 55 std::set<std::string> thashtags;
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
82 { 58 {
83 end = corpus.find(" ", start); 59 end = corpus.find(" ", start);
84 60
85 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 61 std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
86 if (token[token.length()-1] == '\n') 62 if (t.compare("") && t.compare("."))
87 { 63 {
88 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) 64 std::string tc(t), canonical;
65 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
66 std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
67 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
68 });
69
70 word& w = ([&] () -> word& {
71 // Hashtag freevar
72 if (canonical[0] == '#')
73 {
74 thashtags.insert(canonical);
75 canonical = "#hashtag";
76
77 return hashtags;
78 }
79
80 // Basically any other word
81 if (words.count(canonical) == 0)
82 {
83 words.emplace(canonical, canonical);
84 }
85
86 word& tw = words.at(canonical);
87 tw.forms.add(canonical);
88
89 return tw;
90 })();
91
92 token tk(w);
93 tk.raw = t;
94
95 for (char c : t)
89 { 96 {
90 token.insert(token.length()-1, "."); 97 if (c == '*')
98 {
99 tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
100 } else if (c == '[')
101 {
102 tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
103 } else if (c == '(')
104 {
105 tk.delimiters[{parentype::paren, doublestatus::opening}]++;
106 } else if (c == '"')
107 {
108 tk.delimiters[{parentype::quote, doublestatus::opening}]++;
109 } else {
110 break;
111 }
91 } 112 }
92
93 token.resize(token.length()-1);
94 }
95
96 if (token.compare("") && token.compare("."))
97 {
98 mstats.addWord(token);
99 tokens.push_back(token);
100 113
101 if (token[0] == '#') 114 int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
115 if (backtrack != t.length())
102 { 116 {
103 thashtags.insert(canonize(token)); 117 std::string ending = t.substr(backtrack);
118 std::string suffix;
119
120 for (char c : ending)
121 {
122 if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
123 {
124 suffix += c;
125
126 continue;
127 } else if (c == '\n')
128 {
129 // At least the end is coming
130 if (suffix.empty())
131 {
132 suffix = ".";
133 }
134
135 break;
136 }
137
138 parentype pt = ([&] {
139 switch (c)
140 {
141 case ']': return parentype::square_bracket;
142 case ')': return parentype::paren;
143 case '*': return parentype::asterisk;
144 case '"': return parentype::quote;
145 }
146 })();
147
148 if (tk.delimiters[{pt, doublestatus::opening}] > 0)
149 {
150 tk.delimiters[{pt, doublestatus::opening}]--;
151 tk.delimiters[{pt, doublestatus::both}]++;
152 } else {
153 tk.delimiters[{pt, doublestatus::closing}]++;
154 }
155 }
156
157 if (suffix == ",")
158 {
159 tk.suffix = suffixtype::comma;
160 } else if (!suffix.empty()) {
161 tk.suffix = suffixtype::terminating;
162
163 w.terms.add(suffix);
164 }
104 } 165 }
166
167 tokens.push_back(tk);
105 } 168 }
106 169
107 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 170 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
108 } 171 }
109 172
110 for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) 173 // Time to condense the distribution stuff for the words
174 for (auto& it : words)
111 { 175 {
112 hashtags.push_back(*it); 176 it.second.forms.compile();
177 it.second.terms.compile();
113 } 178 }
114 179
180 // Hashtag freevar is not frequency distributed
181 for (auto& it : thashtags)
182 {
183 hashtags.forms.add(it);
184 }
185
186 hashtags.forms.compile();
187 hashtags.terms.compile();
188
189 // kgram distribution
115 std::map<kgram, std::map<token, token_data> > tstats; 190 std::map<kgram, std::map<token, token_data> > tstats;
116 std::map<token, std::map<termstats, int> > tendings;
117 for (int k=1; k<maxK; k++) 191 for (int k=1; k<maxK; k++)
118 { 192 {
119 for (int i=0; i<(tokens.size() - k); i++) 193 for (int i=0; i<(tokens.size() - k); i++)
120 { 194 {
121 std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); 195 kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
122 kgram prefix; 196 token f = tokens[i+k];
123 197
124 for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) 198 if (tstats[prefix].count(f) == 0)
125 {
126 prefix.push_back(token_from_string(*it));
127 }
128
129 std::string f = tokens[i+k];
130 std::string canonical = canonize(f);
131
132 token word(token_from_string(canonical));
133 if (f.find_first_of(".?!,") != std::string::npos)
134 { 199 {
135 word.terminating = true; 200 tstats[prefix].emplace(f, f);
136
137 char terminator = f[f.find_last_of(".?!,")];
138 int occurrences = std::count(f.begin(), f.end(), terminator);
139
140 tendings[word][termstats(terminator, occurrences)]++;
141 } 201 }
142 202
143 token_data& td = tstats[prefix][word]; 203 token_data& td = tstats[prefix].at(f);
144 td.word = word;
145 td.all++; 204 td.all++;
146 205
147 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 206 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
148 { 207 {
149 td.uppercase++; 208 td.uppercase++;
150 } else if (isupper(f[0])) 209 } else if (isupper(f.raw[0]))
151 { 210 {
152 td.titlecase++; 211 td.titlecase++;
153 } 212 }
154 213
155 if (prefix.front().word.terminating) 214 kgram term_prefix;
215 bool changed = false;
216 std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) {
217 if (q.tok.suffix == suffixtype::terminating)
218 {
219 changed = true;
220
221 return wildcardQuery;
222 } else {
223 return q;
224 }
225 });
226
227 if (changed)
156 { 228 {
157 prefix.front() = wildcardQuery; 229 if (tstats[term_prefix].count(f) == 0)
230 {
231 tstats[term_prefix].emplace(f, f);
232 }
158 233
159 token_data& td2 = tstats[prefix][word]; 234 token_data& td2 = tstats[term_prefix].at(f);
160 td2.word = word;
161 td2.all++; 235 td2.all++;
162 236
163 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 237 if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
164 { 238 {
165 td2.uppercase++; 239 td2.uppercase++;
166 } else if (isupper(f[0])) 240 } else if (isupper(f.raw[0]))
167 { 241 {
168 td2.titlecase++; 242 td2.titlecase++;
169 } 243 }
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
171 } 245 }
172 } 246 }
173 247
174 for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) 248 // Condense the kgram distribution
249 for (auto& it : tstats)
175 { 250 {
176 kgram klist = it->first; 251 kgram klist = it.first;
177 std::map<token, token_data>& probtable = it->second; 252 auto& probtable = it.second;
178 std::map<int, token_data>& distribution = stats[klist]; 253 auto& distribution = stats[klist];
179 int max = 0; 254 int max = 0;
180 255
181 for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) 256 for (auto& kt : probtable)
182 { 257 {
183 max += kt->second.all; 258 max += kt.second.all;
184 259
185 distribution[max] = kt->second; 260 distribution.emplace(max, kt.second);
186 }
187 }
188
189 for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
190 {
191 token word = it->first;
192 std::map<termstats, int>& probtable = it->second;
193 std::map<int, termstats>& distribution = endings[word];
194 int max = 0;
195
196 for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
197 {
198 max += kt->second;
199
200 distribution[max] = kt->first;
201 } 261 }
202 } 262 }
203} 263}
204 264
205void printKgram(kgram k) 265void printKgram(kgram k)
206{ 266{
207 for (kgram::iterator it = k.begin(); it != k.end(); it++) 267 for (auto& q : k)
208 { 268 {
209 query& q = *it; 269 if (q.type == querytype::sentence)
210 if (q.type == querytype_sentence)
211 { 270 {
212 std::cout << "#.# "; 271 std::cout << "#.# ";
213 } else if (q.type == querytype_literal) 272 } else if (q.type == querytype::literal)
214 { 273 {
215 if (q.word.type == tokentype_hashtag) 274 if (q.tok.suffix == suffixtype::terminating)
216 { 275 {
217 if (q.word.terminating) 276 std::cout << q.tok.w.canon << ". ";
218 { 277 } else if (q.tok.suffix == suffixtype::comma)
219 std::cout << "#hashtag. ";
220 } else {
221 std::cout << "#hashtag ";
222 }
223 } else if (q.word.type == tokentype_literal)
224 { 278 {
225 if (q.word.terminating) 279 std::cout << q.tok.w.canon << ", ";
226 { 280 } else {
227 std::cout << q.word.canon << ". "; 281 std::cout << q.tok.w.canon << " ";
228 } else {
229 std::cout << q.word.canon << " ";
230 }
231 } 282 }
232 } 283 }
233 } 284 }
234} 285}
235 286
236// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 287// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
237std::vector<std::string> kgramstats::randomSentence(int n) 288std::string kgramstats::randomSentence(int n)
238{ 289{
239 std::vector<std::string> result; 290 std::string result;
240 kgram cur(1, wildcardQuery); 291 kgram cur(1, wildcardQuery);
241 int cuts = 0; 292 int cuts = 0;
293 std::stack<parentype> open_delimiters;
242 294
243 for (int i=0; i<n; i++) 295 for (int i=0; i<n; i++)
244 { 296 {
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
273 cur = kgram(1, wildcardQuery); 325 cur = kgram(1, wildcardQuery);
274 } 326 }
275 327
276 std::map<int, token_data>& distribution = stats[cur]; 328 auto& distribution = stats[cur];
277 int max = distribution.rbegin()->first; 329 int max = distribution.rbegin()->first;
278 int r = rand() % max; 330 int r = rand() % max;
279 token_data& next = distribution.upper_bound(r)->second; 331 token_data& next = distribution.upper_bound(r)->second;
280 std::string nextToken; 332 std::string nextToken = next.tok.w.forms.next();
281 bool mess = false; 333
282 334 // Determine the casing of the next token. We randomly make the token all
283 if (next.word.type == tokentype_literal) 335 // caps based on the markov chain. Otherwise, we check if the previous
336 // token is the end of a sentence (terminating token or a wildcard query).
337 int casing = rand() % next.all;
338 if (casing < next.uppercase)
339 {
340 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
341 } else if ((((cur.rbegin()->type == querytype::sentence)
342 || ((cur.rbegin()->type == querytype::literal)
343 && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
344 && (rand() % 2 > 0))
345 || (casing - next.uppercase < next.titlecase))
284 { 346 {
285 nextToken = next.word.canon; 347 nextToken[0] = toupper(nextToken[0]);
348 }
286 349
287 mess = (rand() % 100) == 0; 350 // Delimiters
288 if (mess) 351 for (auto& dt : next.tok.delimiters)
352 {
353 if (dt.first.status == doublestatus::both)
289 { 354 {
290 nextToken = mstats.alternate(nextToken); 355 switch (dt.first.type)
291 } 356 {
292 357 case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break;
293 // Determine the casing of the next token. We randomly make the token all 358 case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break;
294 // caps based on the markov chain. Otherwise, we check if the previous 359 case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break;
295 // token is the end of a sentence (terminating token or a wildcard query). 360 case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break;
296 int casing = rand() % next.all; 361 }
297 if (casing < next.uppercase) 362 } else if (dt.first.status == doublestatus::opening)
298 { 363 {
299 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 364 for (int i=0; i<dt.second; i++)
300 } else if ((((cur.rbegin()->type == querytype_sentence) 365 {
301 || ((cur.rbegin()->type == querytype_literal) 366 open_delimiters.push(dt.first.type);
302 && (cur.rbegin()->word.terminating))) 367 }
303 && (rand() % 2 > 0)) 368
304 || (casing - next.uppercase < next.titlecase)) 369 switch (dt.first.type)
370 {
371 case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break;
372 case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break;
373 case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break;
374 case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break;
375 }
376 } else if (dt.first.status == doublestatus::closing)
305 { 377 {
306 nextToken[0] = toupper(nextToken[0]); 378 for (int i=0; i<dt.second; i++)
379 {
380 while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type))
381 {
382 switch (open_delimiters.top())
383 {
384 case parentype::paren: nextToken.append(")"); break;
385 case parentype::square_bracket: nextToken.append("]"); break;
386 case parentype::asterisk: nextToken.append("*"); break;
387 case parentype::quote: nextToken.append("\""); break;
388 }
389
390 open_delimiters.pop();
391 }
392
393 if (open_delimiters.empty())
394 {
395 switch (dt.first.type)
396 {
397 case parentype::paren: result = "(" + result; break;
398 case parentype::square_bracket: result = "[" + result; break;
399 case parentype::asterisk: result = "*" + result; break;
400 case parentype::quote: result = "\"" + result; break;
401 }
402 }
403
404 switch (dt.first.type)
405 {
406 case parentype::paren: nextToken.append(")"); break;
407 case parentype::square_bracket: nextToken.append("]"); break;
408 case parentype::asterisk: nextToken.append("*"); break;
409 case parentype::quote: nextToken.append("\""); break;
410 }
411 }
307 } 412 }
308 } else if (next.word.type == tokentype_hashtag)
309 {
310 int rhash = rand() % hashtags.size();
311 nextToken = hashtags[rhash];
312 } 413 }
313 414
314 if (next.word.terminating) 415 // Terminators
416 if (next.tok.suffix == suffixtype::terminating)
315 { 417 {
316 std::map<int, termstats>& ending = endings[next.word]; 418 nextToken.append(next.tok.w.terms.next());
317 int emax = ending.rbegin()->first; 419 } else if (next.tok.suffix == suffixtype::comma)
318 int er = rand() % emax; 420 {
319 termstats& nextend = ending.upper_bound(er)->second; 421 nextToken.append(",");
320
321 nextToken.append(std::string(nextend.occurrences, nextend.terminator));
322 } 422 }
323 423
324 /* DEBUG */ 424 /* DEBUG */
325 printKgram(cur); 425 printKgram(cur);
426 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
427
428 cur.push_back(next.tok);
326 429
327 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; 430 result.append(nextToken + " ");
328 431
329 if (mess) 432 if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0))
330 { 433 {
331 std::cout << " mala " << next.word.canon; 434 break;
332 } 435 }
333
334 std::cout << std::endl;
335
336 cur.push_back(next.word);
337
338 result.push_back(nextToken);
339 } 436 }
340
341 return result;
342}
343
344bool removeIf(char c)
345{
346 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
347}
348
349std::string canonize(std::string f)
350{
351 std::string canonical(f);
352 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
353 437
354 std::string result; 438 // Remove the trailing space
355 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 439 if (result.back() == ' ')
440 {
441 result.pop_back();
442 }
443
444 // Close any open delimiters
445 while (!open_delimiters.empty())
446 {
447 switch (open_delimiters.top())
448 {
449 case parentype::paren: result.append(")"); break;
450 case parentype::square_bracket: result.append("]"); break;
451 case parentype::asterisk: result.append("*"); break;
452 case parentype::quote: result.append("\""); break;
453 }
454
455 open_delimiters.pop();
456 }
356 457
357 return result; 458 return result;
358} 459}