Diffstat (limited to 'kgramstats.cpp')
-rw-r--r--  kgramstats.cpp  453
1 file changed, 277 insertions, 176 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 4bb7f15..0ab0c99 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -37,35 +37,11 @@
 #include <iostream>
 #include <cstdlib>
 #include <algorithm>
-#include "malaprop.h"
+#include <set>
+#include <stack>
 
-query wildcardQuery(querytype_sentence);
-
-std::string canonize(std::string f);
-
-token token_from_string(std::string in)
-{
-  if (in[0] == '#')
-  {
-    token word(tokentype_hashtag);
-
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-
-    return word;
-  } else {
-    token word(canonize(in));
-
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-
-    return word;
-  }
-}
+query wildcardQuery {querytype::sentence};
+word blank_word {""};
 
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 {
   this->maxK = maxK;
 
-  std::vector<std::string> tokens;
+  std::vector<token> tokens;
   size_t start = 0;
   int end = 0;
   std::set<std::string> thashtags;
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   {
     end = corpus.find(" ", start);
 
-    std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    if (token[token.length()-1] == '\n')
+    std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+    if (t.compare("") && t.compare("."))
     {
-      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
+      std::string tc(t), canonical;
+      std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
+      std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
+        return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
+      });
+
+      word& w = ([&] () -> word& {
+        // Hashtag freevar
+        if (canonical[0] == '#')
+        {
+          thashtags.insert(canonical);
+          canonical = "#hashtag";
+
+          return hashtags;
+        }
+
+        // Basically any other word
+        if (words.count(canonical) == 0)
+        {
+          words.emplace(canonical, canonical);
+        }
+
+        word& tw = words.at(canonical);
+        tw.forms.add(canonical);
+
+        return tw;
+      })();
+
+      token tk(w);
+      tk.raw = t;
+
+      for (char c : t)
       {
-        token.insert(token.length()-1, ".");
+        if (c == '*')
+        {
+          tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
+        } else if (c == '[')
+        {
+          tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
+        } else if (c == '(')
+        {
+          tk.delimiters[{parentype::paren, doublestatus::opening}]++;
+        } else if (c == '"')
+        {
+          tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+        } else {
+          break;
+        }
       }
-
-      token.resize(token.length()-1);
-    }
-
-    if (token.compare("") && token.compare("."))
-    {
-      mstats.addWord(token);
-      tokens.push_back(token);
 
-      if (token[0] == '#')
+      int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
+      if (backtrack != t.length())
       {
-        thashtags.insert(canonize(token));
+        std::string ending = t.substr(backtrack);
+        std::string suffix;
+
+        for (char c : ending)
+        {
+          if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+          {
+            suffix += c;
+
+            continue;
+          } else if (c == '\n')
+          {
+            // At least the end is coming
+            if (suffix.empty())
+            {
+              suffix = ".";
+            }
+
+            break;
+          }
+
+          parentype pt = ([&] {
+            switch (c)
+            {
+              case ']': return parentype::square_bracket;
+              case ')': return parentype::paren;
+              case '*': return parentype::asterisk;
+              case '"': return parentype::quote;
+            }
+          })();
+
+          if (tk.delimiters[{pt, doublestatus::opening}] > 0)
+          {
+            tk.delimiters[{pt, doublestatus::opening}]--;
+            tk.delimiters[{pt, doublestatus::both}]++;
+          } else {
+            tk.delimiters[{pt, doublestatus::closing}]++;
+          }
+        }
+
+        if (suffix == ",")
+        {
+          tk.suffix = suffixtype::comma;
+        } else if (!suffix.empty()) {
+          tk.suffix = suffixtype::terminating;
+
+          w.terms.add(suffix);
+        }
       }
+
+      tokens.push_back(tk);
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
 
-  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  // Time to condense the distribution stuff for the words
+  for (auto& it : words)
   {
-    hashtags.push_back(*it);
+    it.second.forms.compile();
+    it.second.terms.compile();
   }
 
+  // Hashtag freevar is not frequency distributed
+  for (auto& it : thashtags)
+  {
+    hashtags.forms.add(it);
+  }
+
+  hashtags.forms.compile();
+  hashtags.terms.compile();
+
+  // kgram distribution
   std::map<kgram, std::map<token, token_data> > tstats;
-  std::map<token, std::map<termstats, int> > tendings;
   for (int k=1; k<maxK; k++)
   {
     for (int i=0; i<(tokens.size() - k); i++)
     {
-      std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
-      kgram prefix;
+      kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
+      token f = tokens[i+k];
 
-      for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
-      {
-        prefix.push_back(token_from_string(*it));
-      }
-
-      std::string f = tokens[i+k];
-      std::string canonical = canonize(f);
-
-      token word(token_from_string(canonical));
-      if (f.find_first_of(".?!,") != std::string::npos)
+      if (tstats[prefix].count(f) == 0)
       {
-        word.terminating = true;
-
-        char terminator = f[f.find_last_of(".?!,")];
-        int occurrences = std::count(f.begin(), f.end(), terminator);
-
-        tendings[word][termstats(terminator, occurrences)]++;
+        tstats[prefix].emplace(f, f);
       }
 
-      token_data& td = tstats[prefix][word];
-      td.word = word;
+      token_data& td = tstats[prefix].at(f);
       td.all++;
 
-      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
       {
         td.uppercase++;
-      } else if (isupper(f[0]))
+      } else if (isupper(f.raw[0]))
       {
         td.titlecase++;
       }
 
-      if (prefix.front().word.terminating)
+      kgram term_prefix;
+      bool changed = false;
+      std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) {
+        if (q.tok.suffix == suffixtype::terminating)
+        {
+          changed = true;
+
+          return wildcardQuery;
+        } else {
+          return q;
+        }
+      });
+
+      if (changed)
       {
-        prefix.front() = wildcardQuery;
+        if (tstats[term_prefix].count(f) == 0)
+        {
+          tstats[term_prefix].emplace(f, f);
+        }
 
-        token_data& td2 = tstats[prefix][word];
-        td2.word = word;
+        token_data& td2 = tstats[term_prefix].at(f);
         td2.all++;
 
-        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
         {
           td2.uppercase++;
-        } else if (isupper(f[0]))
+        } else if (isupper(f.raw[0]))
         {
           td2.titlecase++;
         }
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
     }
   }
 
-  for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
+  // Condense the kgram distribution
+  for (auto& it : tstats)
   {
-    kgram klist = it->first;
-    std::map<token, token_data>& probtable = it->second;
-    std::map<int, token_data>& distribution = stats[klist];
+    kgram klist = it.first;
+    auto& probtable = it.second;
+    auto& distribution = stats[klist];
     int max = 0;
 
-    for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
+    for (auto& kt : probtable)
     {
-      max += kt->second.all;
+      max += kt.second.all;
 
-      distribution[max] = kt->second;
-    }
-  }
-
-  for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
-  {
-    token word = it->first;
-    std::map<termstats, int>& probtable = it->second;
-    std::map<int, termstats>& distribution = endings[word];
-    int max = 0;
-
-    for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
-    {
-      max += kt->second;
-
-      distribution[max] = kt->first;
+      distribution.emplace(max, kt.second);
     }
   }
 }
 
 void printKgram(kgram k)
 {
-  for (kgram::iterator it = k.begin(); it != k.end(); it++)
+  for (auto& q : k)
   {
-    query& q = *it;
-    if (q.type == querytype_sentence)
+    if (q.type == querytype::sentence)
     {
       std::cout << "#.# ";
-    } else if (q.type == querytype_literal)
+    } else if (q.type == querytype::literal)
     {
-      if (q.word.type == tokentype_hashtag)
+      if (q.tok.suffix == suffixtype::terminating)
       {
-        if (q.word.terminating)
-        {
-          std::cout << "#hashtag. ";
-        } else {
-          std::cout << "#hashtag ";
-        }
-      } else if (q.word.type == tokentype_literal)
+        std::cout << q.tok.w.canon << ". ";
+      } else if (q.tok.suffix == suffixtype::comma)
       {
-        if (q.word.terminating)
-        {
-          std::cout << q.word.canon << ". ";
-        } else {
-          std::cout << q.word.canon << " ";
-        }
+        std::cout << q.tok.w.canon << ", ";
+      } else {
+        std::cout << q.tok.w.canon << " ";
       }
     }
   }
 }
 
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::vector<std::string> kgramstats::randomSentence(int n)
+std::string kgramstats::randomSentence(int n)
 {
-  std::vector<std::string> result;
+  std::string result;
   kgram cur(1, wildcardQuery);
   int cuts = 0;
+  std::stack<parentype> open_delimiters;
 
   for (int i=0; i<n; i++)
   {
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
       cur = kgram(1, wildcardQuery);
     }
 
-    std::map<int, token_data>& distribution = stats[cur];
+    auto& distribution = stats[cur];
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken;
-    bool mess = false;
+    std::string nextToken = next.tok.w.forms.next();
 
-    if (next.word.type == tokentype_literal)
+    // Determine the casing of the next token. We randomly make the token all
+    // caps based on the markov chain. Otherwise, we check if the previous
+    // token is the end of a sentence (terminating token or a wildcard query).
+    int casing = rand() % next.all;
+    if (casing < next.uppercase)
+    {
+      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    } else if ((((cur.rbegin()->type == querytype::sentence)
+        || ((cur.rbegin()->type == querytype::literal)
+          && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
+        && (rand() % 2 > 0))
+      || (casing - next.uppercase < next.titlecase))
     {
-      nextToken = next.word.canon;
+      nextToken[0] = toupper(nextToken[0]);
+    }
 
-      mess = (rand() % 100) == 0;
-      if (mess)
+    // Delimiters
+    for (auto& dt : next.tok.delimiters)
+    {
+      if (dt.first.status == doublestatus::both)
       {
-        nextToken = mstats.alternate(nextToken);
-      }
-
-      // Determine the casing of the next token. We randomly make the token all
-      // caps based on the markov chain. Otherwise, we check if the previous
-      // token is the end of a sentence (terminating token or a wildcard query).
-      int casing = rand() % next.all;
-      if (casing < next.uppercase)
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break;
+          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break;
+          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break;
+          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break;
+        }
+      } else if (dt.first.status == doublestatus::opening)
       {
-        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-      } else if ((((cur.rbegin()->type == querytype_sentence)
-          || ((cur.rbegin()->type == querytype_literal)
-            && (cur.rbegin()->word.terminating)))
-          && (rand() % 2 > 0))
-        || (casing - next.uppercase < next.titlecase))
+        for (int i=0; i<dt.second; i++)
+        {
+          open_delimiters.push(dt.first.type);
+        }
+
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break;
+          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break;
+          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break;
+          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break;
+        }
+      } else if (dt.first.status == doublestatus::closing)
       {
-        nextToken[0] = toupper(nextToken[0]);
+        for (int i=0; i<dt.second; i++)
+        {
+          while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type))
+          {
+            switch (open_delimiters.top())
+            {
+              case parentype::paren: nextToken.append(")"); break;
+              case parentype::square_bracket: nextToken.append("]"); break;
+              case parentype::asterisk: nextToken.append("*"); break;
+              case parentype::quote: nextToken.append("\""); break;
+            }
+
+            open_delimiters.pop();
+          }
+
+          if (open_delimiters.empty())
+          {
+            switch (dt.first.type)
+            {
+              case parentype::paren: result = "(" + result; break;
+              case parentype::square_bracket: result = "[" + result; break;
+              case parentype::asterisk: result = "*" + result; break;
+              case parentype::quote: result = "\"" + result; break;
+            }
+          }
+
+          switch (dt.first.type)
+          {
+            case parentype::paren: nextToken.append(")"); break;
+            case parentype::square_bracket: nextToken.append("]"); break;
+            case parentype::asterisk: nextToken.append("*"); break;
+            case parentype::quote: nextToken.append("\""); break;
+          }
+        }
       }
-    } else if (next.word.type == tokentype_hashtag)
-    {
-      int rhash = rand() % hashtags.size();
-      nextToken = hashtags[rhash];
     }
 
-    if (next.word.terminating)
+    // Terminators
+    if (next.tok.suffix == suffixtype::terminating)
     {
-      std::map<int, termstats>& ending = endings[next.word];
-      int emax = ending.rbegin()->first;
-      int er = rand() % emax;
-      termstats& nextend = ending.upper_bound(er)->second;
-
-      nextToken.append(std::string(nextend.occurrences, nextend.terminator));
+      nextToken.append(next.tok.w.terms.next());
+    } else if (next.tok.suffix == suffixtype::comma)
+    {
+      nextToken.append(",");
     }
 
     /* DEBUG */
     printKgram(cur);
+    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
+
+    cur.push_back(next.tok);
 
-    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
+    result.append(nextToken + " ");
 
-    if (mess)
+    if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0))
     {
-      std::cout << " mala " << next.word.canon;
+      break;
    }
-
-    std::cout << std::endl;
-
-    cur.push_back(next.word);
-
-    result.push_back(nextToken);
   }
-
-  return result;
-}
-
-bool removeIf(char c)
-{
-  return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
-}
-
-std::string canonize(std::string f)
-{
-  std::string canonical(f);
-  std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
 
-  std::string result;
-  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
+  // Remove the trailing space
+  if (result.back() == ' ')
+  {
+    result.pop_back();
+  }
+
+  // Close any open delimiters
+  while (!open_delimiters.empty())
+  {
+    switch (open_delimiters.top())
+    {
+      case parentype::paren: result.append(")"); break;
+      case parentype::square_bracket: result.append("]"); break;
+      case parentype::asterisk: result.append("*"); break;
+      case parentype::quote: result.append("\""); break;
+    }
+
+    open_delimiters.pop();
+  }
 
   return result;
 }
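
The condensation loops in this patch fold each per-kgram frequency table into a std::map keyed by the running total of the counts, so that a weighted random draw in randomSentence is a single upper_bound lookup; that lookup is where the O(n log t) bound in the comment comes from. A minimal standalone sketch of the same technique, assuming nothing from the patch (compile_distribution and sample are hypothetical names):

#include <cstdlib>
#include <iostream>
#include <map>
#include <string>

// Fold item weights into a cumulative map: key = running total of the
// weights seen so far, value = the item that pushed the total there.
std::map<int, std::string> compile_distribution(const std::map<std::string, int>& weights)
{
  std::map<int, std::string> cumulative;
  int max = 0;

  for (const auto& entry : weights)
  {
    max += entry.second;
    cumulative.emplace(max, entry.first);
  }

  return cumulative;
}

// Draw r uniformly from [0, total); the first key strictly greater than r
// identifies the chosen item, making each draw O(log n).
const std::string& sample(const std::map<int, std::string>& cumulative)
{
  int r = rand() % cumulative.rbegin()->first;

  return cumulative.upper_bound(r)->second;
}

int main()
{
  auto dist = compile_distribution({{"the", 6}, {"a", 3}, {"an", 1}});

  // "the" should come out roughly 60% of the time.
  for (int i = 0; i < 5; i++)
  {
    std::cout << sample(dist) << std::endl;
  }
}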
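randomSentence tracks delimiters with a std::stack<parentype>: each opening delimiter on an emitted token is pushed, a closing one pops back to its match, and the final while loop closes whatever is still open once the sentence ends, innermost first. A toy sketch of that final unwinding, with plain chars standing in for parentype (the sample string is invented for illustration):

#include <iostream>
#include <stack>
#include <string>

int main()
{
  std::stack<char> open_delimiters;
  std::string result = "(so [it \"goes";

  // Suppose the generated tokens opened '(', '[', and '"' without closing
  // them; push the matching closers in opening order.
  open_delimiters.push(')');
  open_delimiters.push(']');
  open_delimiters.push('"');

  // Unwind the stack so the most recently opened delimiter closes first.
  while (!open_delimiters.empty())
  {
    result += open_delimiters.top();
    open_delimiters.pop();
  }

  std::cout << result << std::endl; // prints: (so [it "goes"])
}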
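The tokenizer canonicalizes each raw token by lowercasing it and stripping delimiter characters with a double-negated remove_copy_if predicate. That predicate amounts to a membership test against the delimiter set; a sketch of the more direct reading (canonize_sketch is a hypothetical name, not part of the patch):

#include <algorithm>
#include <cctype>
#include <cstring>
#include <iostream>
#include <iterator>
#include <string>

std::string canonize_sketch(std::string in)
{
  // Lowercase in place, as the patch does with std::transform.
  std::transform(in.begin(), in.end(), in.begin(), ::tolower);

  // Copy everything except the delimiter characters the patch's lambda rejects.
  std::string out;
  std::remove_copy_if(in.begin(), in.end(), std::back_inserter(out),
    [] (char c) {
      return std::strchr(".?!,\"()[]*\n", c) != nullptr;
    });

  return out;
}

int main()
{
  std::cout << canonize_sketch("\"Hello,\"") << std::endl; // prints: hello
}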