From b316e309559d7176af6cf0bb7dcd6dbaa83c01cd Mon Sep 17 00:00:00 2001
From: Kelly Rauchenberger <fefferburbia@gmail.com>
Date: Fri, 29 Jan 2016 12:43:00 -0500
Subject: Rewrote how tokens are handled

A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags.

Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace them.
---
 CMakeLists.txt |   8 +-
 ebooks.cpp     |  15 +-
 freevars.cpp   |   4 +-
 gen.cpp        |  15 +-
 histogram.cpp  |  34 +++++
 histogram.h    |  19 +++
 kgramstats.cpp | 453 +++++++++++++++++++++++++++++++++++----------------------
 kgramstats.h   | 124 ++++++++--------
 8 files changed, 406 insertions(+), 266 deletions(-)
 create mode 100644 histogram.cpp
 create mode 100644 histogram.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa63a34..41c4552 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,10 +8,14 @@ find_package(curl)
 if (YamlCpp_FOUND AND CURL_FOUND)
   add_subdirectory(vendor/twitcurl/libtwitcurl)
   include_directories(vendor/twitcurl/libtwitcurl)
-  add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp)
+  add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp)
+  set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11)
+  set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON)
   target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES})
 else (YamlCpp_FOUND AND CURL_FOUND)
   message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen")
 endif (YamlCpp_FOUND AND CURL_FOUND)
 
-add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp)
+add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp)
+set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11)
+set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON)
diff --git a/ebooks.cpp b/ebooks.cpp
index e38ebab..ed1e080 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp
@@ -44,20 +44,9 @@ int main(int argc, char** args)
   std::cout << "Generating..." << std::endl;
   for (;;)
   {
-    std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
-    std::string hi;
-    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-    {
-      hi += vars->parse(*it) + " ";
-    }
-
+    std::string doc = stats->randomSentence(rand() % 45 + 5);
+    std::string hi = vars->parse(doc);
     hi.resize(140);
-
-    size_t lastperiod = hi.find_last_of(".!?,");
-    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-    {
-      hi = hi.substr(0, lastperiod+1);
-    }
     
     std::string replyMsg;
     if (twitter.statusUpdate(hi))
diff --git a/freevars.cpp b/freevars.cpp
index 8c3eda4..54c5aab 100644
--- a/freevars.cpp
+++ b/freevars.cpp
@@ -34,8 +34,8 @@ std::string freevars::parse(std::string in)
     for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++)
     {
         std::string tofind = "$" + it->first + "$";
-        size_t fpos = res.find(tofind);
-        if (fpos != std::string::npos)
+        size_t fpos;
+        while ((fpos = res.find(tofind)) != std::string::npos)
         {
             int r = rand() % it->second->size();
             res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos);
diff --git a/gen.cpp b/gen.cpp
index 400c0a5..a0ef8e3 100644
--- a/gen.cpp
+++ b/gen.cpp
@@ -52,21 +52,10 @@ int main(int argc, char** args)
   std::cout << "Generating..." << std::endl;
   for (;;)
   {
-    std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
-    std::string hi;
-    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-    {
-      hi += vars->parse(*it) + " ";
-    }
-    
+    std::string doc = stats->randomSentence(rand() % 35 + 15);
+    std::string hi = vars->parse(doc);
     hi.resize(140);
 
-    size_t lastperiod = hi.find_last_of(".!?,");
-    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-    {
-      hi = hi.substr(0, lastperiod+1);
-    }
-
     std::cout << hi << std::endl;
 		
     getc(stdin);
diff --git a/histogram.cpp b/histogram.cpp
new file mode 100644
index 0000000..6896146
--- /dev/null
+++ b/histogram.cpp
@@ -0,0 +1,34 @@
+#include "histogram.h"
+#include <cstdlib>
+
+template <class T>
+void histogram<T>::add(const T& inst)
+{
+  freqtable[inst]++;
+}
+
+template <class T>
+void histogram<T>::compile()
+{
+  distribution.clear();
+  
+  int max = 0;
+  for (auto& it : freqtable)
+  {
+    max += it.second;
+    distribution.emplace(max, it.first);
+  }
+  
+  freqtable.clear();
+}
+
+template <class T>
+const T& histogram<T>::next() const
+{
+  int max = distribution.rbegin()->first;
+  int r = rand() % max;
+  
+  return distribution.upper_bound(r)->second;
+}
+
+template class histogram <std::string>;
diff --git a/histogram.h b/histogram.h
new file mode 100644
index 0000000..5aa2560
--- /dev/null
+++ b/histogram.h
@@ -0,0 +1,19 @@
+#ifndef HISTOGRAM_H_24094D97
+#define HISTOGRAM_H_24094D97
+
+#include <map>
+#include <string>
+
+template <class T>
+class histogram {
+  public:
+    void add(const T& inst);
+    void compile();
+    const T& next() const;
+    
+  private:
+    std::map<T, int> freqtable;
+    std::map<int, T> distribution;
+};
+
+#endif /* end of include guard: HISTOGRAM_H_24094D97 */
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 4bb7f15..0ab0c99 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -37,35 +37,11 @@
 #include <iostream>
 #include <cstdlib>
 #include <algorithm>
-#include "malaprop.h"
+#include <set>
+#include <stack>
 
-query wildcardQuery(querytype_sentence);
-
-std::string canonize(std::string f);
-
-token token_from_string(std::string in)
-{
-  if (in[0] == '#')
-  {
-    token word(tokentype_hashtag);
-    
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-    
-    return word;
-  } else {
-    token word(canonize(in));
-  
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-    
-    return word;
-  }
-}
+query wildcardQuery {querytype::sentence};
+word blank_word {""};
 
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 {
   this->maxK = maxK;
   
-  std::vector<std::string> tokens;
+  std::vector<token> tokens;
   size_t start = 0;
   int end = 0;
   std::set<std::string> thashtags;
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   {
     end = corpus.find(" ", start);
 
-    std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    if (token[token.length()-1] == '\n')
+    std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+    if (t.compare("") && t.compare("."))
     {
-      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
+      std::string tc(t), canonical;
+      std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
+      std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
+        return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
+      });
+      
+      word& w = ([&] () -> word& {
+        // Hashtag freevar
+        if (canonical[0] == '#')
+        {
+          thashtags.insert(canonical);
+          canonical = "#hashtag";
+          
+          return hashtags;
+        }
+        
+        // Basically any other word
+        if (words.count(canonical) == 0)
+        {
+          words.emplace(canonical, canonical);
+        }
+        
+        word& tw = words.at(canonical);
+        tw.forms.add(canonical);
+        
+        return tw;
+      })();
+      
+      token tk(w);
+      tk.raw = t;
+      
+      for (char c : t)
       {
-        token.insert(token.length()-1, ".");
+        if (c == '*')
+        {
+          tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
+        } else if (c == '[')
+        {
+          tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
+        } else if (c == '(')
+        {
+          tk.delimiters[{parentype::paren, doublestatus::opening}]++;
+        } else if (c == '"')
+        {
+          tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+        } else {
+          break;
+        }
       }
-         
-      token.resize(token.length()-1);
-    }
-       
-    if (token.compare("") && token.compare("."))
-    {
-      mstats.addWord(token);
-      tokens.push_back(token);
       
-      if (token[0] == '#')
+      int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
+      if (backtrack != t.length())
       {
-        thashtags.insert(canonize(token));
+        std::string ending = t.substr(backtrack);
+        std::string suffix;
+        
+        for (char c : ending)
+        {
+          if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+          {
+            suffix += c;
+            
+            continue;
+          } else if (c == '\n')
+          {
+            // A newline ends the token; with no punctuation collected, default the suffix to "."
+            if (suffix.empty())
+            {
+              suffix = ".";
+            }
+            
+            break;
+          }
+          
+          parentype pt = ([&] {
+            switch (c)
+            {
+              case ']': return parentype::square_bracket;
+              case ')': return parentype::paren;
+              case '*': return parentype::asterisk;
+              case '"': return parentype::quote;
+            }
+          })();
+          
+          if (tk.delimiters[{pt, doublestatus::opening}] > 0)
+          {
+            tk.delimiters[{pt, doublestatus::opening}]--;
+            tk.delimiters[{pt, doublestatus::both}]++;
+          } else {
+            tk.delimiters[{pt, doublestatus::closing}]++;
+          }
+        }
+        
+        if (suffix == ",")
+        {
+          tk.suffix = suffixtype::comma;
+        } else if (!suffix.empty()) {
+          tk.suffix = suffixtype::terminating;
+          
+          w.terms.add(suffix);
+        }
       }
+      
+      tokens.push_back(tk);
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
   
-  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  // Compile each word's form and terminator histograms so they can be sampled
+  for (auto& it : words)
   {
-    hashtags.push_back(*it);
+    it.second.forms.compile();
+    it.second.terms.compile();
   }
-	
+  
+  // Hashtag freevar is not frequency distributed
+  for (auto& it : thashtags)
+  {
+    hashtags.forms.add(it);
+  }
+  
+  hashtags.forms.compile();
+  hashtags.terms.compile();
+
+  // kgram distribution
   std::map<kgram, std::map<token, token_data> > tstats;
-  std::map<token, std::map<termstats, int> > tendings;
   for (int k=1; k<maxK; k++)
   {
     for (int i=0; i<(tokens.size() - k); i++)
     {
-      std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
-      kgram prefix;
-      
-      for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
-      {
-        prefix.push_back(token_from_string(*it));
-      }
-      
-      std::string f = tokens[i+k];
-		  std::string canonical = canonize(f);
-      
-      token word(token_from_string(canonical));
-      if (f.find_first_of(".?!,") != std::string::npos)
+      kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
+      token f = tokens[i+k];
+
+      if (tstats[prefix].count(f) == 0)
       {
-        word.terminating = true;
-        
-        char terminator = f[f.find_last_of(".?!,")];
-        int occurrences = std::count(f.begin(), f.end(), terminator);
-        
-        tendings[word][termstats(terminator, occurrences)]++;
+        tstats[prefix].emplace(f, f);
       }
 			
-      token_data& td = tstats[prefix][word];
-      td.word = word;
+      token_data& td = tstats[prefix].at(f);
       td.all++;
 
-      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
       {
         td.uppercase++;
-      } else if (isupper(f[0]))
+      } else if (isupper(f.raw[0]))
       {
         td.titlecase++;
       }
       
-      if (prefix.front().word.terminating)
+      kgram term_prefix;
+      bool changed = false;
+      std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) {
+        if (q.tok.suffix == suffixtype::terminating)
+        {
+          changed = true;
+          
+          return wildcardQuery;
+        } else {
+          return q;
+        }
+      });
+      
+      if (changed)
       {
-        prefix.front() = wildcardQuery;
+        if (tstats[term_prefix].count(f) == 0)
+        {
+          tstats[term_prefix].emplace(f, f);
+        }
         
-        token_data& td2 = tstats[prefix][word];
-        td2.word = word;
+        token_data& td2 = tstats[term_prefix].at(f);
         td2.all++;
 
-        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
         {
           td2.uppercase++;
-        } else if (isupper(f[0]))
+        } else if (isupper(f.raw[0]))
         {
           td2.titlecase++;
         }
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
     }
   }
 	
-  for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
+  // Condense the kgram distribution
+  for (auto& it : tstats)
   {
-    kgram klist = it->first;
-    std::map<token, token_data>& probtable = it->second;
-    std::map<int, token_data>& distribution = stats[klist];
+    kgram klist = it.first;
+    auto& probtable = it.second;
+    auto& distribution = stats[klist];
     int max = 0;
 		
-    for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
+    for (auto& kt : probtable)
     {
-      max += kt->second.all;
+      max += kt.second.all;
 			
-      distribution[max] = kt->second;
-    }
-  }
-  
-  for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
-  {
-    token word = it->first;
-    std::map<termstats, int>& probtable = it->second;
-    std::map<int, termstats>& distribution = endings[word];
-    int max = 0;
-    
-    for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
-    {
-      max += kt->second;
-      
-      distribution[max] = kt->first;
+      distribution.emplace(max, kt.second);
     }
   }
 }
 
 void printKgram(kgram k)
 {
-  for (kgram::iterator it = k.begin(); it != k.end(); it++)
+  for (auto& q : k)
   {
-    query& q = *it;
-    if (q.type == querytype_sentence)
+    if (q.type == querytype::sentence)
     {
       std::cout << "#.# ";
-    } else if (q.type == querytype_literal)
+    } else if (q.type == querytype::literal)
     {
-      if (q.word.type == tokentype_hashtag)
+      if (q.tok.suffix == suffixtype::terminating)
       {
-        if (q.word.terminating)
-        {
-          std::cout << "#hashtag. ";
-        } else {
-          std::cout << "#hashtag ";
-        }
-      } else if (q.word.type == tokentype_literal)
+        std::cout << q.tok.w.canon << ". ";
+      } else if (q.tok.suffix == suffixtype::comma)
       {
-        if (q.word.terminating)
-        {
-          std::cout << q.word.canon << ". ";
-        } else {
-          std::cout << q.word.canon << " ";
-        }
+        std::cout << q.tok.w.canon << ", ";
+      } else {
+        std::cout << q.tok.w.canon << " ";
       }
     }
   }
 }
 
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::vector<std::string> kgramstats::randomSentence(int n)
+std::string kgramstats::randomSentence(int n)
 {
-  std::vector<std::string> result;
+  std::string result;
   kgram cur(1, wildcardQuery);
   int cuts = 0;
+  std::stack<parentype> open_delimiters;
 	
   for (int i=0; i<n; i++)
   {
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
       cur = kgram(1, wildcardQuery);
     }
 
-    std::map<int, token_data>& distribution = stats[cur];
+    auto& distribution = stats[cur];
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken;
-    bool mess = false;
-    
-    if (next.word.type == tokentype_literal)
+    std::string nextToken = next.tok.w.forms.next();
+  
+    // Determine the casing of the next token. We randomly make the token all
+    // caps based on the markov chain. Otherwise, we check if the previous
+    // token is the end of a sentence (terminating token or a wildcard query).
+    int casing = rand() % next.all;
+    if (casing < next.uppercase)
+    {
+      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    } else if ((((cur.rbegin()->type == querytype::sentence)
+          || ((cur.rbegin()->type == querytype::literal)
+            && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
+        && (rand() % 2 > 0))
+      || (casing - next.uppercase < next.titlecase))
     {
-      nextToken = next.word.canon;
+      nextToken[0] = toupper(nextToken[0]);
+    }
     
-      mess = (rand() % 100) == 0;
-      if (mess)
+    // Delimiters
+    for (auto& dt : next.tok.delimiters)
+    {
+      if (dt.first.status == doublestatus::both)
       {
-        nextToken = mstats.alternate(nextToken);
-      }
-    
-      // Determine the casing of the next token. We randomly make the token all
-      // caps based on the markov chain. Otherwise, we check if the previous
-      // token is the end of a sentence (terminating token or a wildcard query).
-      int casing = rand() % next.all;
-      if (casing < next.uppercase)
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break;
+          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break;
+          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break;
+          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break;
+        }
+      } else if (dt.first.status == doublestatus::opening)
       {
-        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-      } else if ((((cur.rbegin()->type == querytype_sentence)
-            || ((cur.rbegin()->type == querytype_literal)
-              && (cur.rbegin()->word.terminating)))
-          && (rand() % 2 > 0))
-        || (casing - next.uppercase < next.titlecase))
+        for (int i=0; i<dt.second; i++)
+        {
+          open_delimiters.push(dt.first.type);
+        }
+        
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break;
+          case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break;
+          case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break;
+          case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break;
+        }
+      } else if (dt.first.status == doublestatus::closing)
       {
-        nextToken[0] = toupper(nextToken[0]);
+        for (int i=0; i<dt.second; i++)
+        {
+          while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type))
+          {
+            switch (open_delimiters.top())
+            {
+              case parentype::paren: nextToken.append(")"); break;
+              case parentype::square_bracket: nextToken.append("]"); break;
+              case parentype::asterisk: nextToken.append("*"); break;
+              case parentype::quote: nextToken.append("\""); break;
+            }
+            
+            open_delimiters.pop();
+          }
+          
+          if (open_delimiters.empty())
+          {
+            switch (dt.first.type)
+            {
+              case parentype::paren: result = "(" + result; break;
+              case parentype::square_bracket: result = "[" + result; break;
+              case parentype::asterisk: result = "*" + result; break;
+              case parentype::quote: result = "\"" + result; break;
+            }
+          }
+          
+          switch (dt.first.type)
+          {
+            case parentype::paren: nextToken.append(")"); break;
+            case parentype::square_bracket: nextToken.append("]"); break;
+            case parentype::asterisk: nextToken.append("*"); break;
+            case parentype::quote: nextToken.append("\""); break;
+          }
+        }
       }
-    } else if (next.word.type == tokentype_hashtag)
-    {
-      int rhash = rand() % hashtags.size();
-      nextToken = hashtags[rhash];
     }
     
-    if (next.word.terminating)
+    // Terminators
+    if (next.tok.suffix == suffixtype::terminating)
     {
-      std::map<int, termstats>& ending = endings[next.word];
-      int emax = ending.rbegin()->first;
-      int er = rand() % emax;
-      termstats& nextend = ending.upper_bound(er)->second;
-    
-      nextToken.append(std::string(nextend.occurrences, nextend.terminator));
+      nextToken.append(next.tok.w.terms.next());
+    } else if (next.tok.suffix == suffixtype::comma)
+    {
+      nextToken.append(",");
     }
 		
     /* DEBUG */
     printKgram(cur);
+    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
+
+    cur.push_back(next.tok);
 		
-    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
+    result.append(nextToken + " ");
     
-    if (mess)
+    if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0))
     {
-      std::cout << " mala " << next.word.canon;
+      break;
     }
-    
-    std::cout << std::endl;
-
-    cur.push_back(next.word);
-		
-    result.push_back(nextToken);
   }
-	
-  return result;
-}
-
-bool removeIf(char c)
-{
-  return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
-}
-
-std::string canonize(std::string f)
-{
-  std::string canonical(f);
-  std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
   
-  std::string result;
-  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
+  // Remove the trailing space
+  if (result.back() == ' ')
+  {
+    result.pop_back();
+  }
+  
+  // Close any open delimiters
+  while (!open_delimiters.empty())
+  {
+    switch (open_delimiters.top())
+    {
+      case parentype::paren: result.append(")"); break;
+      case parentype::square_bracket: result.append("]"); break;
+      case parentype::asterisk: result.append("*"); break;
+      case parentype::quote: result.append("\""); break;
+    }
+    
+    open_delimiters.pop();
+  }
 	
   return result;
 }
diff --git a/kgramstats.h b/kgramstats.h
index ff2fc66..a97d7bf 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -2,61 +2,89 @@
 #include <map>
 #include <list>
 #include <vector>
-#include "malaprop.h"
+#include "histogram.h"
 
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
 
-enum tokentype {
-  tokentype_literal,
-  tokentype_hashtag
+struct word {
+  std::string canon;
+  histogram<std::string> forms;
+  histogram<std::string> terms;
+  
+  word(std::string canon) : canon(canon) {}
+  
+  bool operator<(const word& other) const
+  {
+    return canon < other.canon;
+  }
 };
 
-struct token {
-  tokentype type;
-  std::string canon;
-  bool terminating;
+extern word blank_word;
+
+enum class suffixtype {
+  none,
+  terminating,
+  comma
+};
+
+enum class parentype {
+  paren,
+  square_bracket,
+  asterisk,
+  quote
+};
+
+enum class doublestatus {
+  opening,
+  closing,
+  both
+};
+
+struct delimiter {
+  parentype type;
+  doublestatus status;
+  
+  delimiter(parentype type, doublestatus status) : type(type), status(status) {}
   
-  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
-  token(tokentype type) : type(type), canon(""), terminating(false) {}
+  bool operator<(const delimiter& other) const
+  {
+    return std::tie(type, status) < std::tie(other.type, other.status);
+  }
+};
+
+struct token {
+  const word& w;
+  std::map<delimiter, int> delimiters;
+  suffixtype suffix;
+  std::string raw;
+    
+  token(const word& w) : w(w), suffix(suffixtype::none) {}
   
   bool operator<(const token& other) const
   {
-    if (type != other.type)
-    {
-      return type < other.type;
-    } else if (type == tokentype_literal)
-    {
-      if (canon == other.canon)
-      {
-        return !terminating && other.terminating;
-      } else {
-        return canon < other.canon;
-      }
-    } else {
-      return !terminating && other.terminating;
-    }
+    return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
   }
 };
 
-enum querytype {
-  querytype_literal,
-  querytype_sentence
+enum class querytype {
+  literal,
+  sentence
 };
 
 struct query {
   querytype type;
-  token word;
+  token tok;
   
-  query(token word) : word(word), type(querytype_literal) {}
+  query(token tok) : tok(tok), type(querytype::literal) {}
   
-  query(querytype type) : word(""), type(type) {}
+  query(querytype type) : tok(blank_word), type(type) {}
   
   bool operator<(const query& other) const
   {
     if (type == other.type)
     {
-      return word < other.word;
+      return tok < other.tok;
     } else {
       return type < other.type;
     }
@@ -65,34 +93,11 @@ struct query {
 
 typedef std::list<query> kgram;
 
-struct termstats {
-  char terminator;
-  int occurrences;
-  
-  termstats() : terminator('.'), occurrences(1) {}
-  
-  termstats(char terminator, int occurrences)
-  {
-    this->terminator = terminator;
-    this->occurrences = occurrences;
-  }
-  
-  bool operator<(const termstats& other) const
-  {
-    if (terminator == other.terminator)
-    {
-      return occurrences < other.occurrences;
-    } else {
-      return terminator < other.terminator;
-    }
-  }
-};
-
 class kgramstats
 {
 public:
 	kgramstats(std::string corpus, int maxK);
-	std::vector<std::string> randomSentence(int n);
+	std::string randomSentence(int n);
 	
 private:
 	struct token_data
@@ -100,16 +105,15 @@ private:
 		int all;
 		int titlecase;
 		int uppercase;
-    token word;
+    token tok;
     
-    token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
+    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
 	};
   
 	int maxK;
 	std::map<kgram, std::map<int, token_data> > stats;
-  malaprop mstats;
-  std::map<token, std::map<int, termstats> > endings;
-  std::vector<std::string> hashtags;
+  word hashtags {"#hashtag"};
+  std::map<std::string, word> words;
 };
 
 void printKgram(kgram k);
-- 
cgit 1.4.1