8 files changed, 406 insertions, 266 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index aa63a34..41c4552 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt

@@ -8,10 +8,14 @@ find_package(curl)
 if (YamlCpp_FOUND AND CURL_FOUND)
  add_subdirectory(vendor/twitcurl/libtwitcurl)
  include_directories(vendor/twitcurl/libtwitcurl)
-  add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp)
+  add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp)
+  set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11)
+  set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON)
  target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES})
 else (YamlCpp_FOUND AND CURL_FOUND)
  message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen")
 endif (YamlCpp_FOUND AND CURL_FOUND)
-add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp)
+add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp)
+set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11)
+set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON)
diff --git a/ebooks.cpp b/ebooks.cpp
index e38ebab..ed1e080 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp

@@ -44,20 +44,9 @@ int main(int argc, char** args)
  std::cout << "Generating..." << std::endl;
  for (;;)
  {
-    std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
+    std::string doc = stats->randomSentence(rand() % 45 + 5);
-    std::string hi;
+    std::string hi = vars->parse(doc);
-    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-    {
-      hi += vars->parse(*it) + " ";
-    }
    hi.resize(140);
-    size_t lastperiod = hi.find_last_of(".!?,");
-    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-    {
-      hi = hi.substr(0, lastperiod+1);
-    }
    
    std::string replyMsg;
    if (twitter.statusUpdate(hi))
diff --git a/freevars.cpp b/freevars.cpp
index 8c3eda4..54c5aab 100644
--- a/freevars.cpp
+++ b/freevars.cpp

@@ -34,8 +34,8 @@ std::string freevars::parse(std::string in)
    for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++)
    {
        std::string tofind = "$" + it->first + "$";
-        size_t fpos = res.find(tofind);
+        size_t fpos;
-        if (fpos != std::string::npos)
+        while ((fpos = res.find(tofind)) != std::string::npos)
        {
            int r = rand() % it->second->size();
            res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos);
diff --git a/gen.cpp b/gen.cpp
index 400c0a5..a0ef8e3 100644
--- a/gen.cpp
+++ b/gen.cpp

@@ -52,21 +52,10 @@ int main(int argc, char** args)
  std::cout << "Generating..." << std::endl;
  for (;;)
  {
-    std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
+    std::string doc = stats->randomSentence(rand() % 35 + 15);
-    std::string hi;
+    std::string hi = vars->parse(doc);
-    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-    {
-      hi += vars->parse(*it) + " ";
-    }
-    
    hi.resize(140);
-    size_t lastperiod = hi.find_last_of(".!?,");
-    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-    {
-      hi = hi.substr(0, lastperiod+1);
-    }
    std::cout << hi << std::endl;
                
    getc(stdin);
diff --git a/histogram.cpp b/histogram.cpp
new file mode 100644
index 0000000..6896146
--- /dev/null
+++ b/histogram.cpp

@@ -0,0 +1,34 @@
+#include "histogram.h"
+#include <cstdlib>
+// Record one more occurrence of `inst` in the pending frequency table.
+// std::map::operator[] creates the entry with a zero count on first use.
+template <class T>
+void histogram<T>::add(const T& inst)
+{
+  ++freqtable[inst];
+}
+// Rebuild `distribution` from the counts gathered by add(): every value is
+// stored under the running total of the counts up to and including its own,
+// following freqtable's iteration order. The frequency table is emptied
+// afterwards.
+template <class T>
+void histogram<T>::compile()
+{
+  distribution.clear();
+
+  int running_total = 0;
+  for (const auto& entry : freqtable)
+  {
+    running_total += entry.second;
+    distribution.emplace(running_total, entry.first);
+  }
+
+  freqtable.clear();
+}
+// Draw a value at random, weighted by the counts that were compiled into
+// `distribution`. Precondition: `distribution` is non-empty; otherwise
+// rbegin() is dereferenced on an empty map.
+template <class T>
+const T& histogram<T>::next() const
+{
+  // The largest key is the sum of all compiled counts.
+  const int total = distribution.rbegin()->first;
+  const int roll = rand() % total;
+
+  // Pick the first entry whose running total exceeds the roll.
+  return distribution.upper_bound(roll)->second;
+}
+template class histogram <std::string>;
diff --git a/histogram.h b/histogram.h
new file mode 100644
index 0000000..5aa2560
--- /dev/null
+++ b/histogram.h

@@ -0,0 +1,19 @@
+#ifndef HISTOGRAM_H_24094D97
+#define HISTOGRAM_H_24094D97
+#include <map>
+#include <string>
+// Frequency-weighted random picker over values of type T.
+// The member functions are defined in histogram.cpp, which explicitly
+// instantiates the template for std::string only.
+template <class T>
+class histogram {
+  public:
+    // Count one occurrence of inst.
+    void add(const T& inst);
+    // Turn the collected counts into the cumulative table read by next(),
+    // then clear the counts.
+    void compile();
+    // Return a random value weighted by its compiled count. The compiled
+    // table must not be empty.
+    const T& next() const;
+    
+  private:
+    // value -> number of add() calls since the last compile()
+    std::map<T, int> freqtable;
+    // running total of counts -> value; filled by compile()
+    std::map<int, T> distribution;
+};
+#endif /* end of include guard: HISTOGRAM_H_24094D97 */
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 4bb7f15..0ab0c99 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -37,35 +37,11 @@
 #include <iostream>
 #include <cstdlib>
 #include <algorithm>
-#include "malaprop.h"
+#include <set>
+#include <stack>
-query wildcardQuery(querytype_sentence);
+query wildcardQuery {querytype::sentence};
+word blank_word {""};
-std::string canonize(std::string f);
-token token_from_string(std::string in)
-{
-  if (in[0] == '#')
-  {
-    token word(tokentype_hashtag);
-    
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-    
-    return word;
-  } else {
-    token word(canonize(in));
-  
-    if (in.find_first_of(".?!,") != std::string::npos)
-    {
-      word.terminating = true;
-    }
-    
-    return word;
-  }
-}
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 {
  this->maxK = maxK;
  
-  std::vector<std::string> tokens;
+  std::vector<token> tokens;
  size_t start = 0;
  int end = 0;
  std::set<std::string> thashtags;
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK)
  {
    end = corpus.find(" ", start);
-    std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+    std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    if (token[token.length()-1] == '\n')
+    if (t.compare("") && t.compare("."))
    {
-      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
+      std::string tc(t), canonical;
+      std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
+      std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
+        return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
+      });
+      
+      word& w = ([&] () -> word& {
+        // Hashtag freevar
+        if (canonical[0] == '#')
+        {
+          thashtags.insert(canonical);
+          canonical = "#hashtag";
+          
+          return hashtags;
+        }
+        
+        // Basically any other word
+        if (words.count(canonical) == 0)
+        {
+          words.emplace(canonical, canonical);
+        }
+        
+        word& tw = words.at(canonical);
+        tw.forms.add(canonical);
+        
+        return tw;
+      })();
+      
+      token tk(w);
+      tk.raw = t;
+      
+      for (char c : t)
      {
-        token.insert(token.length()-1, ".");
+        if (c == '*')
+        {
+          tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
+        } else if (c == '[')
+        {
+          tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
+        } else if (c == '(')
+        {
+          tk.delimiters[{parentype::paren, doublestatus::opening}]++;
+        } else if (c == '"')
+        {
+          tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+        } else {
+          break;
+        }
      }
-         
-      token.resize(token.length()-1);
-    }
-       
-    if (token.compare("") && token.compare("."))
-    {
-      mstats.addWord(token);
-      tokens.push_back(token);
      
-      if (token[0] == '#')
+      int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
+      if (backtrack != t.length())
      {
-        thashtags.insert(canonize(token));
+        std::string ending = t.substr(backtrack);
+        std::string suffix;
+        
+        for (char c : ending)
+        {
+          if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+          {
+            suffix += c;
+            
+            continue;
+          } else if (c == '\n')
+          {
+            // At least the end is coming
+            if (suffix.empty())
+            {
+              suffix = ".";
+            }
+            
+            break;
+          }
+          
+          parentype pt = ([&] {
+            switch (c)
+            {
+              case ']': return parentype::square_bracket;
+              case ')': return parentype::paren;
+              case '*': return parentype::asterisk;
+              case '"': return parentype::quote;
+            }
+          })();
+          
+          if (tk.delimiters[{pt, doublestatus::opening}] > 0)
+          {
+            tk.delimiters[{pt, doublestatus::opening}]--;
+            tk.delimiters[{pt, doublestatus::both}]++;
+          } else {
+            tk.delimiters[{pt, doublestatus::closing}]++;
+          }
+        }
+        
+        if (suffix == ",")
+        {
+          tk.suffix = suffixtype::comma;
+        } else if (!suffix.empty()) {
+          tk.suffix = suffixtype::terminating;
+          
+          w.terms.add(suffix);
+        }
      }
+      
+      tokens.push_back(tk);
    }
    start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
  }
  
-  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  // Time to condense the distribution stuff for the words
+  for (auto& it : words)
  {
-    hashtags.push_back(*it);
+    it.second.forms.compile();
+    it.second.terms.compile();
  }
-        
+  
+  // Hashtag freevar is not frequency distributed
+  for (auto& it : thashtags)
+  {
+    hashtags.forms.add(it);
+  }
+  
+  hashtags.forms.compile();
+  hashtags.terms.compile();
+  // kgram distribution
  std::map<kgram, std::map<token, token_data> > tstats;
-  std::map<token, std::map<termstats, int> > tendings;
  for (int k=1; k<maxK; k++)
  {
    for (int i=0; i<(tokens.size() - k); i++)
    {
-      std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
+      kgram prefix(tokens.begin()+i, tokens.begin()+i+k);
-      kgram prefix;
+      token f = tokens[i+k];
-      
-      for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
+      if (tstats[prefix].count(f) == 0)
-      {
-        prefix.push_back(token_from_string(*it));
-      }
-      
-      std::string f = tokens[i+k];
-                  std::string canonical = canonize(f);
-      
-      token word(token_from_string(canonical));
-      if (f.find_first_of(".?!,") != std::string::npos)
      {
-        word.terminating = true;
+        tstats[prefix].emplace(f, f);
-        
-        char terminator = f[f.find_last_of(".?!,")];
-        int occurrences = std::count(f.begin(), f.end(), terminator);
-        
-        tendings[word][termstats(terminator, occurrences)]++;
      }
                        
-      token_data& td = tstats[prefix][word];
+      token_data& td = tstats[prefix].at(f);
-      td.word = word;
      td.all++;
-      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+      if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
      {
        td.uppercase++;
-      } else if (isupper(f[0]))
+      } else if (isupper(f.raw[0]))
      {
        td.titlecase++;
      }
      
-      if (prefix.front().word.terminating)
+      kgram term_prefix;
+      bool changed = false;
+      std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) {
+        if (q.tok.suffix == suffixtype::terminating)
+        {
+          changed = true;
+          
+          return wildcardQuery;
+        } else {
+          return q;
+        }
+      });
+      
+      if (changed)
      {
-        prefix.front() = wildcardQuery;
+        if (tstats[term_prefix].count(f) == 0)
+        {
+          tstats[term_prefix].emplace(f, f);
+        }
        
-        token_data& td2 = tstats[prefix][word];
+        token_data& td2 = tstats[term_prefix].at(f);
-        td2.word = word;
        td2.all++;
-        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
+        if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end())
        {
          td2.uppercase++;
-        } else if (isupper(f[0]))
+        } else if (isupper(f.raw[0]))
        {
          td2.titlecase++;
        }
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK)
    }
  }
        
-  for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
+  // Condense the kgram distribution
+  for (auto& it : tstats)
  {
-    kgram klist = it->first;
+    kgram klist = it.first;
-    std::map<token, token_data>& probtable = it->second;
+    auto& probtable = it.second;
-    std::map<int, token_data>& distribution = stats[klist];
+    auto& distribution = stats[klist];
    int max = 0;
                
-    for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
+    for (auto& kt : probtable)
    {
-      max += kt->second.all;
+      max += kt.second.all;
                        
-      distribution[max] = kt->second;
+      distribution.emplace(max, kt.second);
-    }
-  }
-  
-  for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
-  {
-    token word = it->first;
-    std::map<termstats, int>& probtable = it->second;
-    std::map<int, termstats>& distribution = endings[word];
-    int max = 0;
-    
-    for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
-    {
-      max += kt->second;
-      
-      distribution[max] = kt->first;
    }
  }
 }
+// Debug helper: writes each query of k to std::cout on the current line
+// (no newline is emitted). A sentence wildcard prints as "#.# "; a literal
+// prints its word's canonical form followed by ". ", ", " or " " depending
+// on the token's suffix type.
 void printKgram(kgram k)
 {
-  for (kgram::iterator it = k.begin(); it != k.end(); it++)
+  for (auto& q : k)
   {
-    query& q = *it;
+    if (q.type == querytype::sentence)
-    if (q.type == querytype_sentence)
     {
       std::cout << "#.# ";
-    } else if (q.type == querytype_literal)
+    } else if (q.type == querytype::literal)
     {
-      if (q.word.type == tokentype_hashtag)
+      // A "." stands in for any terminating suffix; the actual punctuation
+      // is not printed here.
+      if (q.tok.suffix == suffixtype::terminating)
       {
-        if (q.word.terminating)
+        std::cout << q.tok.w.canon << ". ";
-        {
+      } else if (q.tok.suffix == suffixtype::comma)
-          std::cout << "#hashtag. ";
-        } else {
-          std::cout << "#hashtag ";
-        }
-      } else if (q.word.type == tokentype_literal)
       {
-        if (q.word.terminating)
+        std::cout << q.tok.w.canon << ", ";
-        {
+      } else {
-          std::cout << q.word.canon << ". ";
+        std::cout << q.tok.w.canon << " ";
-        } else {
-          std::cout << q.word.canon << " ";
-        }
       }
     }
   }
 }
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::vector<std::string> kgramstats::randomSentence(int n)
+std::string kgramstats::randomSentence(int n)
 {
-  std::vector<std::string> result;
+  std::string result;
  kgram cur(1, wildcardQuery);
  int cuts = 0;
+  std::stack<parentype> open_delimiters;
        
  for (int i=0; i<n; i++)
  {
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n)
      cur = kgram(1, wildcardQuery);
    }
-    std::map<int, token_data>& distribution = stats[cur];
+    auto& distribution = stats[cur];
    int max = distribution.rbegin()->first;
    int r = rand() % max;
    token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken;
+    std::string nextToken = next.tok.w.forms.next();
-    bool mess = false;
+  
-    
+    // Determine the casing of the next token. We randomly make the token all
-    if (next.word.type == tokentype_literal)
+    // caps based on the markov chain. Otherwise, we check if the previous
+    // token is the end of a sentence (terminating token or a wildcard query).
+    int casing = rand() % next.all;
+    if (casing < next.uppercase)
+    {
+      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    } else if ((((cur.rbegin()->type == querytype::sentence)
+          || ((cur.rbegin()->type == querytype::literal)
+            && (cur.rbegin()->tok.suffix == suffixtype::terminating)))
+        && (rand() % 2 > 0))
+      || (casing - next.uppercase < next.titlecase))
    {
-      nextToken = next.word.canon;
+      nextToken[0] = toupper(nextToken[0]);
+    }
    
-      mess = (rand() % 100) == 0;
+    // Delimiters
-      if (mess)
+    // Apply the delimiters recorded for this token. dt.first is the
+    // (type, status) pair and dt.second is how many times it occurred.
+    for (auto& dt : next.tok.delimiters)
+    {
+      if (dt.first.status == doublestatus::both)
       {
-        nextToken = mstats.alternate(nextToken);
+        // Wrap the token in dt.second copies of the delimiter pair. This needs
+        // the (count, char) constructor: std::string("(", n) copies the first
+        // n bytes of the literal instead, embedding its '\0' for n == 2 and
+        // reading out of bounds for anything larger.
+        switch (dt.first.type)
-      }
+        {
-    
+          case parentype::paren: nextToken = std::string(dt.second, '(') + nextToken + std::string(dt.second, ')'); break;
-      // Determine the casing of the next token. We randomly make the token all
+          case parentype::square_bracket: nextToken = std::string(dt.second, '[') + nextToken + std::string(dt.second, ']'); break;
-      // caps based on the markov chain. Otherwise, we check if the previous
+          case parentype::asterisk: nextToken = std::string(dt.second, '*') + nextToken + std::string(dt.second, '*'); break;
-      // token is the end of a sentence (terminating token or a wildcard query).
+          case parentype::quote: nextToken = std::string(dt.second, '"') + nextToken + std::string(dt.second, '"'); break;
-      int casing = rand() % next.all;
+        }
-      if (casing < next.uppercase)
+      } else if (dt.first.status == doublestatus::opening)
       {
-        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+        // Remember every opened delimiter so it can be closed later
+        for (int i=0; i<dt.second; i++)
-      } else if ((((cur.rbegin()->type == querytype_sentence)
+        {
-            || ((cur.rbegin()->type == querytype_literal)
+          open_delimiters.push(dt.first.type);
-              && (cur.rbegin()->word.terminating)))
+        }
-          && (rand() % 2 > 0))
+        
-        || (casing - next.uppercase < next.titlecase))
+        switch (dt.first.type)
+        {
+          case parentype::paren: nextToken = std::string(dt.second, '(') + nextToken; break;
+          case parentype::square_bracket: nextToken = std::string(dt.second, '[') + nextToken; break;
+          case parentype::asterisk: nextToken = std::string(dt.second, '*') + nextToken; break;
+          case parentype::quote: nextToken = std::string(dt.second, '"') + nextToken; break;
+        }
+      } else if (dt.first.status == doublestatus::closing)
       {
-        nextToken[0] = toupper(nextToken[0]);
+        for (int i=0; i<dt.second; i++)
+        {
+          // Close everything that was opened after the delimiter being closed
+          while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type))
+          {
+            switch (open_delimiters.top())
+            {
+              case parentype::paren: nextToken.append(")"); break;
+              case parentype::square_bracket: nextToken.append("]"); break;
+              case parentype::asterisk: nextToken.append("*"); break;
+              case parentype::quote: nextToken.append("\""); break;
+            }
+            
+            open_delimiters.pop();
+          }
+          
+          if (open_delimiters.empty())
+          {
+            // Nothing to match against: open the delimiter at the very start
+            // of the sentence generated so far
+            switch (dt.first.type)
+            {
+              case parentype::paren: result = "(" + result; break;
+              case parentype::square_bracket: result = "[" + result; break;
+              case parentype::asterisk: result = "*" + result; break;
+              case parentype::quote: result = "\"" + result; break;
+            }
+          } else {
+            // The matching opener is on top of the stack and is closed just
+            // below; drop it so it is not closed a second time when the
+            // remaining open delimiters are flushed at the end of the sentence
+            open_delimiters.pop();
+          }
+          
+          switch (dt.first.type)
+          {
+            case parentype::paren: nextToken.append(")"); break;
+            case parentype::square_bracket: nextToken.append("]"); break;
+            case parentype::asterisk: nextToken.append("*"); break;
+            case parentype::quote: nextToken.append("\""); break;
+          }
+        }
       }
-    } else if (next.word.type == tokentype_hashtag)
-    {
-      int rhash = rand() % hashtags.size();
-      nextToken = hashtags[rhash];
     }
    
-    if (next.word.terminating)
+    // Terminators
+    if (next.tok.suffix == suffixtype::terminating)
    {
-      std::map<int, termstats>& ending = endings[next.word];
+      nextToken.append(next.tok.w.terms.next());
-      int emax = ending.rbegin()->first;
+    } else if (next.tok.suffix == suffixtype::comma)
-      int er = rand() % emax;
+    {
-      termstats& nextend = ending.upper_bound(er)->second;
+      nextToken.append(",");
-    
-      nextToken.append(std::string(nextend.occurrences, nextend.terminator));
    }
                
    /* DEBUG */
    printKgram(cur);
+    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
+    cur.push_back(next.tok);
                
-    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
+    result.append(nextToken + " ");
    
-    if (mess)
+    if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0))
    {
-      std::cout << " mala " << next.word.canon;
+      break;
    }
-    
-    std::cout << std::endl;
-    cur.push_back(next.word);
-                
-    result.push_back(nextToken);
  }
-        
-  return result;
-}
-bool removeIf(char c)
-{
-  return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
-}
-std::string canonize(std::string f)
-{
-  std::string canonical(f);
-  std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
  
-  std::string result;
+  // Remove the trailing space. The emptiness check matters when no token
+  // was appended: back() on an empty string is undefined behaviour.
-  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
+  if (!result.empty() && (result.back() == ' '))
+  {
+    result.pop_back();
+  }
+  
+  // Close any open delimiters
+  while (!open_delimiters.empty())
+  {
+    switch (open_delimiters.top())
+    {
+      case parentype::paren: result.append(")"); break;
+      case parentype::square_bracket: result.append("]"); break;
+      case parentype::asterisk: result.append("*"); break;
+      case parentype::quote: result.append("\""); break;
+    }
+    
+    open_delimiters.pop();
+  }
        
  return result;
 }
diff --git a/kgramstats.h b/kgramstats.h
index ff2fc66..a97d7bf 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -2,61 +2,89 @@
 #include <map>
 #include <list>
 #include <vector>
-#include "malaprop.h"
+#include <string>
+#include <tuple>
+#include "histogram.h"
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
-enum tokentype {
+// A canonical word together with the histograms consulted when it is
+// emitted in a generated sentence.
+struct word {
-  tokentype_literal,
+  // Canonical spelling; the only field operator< compares.
+  std::string canon;
-  tokentype_hashtag
+  // Surface forms to emit for this word (drawn with forms.next()).
+  histogram<std::string> forms;
+  // Terminating punctuation strings recorded after this word
+  // (drawn with terms.next() when a token has a terminating suffix).
+  histogram<std::string> terms;
+  
+  word(std::string canon) : canon(canon) {}
+  
+  // Orders words by canonical spelling only.
+  bool operator<(const word& other) const
+  {
+    return canon < other.canon;
+  }
 };
-struct token {
+extern word blank_word;
-  tokentype type;
-  std::string canon;
+// Kind of trailing punctuation recorded for a token.
+enum class suffixtype {
-  bool terminating;
+  // no trailing punctuation recorded (the token constructor's default)
+  none,
+  // a non-empty run of . , ? ! other than a lone comma, or a newline
+  // (recorded as ".")
+  terminating,
+  // the trailing punctuation was exactly ","
+  comma
+};
+// Paired delimiters tracked around tokens.
+enum class parentype {
+  // ( and )
+  paren,
+  // [ and ]
+  square_bracket,
+  // * on either side
+  asterisk,
+  // " on either side
+  quote
+};
+// Which side(s) of a delimiter pair appear on a single token.
+enum class doublestatus {
+  // the delimiter is opened on this token
+  opening,
+  // the delimiter is closed on this token
+  closing,
+  // the delimiter is both opened and closed on this token
+  both
+};
+// Key type for token::delimiters: a delimiter kind plus the side(s) on which
+// it occurs.
+struct delimiter {
+  parentype type;
+  doublestatus status;
+  
+  delimiter(parentype type, doublestatus status) : type(type), status(status) {}
  
-  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
+  // Lexicographic order on (type, status), so delimiter can be a std::map key.
+  bool operator<(const delimiter& other) const
-  token(tokentype type) : type(type), canon(""), terminating(false) {}
+  {
+    return std::tie(type, status) < std::tie(other.type, other.status);
+  }
+};
+// One corpus token: a reference to its word plus the punctuation and
+// delimiters that surrounded it.
+struct token {
+  // Referenced, not owned: the word must outlive every token (and every copy
+  // of a token) that refers to it.
+  const word& w;
+  // delimiter (type, status) -> number of occurrences on this token
+  std::map<delimiter, int> delimiters;
+  suffixtype suffix;
+  // Token text as split from the corpus; not part of the ordering below.
+  std::string raw;
+    
+  token(const word& w) : w(w), suffix(suffixtype::none) {}
  
+  // Orders by (word, delimiters, suffix); the word is compared through
+  // word::operator<, i.e. by canonical spelling.
  bool operator<(const token& other) const
  {
-    if (type != other.type)
+    return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
-    {
-      return type < other.type;
-    } else if (type == tokentype_literal)
-    {
-      if (canon == other.canon)
-      {
-        return !terminating && other.terminating;
-      } else {
-        return canon < other.canon;
-      }
-    } else {
-      return !terminating && other.terminating;
-    }
  }
 };
-enum querytype {
+enum class querytype {
-  querytype_literal,
+  literal,
-  querytype_sentence
+  sentence
 };
 struct query {
  querytype type;
-  token word;
+  token tok;
  
-  query(token word) : word(word), type(querytype_literal) {}
+  query(token tok) : tok(tok), type(querytype::literal) {}
  
-  query(querytype type) : word(""), type(type) {}
+  query(querytype type) : tok(blank_word), type(type) {}
  
  bool operator<(const query& other) const
  {
    if (type == other.type)
    {
-      return word < other.word;
+      return tok < other.tok;
    } else {
      return type < other.type;
    }
@@ -65,34 +93,11 @@ struct query {
 typedef std::list<query> kgram;
-struct termstats {
-  char terminator;
-  int occurrences;
-  
-  termstats() : terminator('.'), occurrences(1) {}
-  
-  termstats(char terminator, int occurrences)
-  {
-    this->terminator = terminator;
-    this->occurrences = occurrences;
-  }
-  
-  bool operator<(const termstats& other) const
-  {
-    if (terminator == other.terminator)
-    {
-      return occurrences < other.occurrences;
-    } else {
-      return terminator < other.terminator;
-    }
-  }
-};
 class kgramstats
 {
 public:
        kgramstats(std::string corpus, int maxK);
-        std::vector<std::string> randomSentence(int n);
+        std::string randomSentence(int n);
        
 private:
        struct token_data
@@ -100,16 +105,15 @@ private:
                int all;
                int titlecase;
                int uppercase;
-    token word;
+    token tok;
    
-    token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
+    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
        };
  
        int maxK;
        std::map<kgram, std::map<int, token_data> > stats;
-  malaprop mstats;
+  word hashtags {"#hashtag"};
-  std::map<token, std::map<int, termstats> > endings;
+  std::map<std::string, word> words;
-  std::vector<std::string> hashtags;
 };
 void printKgram(kgram k);