8 files changed, 443 insertions, 374 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ab1979f..a3f51af 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt

@@ -12,12 +12,17 @@ include_directories(vendor/yaml-cpp/include)
 find_package(ASPELL REQUIRED)
 include_directories(${ASPELL_INCLUDE_DIR})
-add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp)
+add_library(rawr kgramstats.cpp histogram.cpp prefix_search.cpp)
+set_property(TARGET rawr PROPERTY CXX_STANDARD 11)
+set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON)
+target_link_libraries(rawr ${ASPELL_LIBRARIES})
+add_executable(rawr-ebooks ebooks.cpp)
 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11)
 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON)
-target_link_libraries(rawr-ebooks yaml-cpp twitter++ curlcpp curl ${ASPELL_LIBRARIES} pthread)
+target_link_libraries(rawr-ebooks rawr yaml-cpp twitter++ curlcpp curl pthread)
-add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp)
+add_executable(rawr-gen gen.cpp)
 set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11)
 set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON)
-target_link_libraries(rawr-gen ${ASPELL_LIBRARIES})
+target_link_libraries(rawr-gen rawr)
diff --git a/ebooks.cpp b/ebooks.cpp
index aa690c2..c01cdc9 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp

@@ -39,9 +39,41 @@ int main(int argc, char** args)
    
    corpus += line + "\n ";
  }
+  
+  // Replace old-style freevars while I can't be bothered to remake the corpus yet.
+  // names.txt supplies one replacement name per line; if it cannot be opened
+  // fv_names simply stays empty.
+  std::vector<std::string> fv_names;
+  std::ifstream namefile("names.txt");
+  if (namefile.is_open())
+  {
+    std::string l;
+    
+    // Loop on the read itself: testing eof() before reading appends a phantom
+    // empty entry once the final newline has been consumed.
+    while (getline(namefile, l))
+    {
+      // Strip the CR of CRLF line endings. The emptiness check matters because
+      // back() on an empty string is undefined behaviour.
+      if (!l.empty() && (l.back() == '\r'))
+      {
+        l.pop_back();
+      }
+      
+      // A blank name would substitute nothing for $name$, so skip it
+      if (!l.empty())
+      {
+        fv_names.push_back(l);
+      }
+    }
+  }
+  
+  namefile.close();
  std::cout << "Preprocessing corpus..." << std::endl;
-  kgramstats* stats = new kgramstats(corpus, 4);
+  rawr kgramstats;
+  kgramstats.addCorpus(corpus);
+  kgramstats.compile(4);
+  // The callback is given the token's canonical form and the surface form
+  // chosen for output. Names are substituted into the surface form, which is
+  // then returned; returning the canonical form instead would throw away the
+  // chosen form for every token (each emoticon would come out as the
+  // emoticons word's canon, for example).
+  kgramstats.setTransformCallback([&] (std::string, std::string form) {
+    // With no names loaded (names.txt missing or empty) leave $name$ alone
+    // rather than evaluating rand() % 0.
+    if (!fv_names.empty())
+    {
+      size_t pos = 0;
+      while ((pos = form.find("$name$", pos)) != std::string::npos)
+      {
+        const std::string& name = fv_names[rand() % fv_names.size()];
+        form.replace(pos, 6, name);
+        
+        // Resume after the inserted name so a name containing $name$ cannot loop forever
+        pos += name.length();
+      }
+    }
+    
+    return form;
+  });
+  
  std::mutex stats_mutex;
  
  client.setUserStreamNotifyCallback([&] (twitter::notification n) {
@@ -60,7 +92,7 @@ int main(int argc, char** args)
          std::string doc = "@" + n.getTweet().getAuthor().getScreenName() + " ";
          {
            std::lock_guard<std::mutex> stats_lock(stats_mutex);
-            doc += stats->randomSentence(140 - doc.length());
+            doc += kgramstats.randomSentence(140 - doc.length());
            doc.resize(140);
          }
        
@@ -84,7 +116,7 @@ int main(int argc, char** args)
    std::string doc;
    {
      std::lock_guard<std::mutex> stats_lock(stats_mutex);
-      doc = stats->randomSentence(140);
+      doc = kgramstats.randomSentence(140);
    }
    doc.resize(140);
    
diff --git a/freevars.cpp b/freevars.cpp
deleted file mode 100644
index 4429d00..0000000
--- a/freevars.cpp
+++ /dev/null

@@ -1,32 +0,0 @@
-#include "freevars.h"
-#include <fstream>
-#include "kgramstats.h"
-freevar::freevar(word& w, std::string file) : w(w)
-{
-  std::ifstream infile(file);
-  if (infile)
-  {
-    std::string line;
-    while (getline(infile, line))
-    {
-      instances.insert(line);
-      w.forms.add(line);
-    }
-  }
-}
-bool freevar::check(std::string f) const
-{
-  return (instances.count(f) == 1);
-}
-void freevar::add(std::string f)
-{
-  instances.insert(f);
-}
-word& freevar::getWord()
-{
-  return w;
-}
diff --git a/freevars.h b/freevars.h
deleted file mode 100644
index f800220..0000000
--- a/freevars.h
+++ /dev/null

@@ -1,22 +0,0 @@
-#include <string>
-#include <set>
-#ifndef FREEVARS_H
-#define FREEVARS_H
-class word;
-class freevar
-{
-  public:
-    freevar(word& w, std::string file);
-    bool check(std::string f) const;
-    void add(std::string f);
-    word& getWord();
-    
-  private:
-    word& w;
-    std::set<std::string> instances;
-};
-#endif
-\ No newline at end of file
diff --git a/gen.cpp b/gen.cpp
index 0319283..eba0277 100644
--- a/gen.cpp
+++ b/gen.cpp

@@ -44,18 +44,48 @@ int main(int argc, char** args)
    
    corpus += line + "\n ";
  }
+  
+  // Replace old-style freevars while I can't be bothered to remake the corpus yet.
+  // names.txt supplies one replacement name per line; if it cannot be opened
+  // fv_names simply stays empty.
+  std::vector<std::string> fv_names;
+  std::ifstream namefile("names.txt");
+  if (namefile.is_open())
+  {
+    std::string l;
+    
+    // Loop on the read itself: testing eof() before reading appends a phantom
+    // empty entry once the final newline has been consumed.
+    while (getline(namefile, l))
+    {
+      // Strip the CR of CRLF line endings. The emptiness check matters because
+      // back() on an empty string is undefined behaviour.
+      if (!l.empty() && (l.back() == '\r'))
+      {
+        l.pop_back();
+      }
+      
+      // A blank name would substitute nothing for $name$, so skip it
+      if (!l.empty())
+      {
+        fv_names.push_back(l);
+      }
+    }
+  }
+  
+  namefile.close();
        
  std::cout << "Preprocessing corpus..." << std::endl;
-  kgramstats* stats = new kgramstats(corpus, 4);
+  rawr kgramstats;
+  kgramstats.addCorpus(corpus);
+  kgramstats.compile(4);
+  // The callback is given the token's canonical form and the surface form
+  // chosen for output. Names are substituted into the surface form, which is
+  // then returned; returning the canonical form instead would throw away the
+  // chosen form for every token (each emoticon would come out as the
+  // emoticons word's canon, for example).
+  kgramstats.setTransformCallback([&] (std::string, std::string form) {
+    // With no names loaded (names.txt missing or empty) leave $name$ alone
+    // rather than evaluating rand() % 0.
+    if (!fv_names.empty())
+    {
+      size_t pos = 0;
+      while ((pos = form.find("$name$", pos)) != std::string::npos)
+      {
+        const std::string& name = fv_names[rand() % fv_names.size()];
+        form.replace(pos, 6, name);
+        
+        // Resume after the inserted name so a name containing $name$ cannot loop forever
+        pos += name.length();
+      }
+    }
+    
+    return form;
+  });
    
  std::cout << "Generating..." << std::endl;
  for (;;)
  {
-    std::string doc = stats->randomSentence(140);
+    std::string doc = kgramstats.randomSentence(140);
-    std::string hi = doc;
+    // Truncate only: resize() on a shorter sentence would pad it with NUL
+    // characters that then get written to the terminal.
+    if (doc.length() > 140)
+    {
+      doc.resize(140);
+    }
-    hi.resize(140);
-    std::cout << hi << std::endl;
+    std::cout << doc << std::endl;
                
    getc(stdin);
  }
diff --git a/kgramstats.cpp b/kgramstats.cpp
index a44bf2b..47f3bc0 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -33,32 +33,47 @@
 //  
 #include "kgramstats.h"
-#include <vector>
 #include <iostream>
-#include <cstdlib>
 #include <cstring>
 #include <algorithm>
 #include <set>
 #include <stack>
-#include "freevars.h"
-#include <fstream>
 #include "prefix_search.h"
 #include <aspell.h>
+#include <fstream>
+const rawr::query rawr::wildcardQuery = {querytype::sentence};
+const rawr::word rawr::blank_word = {""};
-query wildcardQuery {querytype::sentence};
+void rawr::addCorpus(std::string corpus)
-word blank_word {""};
+{
+  _corpora.push_back(corpus);
+}
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
-kgramstats::kgramstats(std::string corpus, int maxK)
+void rawr::compile(int maxK)
 {
-  this->maxK = maxK;
+  _maxK = maxK;
  
  std::vector<token> tokens;
  size_t start = 0;
-  int end = 0;
  std::set<std::string> thashtags;
-  freevar fv_emoticons {emoticons, "emoticons.txt"};
+  std::set<std::string> fv_emoticons;
+  
+  std::ifstream fvefile("emoticons.txt");
+  if (fvefile)
+  {
+    std::string line;
+    while (getline(fvefile, line))
+    {
+      fv_emoticons.insert(line);
+      emoticons.forms.add(line);
+    }
+  }
+  
+  fvefile.close();
+  
  std::map<std::string, std::string> canonical_form;
  
  AspellConfig* spell_config = new_aspell_config();
@@ -92,216 +107,229 @@ kgramstats::kgramstats(std::string corpus, int maxK)
  }
  std::cout << "Tokenizing corpus...   0%" << std::flush;
-  int len = corpus.length();
+  // Combined length of every corpus; the progress percentage is scaled by it
+  int len = 0;
+  for (const auto& c : _corpora)
+  {
+    len += c.length();
+  }
+  
+  // If every corpus is empty the percentage calculation would divide by zero
+  if (len == 0)
+  {
+    len = 1;
+  }
+  
+  int startper = 0;
  int per = 0;
  int perprime = 0;
  std::cout.fill(' ');
-  while (end != std::string::npos)
+  for (int i = 0; i < _corpora.size(); i++)
  {
-    perprime = end * 100 / len;
+    // Every corpus is tokenized from its own first character. start is
+    // declared outside this loop and is left at npos when the previous corpus
+    // finishes, which would make substr() throw std::out_of_range here.
+    start = 0;
+    int end = 0;
-    if (perprime != per)
+    
+    while (end != std::string::npos)
    {
-      per = perprime;
+      perprime = (startper + end) * 100 / len;
+      if (perprime != per)
+      {
+        per = perprime;
      
-      std::cout << "\b\b\b\b" << std::right;
+        std::cout << "\b\b\b\b" << std::right;
-      std::cout.width(3);
+        std::cout.width(3);
-      std::cout << per << "%" << std::flush;
+        std::cout << per << "%" << std::flush;
-    }
+      }
    
-    end = corpus.find(" ", start);
+      end = _corpora[i].find(" ", start);
-    bool emoji = false;
+      bool emoji = false;
-    std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+      std::string te = _corpora[i].substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-    std::string t = "";
+      std::string t = "";
    
-    if (te.compare("") && te.compare("."))
+      if (te.compare("") && te.compare("."))
-    {
-      // Extract strings of emojis into their own tokens even if they're not space delimited
-      int m = emojis.match(te);
-      emoji = m > 0;
-      if (m == 0) m = 1;
-      t = te.substr(0,m);
-      te = te.substr(m);
-      
-      while (!te.empty())
      {
-        m = emojis.match(te);
+        // Extract strings of emojis into their own tokens even if they're not space delimited
-        if (emoji == (m > 0))
+        int m = emojis.match(te);
+        emoji = m > 0;
+        if (m == 0) m = 1;
+        t = te.substr(0,m);
+        te = te.substr(m);
+      
+        while (!te.empty())
        {
-          if (m == 0) m = 1;
+          m = emojis.match(te);
-          t += te.substr(0,m);
+          if (emoji == (m > 0))
-          te = te.substr(m);
+          {
-        } else {
+            if (m == 0) m = 1;
-          end = start + t.length() - 1;
+            t += te.substr(0,m);
-          break;
+            te = te.substr(m);
+          } else {
+            end = start + t.length() - 1;
+            break;
+          }
        }
-      }
      
-      std::string tc(t);
+        std::string tc(t);
-      std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
+        std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
-      int pst = tc.find_first_not_of("\"([*");
+        int pst = tc.find_first_not_of("\"([*");
-      int dst = tc.find_last_not_of("\")]*.,?!\n");
+        int dst = tc.find_last_not_of("\")]*.,?!\n");
-      std::string canonical("");
+        std::string canonical("");
-      if ((pst != std::string::npos) && (dst != std::string::npos))
+        if ((pst != std::string::npos) && (dst != std::string::npos))
-      {
-        canonical = std::string(tc, pst, dst - pst + 1);
-      }
-      
-      word& w = ([&] () -> word& {
-        // Hashtag freevar
-        if (canonical[0] == '#')
        {
-          thashtags.insert(canonical);
+          canonical = std::string(tc, pst, dst - pst + 1);
-          
-          return hashtags;
        }
-        
+      
-        // Emoticon freevar
+        word& w = ([&] () -> word& {
-        if (emoji)
+          // Hashtag freevar
-        {
+          if (canonical[0] == '#')
-          emoticons.forms.add(canonical);
+          {
+            thashtags.insert(canonical);
          
-          return emoticons;
+            return hashtags;
-        }
+          }
        
-        if ((pst != std::string::npos) && (dst != std::string::npos))
+          // Emoticon freevar
-        {
+          if (emoji)
-          std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
-          if (fv_emoticons.check(emoticon_canon))
          {
-            emoticons.forms.add(emoticon_canon);
+            emoticons.forms.add(canonical);
          
            return emoticons;
          }
-        }
        
-        // Basically any other word
+          if ((pst != std::string::npos) && (dst != std::string::npos))
-        if (canonical_form.count(canonical) == 0)
-        {
-          if (
-            // Legacy freevars should be distinct from tokens containing similar words
-            (canonical.find("$name$") != std::string::npos)
-            // Words with no letters will be mangled by the spell checker
-            || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
-            )
          {
-            canonical_form[canonical] = canonical;
+            std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
-            words.emplace(canonical, canonical);
+            if (fv_emoticons.count(emoticon_canon) == 1)
-          } else {
+            {
-            int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
+              emoticons.forms.add(emoticon_canon);
-            if (correct)
+          
+              return emoticons;
+            }
+          }
+        
+          // Basically any other word
+          if (canonical_form.count(canonical) == 0)
+          {
+            if (
+              // Legacy freevars should be distinct from tokens containing similar words
+              (canonical.find("$name$") != std::string::npos)
+              // Words with no letters will be mangled by the spell checker
+              || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
+              )
            {
-              words.emplace(canonical, canonical);
              canonical_form[canonical] = canonical;
+              words.emplace(canonical, canonical);
            } else {
-              const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
+              int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
-              AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
+              if (correct)
-              const char* replacement = aspell_string_enumeration_next(elements);
-              if (replacement != NULL)
              {
-                std::string sugrep(replacement);
-                canonical_form[canonical] = sugrep;
-          
-                if (words.count(sugrep) == 0)
-                {
-                  words.emplace(sugrep, sugrep);
-                }
-              } else {
                words.emplace(canonical, canonical);
                canonical_form[canonical] = canonical;
-              }
+              } else {
+                const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
+                AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
+                const char* replacement = aspell_string_enumeration_next(elements);
+                if (replacement != NULL)
+                {
+                  std::string sugrep(replacement);
+                  canonical_form[canonical] = sugrep;
          
-              delete_aspell_string_enumeration(elements);
+                  if (words.count(sugrep) == 0)
+                  {
+                    words.emplace(sugrep, sugrep);
+                  }
+                } else {
+                  words.emplace(canonical, canonical);
+                  canonical_form[canonical] = canonical;
+                }
+          
+                delete_aspell_string_enumeration(elements);
+              }
            }
          }
-        }
        
-        word& tw = words.at(canonical_form.at(canonical));
+          word& tw = words.at(canonical_form.at(canonical));
-        tw.forms.add(canonical);
+          tw.forms.add(canonical);
        
-        return tw;
+          return tw;
-      })();
+        })();
      
-      token tk(w);
+        token tk(w);
-      tk.raw = t;
+        tk.raw = t;
      
-      for (char c : t)
+        for (char c : t)
-      {
-        if (c == '*')
        {
-          tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
+          if (c == '*')
-        } else if (c == '[')
+          {
-        {
+            tk.delimiters[{parentype::asterisk, doublestatus::opening}]++;
-          tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
+          } else if (c == '[')
-        } else if (c == '(')
+          {
-        {
+            tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++;
-          tk.delimiters[{parentype::paren, doublestatus::opening}]++;
+          } else if (c == '(')
-        } else if (c == '"')
+          {
-        {
+            tk.delimiters[{parentype::paren, doublestatus::opening}]++;
-          tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+          } else if (c == '"')
-        } else {
+          {
-          break;
+            tk.delimiters[{parentype::quote, doublestatus::opening}]++;
+          } else {
+            break;
+          }
        }
-      }
      
-      int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
+        int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1;
-      if (backtrack != t.length())
+        if (backtrack != t.length())
-      {
-        std::string ending = t.substr(backtrack);
-        std::string suffix;
-        
-        for (char c : ending)
        {
-          if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+          std::string ending = t.substr(backtrack);
+          std::string suffix;
+        
+          for (char c : ending)
          {
-            suffix += c;
+            if ((c == '.') || (c == ',') || (c == '?') || (c == '!'))
+            {
+              suffix += c;
            
-            continue;
+              continue;
-          } else if (c == '\n')
+            } else if (c == '\n')
-          {
-            // At least the end is coming
-            if (suffix.empty())
            {
-              suffix = ".";
+              // At least the end is coming
-            }
+              if (suffix.empty())
+              {
+                suffix = ".";
+              }
            
-            break;
+              break;
-          }
+            }
+          
+            parentype pt = ([&] {
+              switch (c)
+              {
+                case ']': return parentype::square_bracket;
+                case ')': return parentype::paren;
+                case '*': return parentype::asterisk;
+                case '"': return parentype::quote;
+              }
+            })();
          
-          parentype pt = ([&] {
+            if (tk.delimiters[{pt, doublestatus::opening}] > 0)
-            switch (c)
            {
-              case ']': return parentype::square_bracket;
+              tk.delimiters[{pt, doublestatus::opening}]--;
-              case ')': return parentype::paren;
+              tk.delimiters[{pt, doublestatus::both}]++;
-              case '*': return parentype::asterisk;
+            } else {
-              case '"': return parentype::quote;
+              tk.delimiters[{pt, doublestatus::closing}]++;
            }
-          })();
-          
-          if (tk.delimiters[{pt, doublestatus::opening}] > 0)
-          {
-            tk.delimiters[{pt, doublestatus::opening}]--;
-            tk.delimiters[{pt, doublestatus::both}]++;
-          } else {
-            tk.delimiters[{pt, doublestatus::closing}]++;
          }
-        }
        
-        if (suffix == ",")
+          if (suffix == ",")
-        {
+          {
-          tk.suffix = suffixtype::comma;
+            tk.suffix = suffixtype::comma;
-        } else if (!suffix.empty()) {
+          } else if (!suffix.empty()) {
-          tk.suffix = suffixtype::terminating;
+            tk.suffix = suffixtype::terminating;
          
-          w.terms.add(suffix);
+            w.terms.add(suffix);
+          }
        }
-      }
      
-      tokens.push_back(tk);
+        tokens.push_back(tk);
-    }
+      }
-    start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
+      start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
+    }
+    
+    startper += _corpora[i].length();
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
@@ -420,7 +448,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
    
    kgram klist = it.first;
    auto& probtable = it.second;
-    auto& distribution = stats[klist];
+    auto& distribution = _stats[klist];
    int max = 0;
                
    for (auto& kt : probtable)
@@ -432,33 +460,61 @@ kgramstats::kgramstats(std::string corpus, int maxK)
  }
  
  std::cout << "\b\b\b\b100%" << std::endl;
+  
+  _compiled = true;
 }
-void printKgram(kgram k)
+std::ostream& operator<<(std::ostream& os, rawr::kgram k)
 {
  for (auto& q : k)
  {
-    if (q.type == querytype::sentence)
+    os << q << " ";
-    {
-      std::cout << "#.# ";
-    } else if (q.type == querytype::literal)
-    {
-      if (q.tok.suffix == suffixtype::terminating)
-      {
-        std::cout << q.tok.w.canon << ". ";
-      } else if (q.tok.suffix == suffixtype::comma)
-      {
-        std::cout << q.tok.w.canon << ", ";
-      } else {
-        std::cout << q.tok.w.canon << " ";
-      }
-    }
  }
+  
+  return os;
+}
+// Writes a query to the stream: "#.#" for a sentence query, or the wrapped
+// token for a literal query. Any other type writes nothing.
+std::ostream& operator<<(std::ostream& os, rawr::query q)
+{
+  switch (q.type)
+  {
+    case rawr::querytype::sentence:
+      os << "#.#";
+      break;
+    
+    case rawr::querytype::literal:
+      os << q.tok;
+      break;
+  }
+  
+  return os;
+}
+// Writes a token as its word's canonical form, followed by "." when the
+// token has a terminating suffix or "," when it has a comma suffix.
+std::ostream& operator<<(std::ostream& os, rawr::token t)
+{
+  os << t.w.canon;
+  
+  switch (t.suffix)
+  {
+    case rawr::suffixtype::terminating:
+      os << ".";
+      break;
+    
+    case rawr::suffixtype::comma:
+      os << ",";
+      break;
+    
+    case rawr::suffixtype::none:
+      break;
+  }
+  
+  return os;
+}
+void rawr::setTransformCallback(transform_callback _arg)
+{
+  _transform = _arg;
 }
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::string kgramstats::randomSentence(int maxL)
+std::string rawr::randomSentence(int maxL)
 {
+  if (!_compiled)
+  {
+    return "";
+  }
+  
  std::string result;
  kgram cur(1, wildcardQuery);
  int cuts = 0;
@@ -466,14 +522,14 @@ std::string kgramstats::randomSentence(int maxL)
        
  for (;;)
  {
-    if (cur.size() == maxK)
+    if (cur.size() == _maxK)
    {
      cur.pop_front();
    }
    
    if (cur.size() > 0)
    {
-      if (rand() % (maxK - cur.size() + 1) == 0)
+      if (rand() % (_maxK - cur.size() + 1) == 0)
      {
        while ((cur.size() > 2) && (cuts > 0))
        {
@@ -490,16 +546,22 @@ std::string kgramstats::randomSentence(int maxL)
    
    // Gotta circumvent the last line of the input corpus
    // https://twitter.com/starla4444/status/684222271339237376
-    if (stats.count(cur) == 0)
+    if (_stats.count(cur) == 0)
    {
      cur = kgram(1, wildcardQuery);
    }
-    auto& distribution = stats[cur];
+    auto& distribution = _stats[cur];
    int max = distribution.rbegin()->first;
    int r = rand() % max;
    token_data& next = distribution.upper_bound(r)->second;
    std::string nextToken = next.tok.w.forms.next();
+    
+    // Apply user-specified transforms
+    if (_transform)
+    {
+      nextToken = _transform(next.tok.w.canon, nextToken);
+    }
  
    // Determine the casing of the next token. We randomly make the token all
    // caps based on the markov chain. Otherwise, we check if the previous
@@ -600,8 +662,7 @@ std::string kgramstats::randomSentence(int maxL)
    }
                
    /* DEBUG */
-    printKgram(cur);
+    std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
-    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl;
    cur.push_back(next.tok);
                
@@ -633,29 +694,7 @@ std::string kgramstats::randomSentence(int maxL)
    open_delimiters.pop();
  }
  
-  // Replace old-style freevars while I can't be bothered to remake the corpus yet
+  // Enforce the length limit by truncation only: resize() on a shorter result
+  // would pad it with NUL characters, and a negative maxL would convert to an
+  // enormous size, so it is treated as a limit of zero.
+  const size_t limit = (maxL > 0) ? static_cast<size_t>(maxL) : 0;
+  if (result.length() > limit)
+  {
+    result.resize(limit);
+  }
-  std::vector<std::string> fv_names;
-  std::ifstream namefile("names.txt");
-  if (namefile.is_open())
-  {
-    while (!namefile.eof())
-    {
-      std::string l;
-      getline(namefile, l);
-      if (l.back() == '\r')
-      {
-        l.pop_back();
-      }
-      
-      fv_names.push_back(l);
-    }
-  
-    int cpos;
-    while ((cpos = result.find("$name$")) != std::string::npos)
-    {
-      result.replace(cpos, 6, fv_names[rand() % fv_names.size()]);
-    }
-  }
        
  return result;
 }
diff --git a/kgramstats.h b/kgramstats.h
index 5fad37d..ee75ada 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -1,124 +1,135 @@
+#ifndef KGRAMSTATS_H
+#define KGRAMSTATS_H
 #include <string>
 #include <map>
 #include <list>
 #include <vector>
 #include "histogram.h"
+#include <functional>
-#ifndef KGRAMSTATS_H
+class rawr {
-#define KGRAMSTATS_H
+  public:
+    typedef std::function<std::string(std::string, std::string)> transform_callback;
-struct word {
+    
-  std::string canon;
+    void addCorpus(std::string corpus);
-  histogram<std::string> forms;
+    void compile(int maxK);
-  histogram<std::string> terms;
+    
+    void setTransformCallback(transform_callback _arg);
+        std::string randomSentence(int maxL);
+        
+  private:
+    struct word {
+      std::string canon;
+      histogram<std::string> forms;
+      histogram<std::string> terms;
  
-  word(std::string canon) : canon(canon) {}
+      word(std::string canon) : canon(canon) {}
  
-  bool operator<(const word& other) const
+      bool operator<(const word& other) const
-  {
+      {
-    return canon < other.canon;
+        return canon < other.canon;
-  }
+      }
-};
+    };
-extern word blank_word;
-enum class suffixtype {
+    enum class suffixtype {
-  none,
+      none,
-  terminating,
+      terminating,
-  comma
+      comma
-};
+    };
-enum class parentype {
+    enum class parentype {
-  paren,
+      paren,
-  square_bracket,
+      square_bracket,
-  asterisk,
+      asterisk,
-  quote
+      quote
-};
+    };
-enum class doublestatus {
+    enum class doublestatus {
-  opening,
+      opening,
-  closing,
+      closing,
-  both
+      both
-};
+    };
-struct delimiter {
+    struct delimiter {
-  parentype type;
+      parentype type;
-  doublestatus status;
+      doublestatus status;
  
-  delimiter(parentype type, doublestatus status) : type(type), status(status) {}
+      delimiter(parentype type, doublestatus status) : type(type), status(status) {}
  
-  bool operator<(const delimiter& other) const
+      bool operator<(const delimiter& other) const
-  {
+      {
-    return std::tie(type, status) < std::tie(other.type, other.status);
+        return std::tie(type, status) < std::tie(other.type, other.status);
-  }
+      }
-};
+    };
-struct token {
+    struct token {
-  const word& w;
+      const word& w;
-  std::map<delimiter, int> delimiters;
+      std::map<delimiter, int> delimiters;
-  suffixtype suffix;
+      suffixtype suffix;
-  std::string raw;
+      std::string raw;
    
-  token(const word& w) : w(w), suffix(suffixtype::none) {}
+      token(const word& w) : w(w), suffix(suffixtype::none) {}
  
-  bool operator<(const token& other) const
+      bool operator<(const token& other) const
-  {
+      {
-    return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
+        return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
-  }
+      }
-};
+    };
-enum class querytype {
+    enum class querytype {
-  literal,
+      literal,
-  sentence
+      sentence
-};
+    };
-struct query {
+    struct query {
-  querytype type;
+      querytype type;
-  token tok;
+      token tok;
  
-  query(token tok) : tok(tok), type(querytype::literal) {}
+      query(token tok) : tok(tok), type(querytype::literal) {}
  
-  query(querytype type) : tok(blank_word), type(type) {}
+      query(querytype type) : tok(blank_word), type(type) {}
  
-  bool operator<(const query& other) const
+      bool operator<(const query& other) const
-  {
+      {
-    if (type == other.type)
+        if (type == other.type)
-    {
+        {
-      return tok < other.tok;
+          return tok < other.tok;
-    } else {
+        } else {
-      return type < other.type;
+          return type < other.type;
-    }
+        }
-  }
+      }
-};
+    };
+    
-typedef std::list<query> kgram;
+    static const query wildcardQuery;
+    static const word blank_word;
-class kgramstats
+    typedef std::list<query> kgram;
-{
-public:
-        kgramstats(std::string corpus, int maxK);
-        std::string randomSentence(int maxL);
-        
-private:
-        struct token_data
-        {
-                int all;
-                int titlecase;
-                int uppercase;
-    token tok;
    
-    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
+        struct token_data
-        };
+        {
+                int all;
+                int titlecase;
+                int uppercase;
+      token tok;
+    
+      token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
+        };
+    
+    friend std::ostream& operator<<(std::ostream& os, kgram k);
+    friend std::ostream& operator<<(std::ostream& os, query q);
+    friend std::ostream& operator<<(std::ostream& os, token t);
  
-        int maxK;
+        int _maxK;
-        std::map<kgram, std::map<int, token_data> > stats;
+    bool _compiled = false; 
+    std::vector<std::string> _corpora;
+        std::map<kgram, std::map<int, token_data>> _stats;
+    transform_callback _transform;
  
-  // Words
+    // Words
-  std::map<std::string, word> words;
+    std::map<std::string, word> words;
-  word hashtags {"#hashtag"};
+    word hashtags {"#hashtag"};
-  word emoticons {"👌"};
+    word emoticons {"👌"};
 };
-void printKgram(kgram k);
 #endif
 \ No newline at end of file
diff --git a/rawr.h b/rawr.h
new file mode 100644
index 0000000..2b5daf7
--- /dev/null
+++ b/rawr.h

@@ -0,0 +1,6 @@
+#ifndef RAWR_H_E903544C
+#define RAWR_H_E903544C
+#include "kgramstats.h"
+#endif /* end of include guard: RAWR_H_E903544C */