diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-29 12:43:00 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-29 12:43:00 -0500 |
commit | b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch) | |
tree | f21bd883ef7c4255a91d096ea105feaad135ee52 | |
parent | fd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff) | |
download | rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2 rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip |
Rewrote how tokens are handled
A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags. Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace them.
-rw-r--r-- | CMakeLists.txt | 8 | ||||
-rw-r--r-- | ebooks.cpp | 15 | ||||
-rw-r--r-- | freevars.cpp | 4 | ||||
-rw-r--r-- | gen.cpp | 15 | ||||
-rw-r--r-- | histogram.cpp | 34 | ||||
-rw-r--r-- | histogram.h | 19 | ||||
-rw-r--r-- | kgramstats.cpp | 453 | ||||
-rw-r--r-- | kgramstats.h | 124 |
8 files changed, 406 insertions, 266 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index aa63a34..41c4552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt | |||
@@ -8,10 +8,14 @@ find_package(curl) | |||
8 | if (YamlCpp_FOUND AND CURL_FOUND) | 8 | if (YamlCpp_FOUND AND CURL_FOUND) |
9 | add_subdirectory(vendor/twitcurl/libtwitcurl) | 9 | add_subdirectory(vendor/twitcurl/libtwitcurl) |
10 | include_directories(vendor/twitcurl/libtwitcurl) | 10 | include_directories(vendor/twitcurl/libtwitcurl) |
11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp) | 11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp) |
12 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) | ||
13 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) | ||
12 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) | 14 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) |
13 | else (YamlCpp_FOUND AND CURL_FOUND) | 15 | else (YamlCpp_FOUND AND CURL_FOUND) |
14 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") | 16 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") |
15 | endif (YamlCpp_FOUND AND CURL_FOUND) | 17 | endif (YamlCpp_FOUND AND CURL_FOUND) |
16 | 18 | ||
17 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp) | 19 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp) |
20 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) | ||
21 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) | ||
diff --git a/ebooks.cpp b/ebooks.cpp index e38ebab..ed1e080 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
@@ -44,20 +44,9 @@ int main(int argc, char** args) | |||
44 | std::cout << "Generating..." << std::endl; | 44 | std::cout << "Generating..." << std::endl; |
45 | for (;;) | 45 | for (;;) |
46 | { | 46 | { |
47 | std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); | 47 | std::string doc = stats->randomSentence(rand() % 45 + 5); |
48 | std::string hi; | 48 | std::string hi = vars->parse(doc); |
49 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | ||
50 | { | ||
51 | hi += vars->parse(*it) + " "; | ||
52 | } | ||
53 | |||
54 | hi.resize(140); | 49 | hi.resize(140); |
55 | |||
56 | size_t lastperiod = hi.find_last_of(".!?,"); | ||
57 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
58 | { | ||
59 | hi = hi.substr(0, lastperiod+1); | ||
60 | } | ||
61 | 50 | ||
62 | std::string replyMsg; | 51 | std::string replyMsg; |
63 | if (twitter.statusUpdate(hi)) | 52 | if (twitter.statusUpdate(hi)) |
diff --git a/freevars.cpp b/freevars.cpp index 8c3eda4..54c5aab 100644 --- a/freevars.cpp +++ b/freevars.cpp | |||
@@ -34,8 +34,8 @@ std::string freevars::parse(std::string in) | |||
34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) | 34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) |
35 | { | 35 | { |
36 | std::string tofind = "$" + it->first + "$"; | 36 | std::string tofind = "$" + it->first + "$"; |
37 | size_t fpos = res.find(tofind); | 37 | size_t fpos; |
38 | if (fpos != std::string::npos) | 38 | while ((fpos = res.find(tofind)) != std::string::npos) |
39 | { | 39 | { |
40 | int r = rand() % it->second->size(); | 40 | int r = rand() % it->second->size(); |
41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); | 41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); |
diff --git a/gen.cpp b/gen.cpp index 400c0a5..a0ef8e3 100644 --- a/gen.cpp +++ b/gen.cpp | |||
@@ -52,21 +52,10 @@ int main(int argc, char** args) | |||
52 | std::cout << "Generating..." << std::endl; | 52 | std::cout << "Generating..." << std::endl; |
53 | for (;;) | 53 | for (;;) |
54 | { | 54 | { |
55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); | 55 | std::string doc = stats->randomSentence(rand() % 35 + 15); |
56 | std::string hi; | 56 | std::string hi = vars->parse(doc); |
57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | ||
58 | { | ||
59 | hi += vars->parse(*it) + " "; | ||
60 | } | ||
61 | |||
62 | hi.resize(140); | 57 | hi.resize(140); |
63 | 58 | ||
64 | size_t lastperiod = hi.find_last_of(".!?,"); | ||
65 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | ||
66 | { | ||
67 | hi = hi.substr(0, lastperiod+1); | ||
68 | } | ||
69 | |||
70 | std::cout << hi << std::endl; | 59 | std::cout << hi << std::endl; |
71 | 60 | ||
72 | getc(stdin); | 61 | getc(stdin); |
diff --git a/histogram.cpp b/histogram.cpp new file mode 100644 index 0000000..6896146 --- /dev/null +++ b/histogram.cpp | |||
@@ -0,0 +1,34 @@ | |||
1 | #include "histogram.h" | ||
2 | #include <cstdlib> | ||
3 | |||
4 | template <class T> | ||
5 | void histogram<T>::add(const T& inst) | ||
6 | { | ||
7 | freqtable[inst]++; | ||
8 | } | ||
9 | |||
10 | template <class T> | ||
11 | void histogram<T>::compile() | ||
12 | { | ||
13 | distribution.clear(); | ||
14 | |||
15 | int max = 0; | ||
16 | for (auto& it : freqtable) | ||
17 | { | ||
18 | max += it.second; | ||
19 | distribution.emplace(max, it.first); | ||
20 | } | ||
21 | |||
22 | freqtable.clear(); | ||
23 | } | ||
24 | |||
25 | template <class T> | ||
26 | const T& histogram<T>::next() const | ||
27 | { | ||
28 | int max = distribution.rbegin()->first; | ||
29 | int r = rand() % max; | ||
30 | |||
31 | return distribution.upper_bound(r)->second; | ||
32 | } | ||
33 | |||
34 | template class histogram <std::string>; | ||
diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000..5aa2560 --- /dev/null +++ b/histogram.h | |||
@@ -0,0 +1,19 @@ | |||
1 | #ifndef HISTOGRAM_H_24094D97 | ||
2 | #define HISTOGRAM_H_24094D97 | ||
3 | |||
4 | #include <map> | ||
5 | #include <string> | ||
6 | |||
7 | template <class T> | ||
8 | class histogram { | ||
9 | public: | ||
10 | void add(const T& inst); | ||
11 | void compile(); | ||
12 | const T& next() const; | ||
13 | |||
14 | private: | ||
15 | std::map<T, int> freqtable; | ||
16 | std::map<int, T> distribution; | ||
17 | }; | ||
18 | |||
19 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ | ||
diff --git a/kgramstats.cpp b/kgramstats.cpp index 4bb7f15..0ab0c99 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -37,35 +37,11 @@ | |||
37 | #include <iostream> | 37 | #include <iostream> |
38 | #include <cstdlib> | 38 | #include <cstdlib> |
39 | #include <algorithm> | 39 | #include <algorithm> |
40 | #include "malaprop.h" | 40 | #include <set> |
41 | #include <stack> | ||
41 | 42 | ||
42 | query wildcardQuery(querytype_sentence); | 43 | query wildcardQuery {querytype::sentence}; |
43 | 44 | word blank_word {""}; | |
44 | std::string canonize(std::string f); | ||
45 | |||
46 | token token_from_string(std::string in) | ||
47 | { | ||
48 | if (in[0] == '#') | ||
49 | { | ||
50 | token word(tokentype_hashtag); | ||
51 | |||
52 | if (in.find_first_of(".?!,") != std::string::npos) | ||
53 | { | ||
54 | word.terminating = true; | ||
55 | } | ||
56 | |||
57 | return word; | ||
58 | } else { | ||
59 | token word(canonize(in)); | ||
60 | |||
61 | if (in.find_first_of(".?!,") != std::string::npos) | ||
62 | { | ||
63 | word.terminating = true; | ||
64 | } | ||
65 | |||
66 | return word; | ||
67 | } | ||
68 | } | ||
69 | 45 | ||
70 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 46 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
71 | // We consider maxK to be fairly constant | 47 | // We consider maxK to be fairly constant |
@@ -73,7 +49,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
73 | { | 49 | { |
74 | this->maxK = maxK; | 50 | this->maxK = maxK; |
75 | 51 | ||
76 | std::vector<std::string> tokens; | 52 | std::vector<token> tokens; |
77 | size_t start = 0; | 53 | size_t start = 0; |
78 | int end = 0; | 54 | int end = 0; |
79 | std::set<std::string> thashtags; | 55 | std::set<std::string> thashtags; |
@@ -82,88 +58,186 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
82 | { | 58 | { |
83 | end = corpus.find(" ", start); | 59 | end = corpus.find(" ", start); |
84 | 60 | ||
85 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 61 | std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
86 | if (token[token.length()-1] == '\n') | 62 | if (t.compare("") && t.compare(".")) |
87 | { | 63 | { |
88 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) | 64 | std::string tc(t), canonical; |
65 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | ||
66 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | ||
67 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
68 | }); | ||
69 | |||
70 | word& w = ([&] () -> word& { | ||
71 | // Hashtag freevar | ||
72 | if (canonical[0] == '#') | ||
73 | { | ||
74 | thashtags.insert(canonical); | ||
75 | canonical = "#hashtag"; | ||
76 | |||
77 | return hashtags; | ||
78 | } | ||
79 | |||
80 | // Basically any other word | ||
81 | if (words.count(canonical) == 0) | ||
82 | { | ||
83 | words.emplace(canonical, canonical); | ||
84 | } | ||
85 | |||
86 | word& tw = words.at(canonical); | ||
87 | tw.forms.add(canonical); | ||
88 | |||
89 | return tw; | ||
90 | })(); | ||
91 | |||
92 | token tk(w); | ||
93 | tk.raw = t; | ||
94 | |||
95 | for (char c : t) | ||
89 | { | 96 | { |
90 | token.insert(token.length()-1, "."); | 97 | if (c == '*') |
98 | { | ||
99 | tk.delimiters[{parentype::asterisk, doublestatus::opening}]++; | ||
100 | } else if (c == '[') | ||
101 | { | ||
102 | tk.delimiters[{parentype::square_bracket, doublestatus::opening}]++; | ||
103 | } else if (c == '(') | ||
104 | { | ||
105 | tk.delimiters[{parentype::paren, doublestatus::opening}]++; | ||
106 | } else if (c == '"') | ||
107 | { | ||
108 | tk.delimiters[{parentype::quote, doublestatus::opening}]++; | ||
109 | } else { | ||
110 | break; | ||
111 | } | ||
91 | } | 112 | } |
92 | |||
93 | token.resize(token.length()-1); | ||
94 | } | ||
95 | |||
96 | if (token.compare("") && token.compare(".")) | ||
97 | { | ||
98 | mstats.addWord(token); | ||
99 | tokens.push_back(token); | ||
100 | 113 | ||
101 | if (token[0] == '#') | 114 | int backtrack = t.find_last_not_of(".,?!])\"*\n") + 1; |
115 | if (backtrack != t.length()) | ||
102 | { | 116 | { |
103 | thashtags.insert(canonize(token)); | 117 | std::string ending = t.substr(backtrack); |
118 | std::string suffix; | ||
119 | |||
120 | for (char c : ending) | ||
121 | { | ||
122 | if ((c == '.') || (c == ',') || (c == '?') || (c == '!')) | ||
123 | { | ||
124 | suffix += c; | ||
125 | |||
126 | continue; | ||
127 | } else if (c == '\n') | ||
128 | { | ||
129 | // At least the end is coming | ||
130 | if (suffix.empty()) | ||
131 | { | ||
132 | suffix = "."; | ||
133 | } | ||
134 | |||
135 | break; | ||
136 | } | ||
137 | |||
138 | parentype pt = ([&] { | ||
139 | switch (c) | ||
140 | { | ||
141 | case ']': return parentype::square_bracket; | ||
142 | case ')': return parentype::paren; | ||
143 | case '*': return parentype::asterisk; | ||
144 | case '"': return parentype::quote; | ||
145 | } | ||
146 | })(); | ||
147 | |||
148 | if (tk.delimiters[{pt, doublestatus::opening}] > 0) | ||
149 | { | ||
150 | tk.delimiters[{pt, doublestatus::opening}]--; | ||
151 | tk.delimiters[{pt, doublestatus::both}]++; | ||
152 | } else { | ||
153 | tk.delimiters[{pt, doublestatus::closing}]++; | ||
154 | } | ||
155 | } | ||
156 | |||
157 | if (suffix == ",") | ||
158 | { | ||
159 | tk.suffix = suffixtype::comma; | ||
160 | } else if (!suffix.empty()) { | ||
161 | tk.suffix = suffixtype::terminating; | ||
162 | |||
163 | w.terms.add(suffix); | ||
164 | } | ||
104 | } | 165 | } |
166 | |||
167 | tokens.push_back(tk); | ||
105 | } | 168 | } |
106 | 169 | ||
107 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 170 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
108 | } | 171 | } |
109 | 172 | ||
110 | for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) | 173 | // Time to condense the distribution stuff for the words |
174 | for (auto& it : words) | ||
111 | { | 175 | { |
112 | hashtags.push_back(*it); | 176 | it.second.forms.compile(); |
177 | it.second.terms.compile(); | ||
113 | } | 178 | } |
114 | 179 | ||
180 | // Hashtag freevar is not frequency distributed | ||
181 | for (auto& it : thashtags) | ||
182 | { | ||
183 | hashtags.forms.add(it); | ||
184 | } | ||
185 | |||
186 | hashtags.forms.compile(); | ||
187 | hashtags.terms.compile(); | ||
188 | |||
189 | // kgram distribution | ||
115 | std::map<kgram, std::map<token, token_data> > tstats; | 190 | std::map<kgram, std::map<token, token_data> > tstats; |
116 | std::map<token, std::map<termstats, int> > tendings; | ||
117 | for (int k=1; k<maxK; k++) | 191 | for (int k=1; k<maxK; k++) |
118 | { | 192 | { |
119 | for (int i=0; i<(tokens.size() - k); i++) | 193 | for (int i=0; i<(tokens.size() - k); i++) |
120 | { | 194 | { |
121 | std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); | 195 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); |
122 | kgram prefix; | 196 | token f = tokens[i+k]; |
123 | 197 | ||
124 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) | 198 | if (tstats[prefix].count(f) == 0) |
125 | { | ||
126 | prefix.push_back(token_from_string(*it)); | ||
127 | } | ||
128 | |||
129 | std::string f = tokens[i+k]; | ||
130 | std::string canonical = canonize(f); | ||
131 | |||
132 | token word(token_from_string(canonical)); | ||
133 | if (f.find_first_of(".?!,") != std::string::npos) | ||
134 | { | 199 | { |
135 | word.terminating = true; | 200 | tstats[prefix].emplace(f, f); |
136 | |||
137 | char terminator = f[f.find_last_of(".?!,")]; | ||
138 | int occurrences = std::count(f.begin(), f.end(), terminator); | ||
139 | |||
140 | tendings[word][termstats(terminator, occurrences)]++; | ||
141 | } | 201 | } |
142 | 202 | ||
143 | token_data& td = tstats[prefix][word]; | 203 | token_data& td = tstats[prefix].at(f); |
144 | td.word = word; | ||
145 | td.all++; | 204 | td.all++; |
146 | 205 | ||
147 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 206 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
148 | { | 207 | { |
149 | td.uppercase++; | 208 | td.uppercase++; |
150 | } else if (isupper(f[0])) | 209 | } else if (isupper(f.raw[0])) |
151 | { | 210 | { |
152 | td.titlecase++; | 211 | td.titlecase++; |
153 | } | 212 | } |
154 | 213 | ||
155 | if (prefix.front().word.terminating) | 214 | kgram term_prefix; |
215 | bool changed = false; | ||
216 | std::transform(prefix.begin(), prefix.end(), std::back_inserter(term_prefix), [&] (query& q) { | ||
217 | if (q.tok.suffix == suffixtype::terminating) | ||
218 | { | ||
219 | changed = true; | ||
220 | |||
221 | return wildcardQuery; | ||
222 | } else { | ||
223 | return q; | ||
224 | } | ||
225 | }); | ||
226 | |||
227 | if (changed) | ||
156 | { | 228 | { |
157 | prefix.front() = wildcardQuery; | 229 | if (tstats[term_prefix].count(f) == 0) |
230 | { | ||
231 | tstats[term_prefix].emplace(f, f); | ||
232 | } | ||
158 | 233 | ||
159 | token_data& td2 = tstats[prefix][word]; | 234 | token_data& td2 = tstats[term_prefix].at(f); |
160 | td2.word = word; | ||
161 | td2.all++; | 235 | td2.all++; |
162 | 236 | ||
163 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 237 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
164 | { | 238 | { |
165 | td2.uppercase++; | 239 | td2.uppercase++; |
166 | } else if (isupper(f[0])) | 240 | } else if (isupper(f.raw[0])) |
167 | { | 241 | { |
168 | td2.titlecase++; | 242 | td2.titlecase++; |
169 | } | 243 | } |
@@ -171,74 +245,52 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
171 | } | 245 | } |
172 | } | 246 | } |
173 | 247 | ||
174 | for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) | 248 | // Condense the kgram distribution |
249 | for (auto& it : tstats) | ||
175 | { | 250 | { |
176 | kgram klist = it->first; | 251 | kgram klist = it.first; |
177 | std::map<token, token_data>& probtable = it->second; | 252 | auto& probtable = it.second; |
178 | std::map<int, token_data>& distribution = stats[klist]; | 253 | auto& distribution = stats[klist]; |
179 | int max = 0; | 254 | int max = 0; |
180 | 255 | ||
181 | for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | 256 | for (auto& kt : probtable) |
182 | { | 257 | { |
183 | max += kt->second.all; | 258 | max += kt.second.all; |
184 | 259 | ||
185 | distribution[max] = kt->second; | 260 | distribution.emplace(max, kt.second); |
186 | } | ||
187 | } | ||
188 | |||
189 | for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++) | ||
190 | { | ||
191 | token word = it->first; | ||
192 | std::map<termstats, int>& probtable = it->second; | ||
193 | std::map<int, termstats>& distribution = endings[word]; | ||
194 | int max = 0; | ||
195 | |||
196 | for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | ||
197 | { | ||
198 | max += kt->second; | ||
199 | |||
200 | distribution[max] = kt->first; | ||
201 | } | 261 | } |
202 | } | 262 | } |
203 | } | 263 | } |
204 | 264 | ||
205 | void printKgram(kgram k) | 265 | void printKgram(kgram k) |
206 | { | 266 | { |
207 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 267 | for (auto& q : k) |
208 | { | 268 | { |
209 | query& q = *it; | 269 | if (q.type == querytype::sentence) |
210 | if (q.type == querytype_sentence) | ||
211 | { | 270 | { |
212 | std::cout << "#.# "; | 271 | std::cout << "#.# "; |
213 | } else if (q.type == querytype_literal) | 272 | } else if (q.type == querytype::literal) |
214 | { | 273 | { |
215 | if (q.word.type == tokentype_hashtag) | 274 | if (q.tok.suffix == suffixtype::terminating) |
216 | { | 275 | { |
217 | if (q.word.terminating) | 276 | std::cout << q.tok.w.canon << ". "; |
218 | { | 277 | } else if (q.tok.suffix == suffixtype::comma) |
219 | std::cout << "#hashtag. "; | ||
220 | } else { | ||
221 | std::cout << "#hashtag "; | ||
222 | } | ||
223 | } else if (q.word.type == tokentype_literal) | ||
224 | { | 278 | { |
225 | if (q.word.terminating) | 279 | std::cout << q.tok.w.canon << ", "; |
226 | { | 280 | } else { |
227 | std::cout << q.word.canon << ". "; | 281 | std::cout << q.tok.w.canon << " "; |
228 | } else { | ||
229 | std::cout << q.word.canon << " "; | ||
230 | } | ||
231 | } | 282 | } |
232 | } | 283 | } |
233 | } | 284 | } |
234 | } | 285 | } |
235 | 286 | ||
236 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 287 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
237 | std::vector<std::string> kgramstats::randomSentence(int n) | 288 | std::string kgramstats::randomSentence(int n) |
238 | { | 289 | { |
239 | std::vector<std::string> result; | 290 | std::string result; |
240 | kgram cur(1, wildcardQuery); | 291 | kgram cur(1, wildcardQuery); |
241 | int cuts = 0; | 292 | int cuts = 0; |
293 | std::stack<parentype> open_delimiters; | ||
242 | 294 | ||
243 | for (int i=0; i<n; i++) | 295 | for (int i=0; i<n; i++) |
244 | { | 296 | { |
@@ -273,86 +325,135 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
273 | cur = kgram(1, wildcardQuery); | 325 | cur = kgram(1, wildcardQuery); |
274 | } | 326 | } |
275 | 327 | ||
276 | std::map<int, token_data>& distribution = stats[cur]; | 328 | auto& distribution = stats[cur]; |
277 | int max = distribution.rbegin()->first; | 329 | int max = distribution.rbegin()->first; |
278 | int r = rand() % max; | 330 | int r = rand() % max; |
279 | token_data& next = distribution.upper_bound(r)->second; | 331 | token_data& next = distribution.upper_bound(r)->second; |
280 | std::string nextToken; | 332 | std::string nextToken = next.tok.w.forms.next(); |
281 | bool mess = false; | 333 | |
282 | 334 | // Determine the casing of the next token. We randomly make the token all | |
283 | if (next.word.type == tokentype_literal) | 335 | // caps based on the markov chain. Otherwise, we check if the previous |
336 | // token is the end of a sentence (terminating token or a wildcard query). | ||
337 | int casing = rand() % next.all; | ||
338 | if (casing < next.uppercase) | ||
339 | { | ||
340 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
341 | } else if ((((cur.rbegin()->type == querytype::sentence) | ||
342 | || ((cur.rbegin()->type == querytype::literal) | ||
343 | && (cur.rbegin()->tok.suffix == suffixtype::terminating))) | ||
344 | && (rand() % 2 > 0)) | ||
345 | || (casing - next.uppercase < next.titlecase)) | ||
284 | { | 346 | { |
285 | nextToken = next.word.canon; | 347 | nextToken[0] = toupper(nextToken[0]); |
348 | } | ||
286 | 349 | ||
287 | mess = (rand() % 100) == 0; | 350 | // Delimiters |
288 | if (mess) | 351 | for (auto& dt : next.tok.delimiters) |
352 | { | ||
353 | if (dt.first.status == doublestatus::both) | ||
289 | { | 354 | { |
290 | nextToken = mstats.alternate(nextToken); | 355 | switch (dt.first.type) |
291 | } | 356 | { |
292 | 357 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken + std::string(")", dt.second); break; | |
293 | // Determine the casing of the next token. We randomly make the token all | 358 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken + std::string("]", dt.second); break; |
294 | // caps based on the markov chain. Otherwise, we check if the previous | 359 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken + std::string("*", dt.second); break; |
295 | // token is the end of a sentence (terminating token or a wildcard query). | 360 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken + std::string("\"", dt.second); break; |
296 | int casing = rand() % next.all; | 361 | } |
297 | if (casing < next.uppercase) | 362 | } else if (dt.first.status == doublestatus::opening) |
298 | { | 363 | { |
299 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 364 | for (int i=0; i<dt.second; i++) |
300 | } else if ((((cur.rbegin()->type == querytype_sentence) | 365 | { |
301 | || ((cur.rbegin()->type == querytype_literal) | 366 | open_delimiters.push(dt.first.type); |
302 | && (cur.rbegin()->word.terminating))) | 367 | } |
303 | && (rand() % 2 > 0)) | 368 | |
304 | || (casing - next.uppercase < next.titlecase)) | 369 | switch (dt.first.type) |
370 | { | ||
371 | case parentype::paren: nextToken = std::string("(", dt.second) + nextToken; break; | ||
372 | case parentype::square_bracket: nextToken = std::string("[", dt.second) + nextToken; break; | ||
373 | case parentype::asterisk: nextToken = std::string("*", dt.second) + nextToken; break; | ||
374 | case parentype::quote: nextToken = std::string("\"", dt.second) + nextToken; break; | ||
375 | } | ||
376 | } else if (dt.first.status == doublestatus::closing) | ||
305 | { | 377 | { |
306 | nextToken[0] = toupper(nextToken[0]); | 378 | for (int i=0; i<dt.second; i++) |
379 | { | ||
380 | while (!open_delimiters.empty() && (open_delimiters.top() != dt.first.type)) | ||
381 | { | ||
382 | switch (open_delimiters.top()) | ||
383 | { | ||
384 | case parentype::paren: nextToken.append(")"); break; | ||
385 | case parentype::square_bracket: nextToken.append("]"); break; | ||
386 | case parentype::asterisk: nextToken.append("*"); break; | ||
387 | case parentype::quote: nextToken.append("\""); break; | ||
388 | } | ||
389 | |||
390 | open_delimiters.pop(); | ||
391 | } | ||
392 | |||
393 | if (open_delimiters.empty()) | ||
394 | { | ||
395 | switch (dt.first.type) | ||
396 | { | ||
397 | case parentype::paren: result = "(" + result; break; | ||
398 | case parentype::square_bracket: result = "[" + result; break; | ||
399 | case parentype::asterisk: result = "*" + result; break; | ||
400 | case parentype::quote: result = "\"" + result; break; | ||
401 | } | ||
402 | } | ||
403 | |||
404 | switch (dt.first.type) | ||
405 | { | ||
406 | case parentype::paren: nextToken.append(")"); break; | ||
407 | case parentype::square_bracket: nextToken.append("]"); break; | ||
408 | case parentype::asterisk: nextToken.append("*"); break; | ||
409 | case parentype::quote: nextToken.append("\""); break; | ||
410 | } | ||
411 | } | ||
307 | } | 412 | } |
308 | } else if (next.word.type == tokentype_hashtag) | ||
309 | { | ||
310 | int rhash = rand() % hashtags.size(); | ||
311 | nextToken = hashtags[rhash]; | ||
312 | } | 413 | } |
313 | 414 | ||
314 | if (next.word.terminating) | 415 | // Terminators |
416 | if (next.tok.suffix == suffixtype::terminating) | ||
315 | { | 417 | { |
316 | std::map<int, termstats>& ending = endings[next.word]; | 418 | nextToken.append(next.tok.w.terms.next()); |
317 | int emax = ending.rbegin()->first; | 419 | } else if (next.tok.suffix == suffixtype::comma) |
318 | int er = rand() % emax; | 420 | { |
319 | termstats& nextend = ending.upper_bound(er)->second; | 421 | nextToken.append(","); |
320 | |||
321 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); | ||
322 | } | 422 | } |
323 | 423 | ||
324 | /* DEBUG */ | 424 | /* DEBUG */ |
325 | printKgram(cur); | 425 | printKgram(cur); |
426 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | ||
427 | |||
428 | cur.push_back(next.tok); | ||
326 | 429 | ||
327 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; | 430 | result.append(nextToken + " "); |
328 | 431 | ||
329 | if (mess) | 432 | if ((next.tok.suffix == suffixtype::terminating) && (rand() % 4 == 0)) |
330 | { | 433 | { |
331 | std::cout << " mala " << next.word.canon; | 434 | break; |
332 | } | 435 | } |
333 | |||
334 | std::cout << std::endl; | ||
335 | |||
336 | cur.push_back(next.word); | ||
337 | |||
338 | result.push_back(nextToken); | ||
339 | } | 436 | } |
340 | |||
341 | return result; | ||
342 | } | ||
343 | |||
344 | bool removeIf(char c) | ||
345 | { | ||
346 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/); | ||
347 | } | ||
348 | |||
349 | std::string canonize(std::string f) | ||
350 | { | ||
351 | std::string canonical(f); | ||
352 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | ||
353 | 437 | ||
354 | std::string result; | 438 | // Remove the trailing space |
355 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 439 | if (result.back() == ' ') |
440 | { | ||
441 | result.pop_back(); | ||
442 | } | ||
443 | |||
444 | // Close any open delimiters | ||
445 | while (!open_delimiters.empty()) | ||
446 | { | ||
447 | switch (open_delimiters.top()) | ||
448 | { | ||
449 | case parentype::paren: result.append(")"); break; | ||
450 | case parentype::square_bracket: result.append("]"); break; | ||
451 | case parentype::asterisk: result.append("*"); break; | ||
452 | case parentype::quote: result.append("\""); break; | ||
453 | } | ||
454 | |||
455 | open_delimiters.pop(); | ||
456 | } | ||
356 | 457 | ||
357 | return result; | 458 | return result; |
358 | } | 459 | } |
diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -2,61 +2,89 @@ | |||
2 | #include <map> | 2 | #include <map> |
3 | #include <list> | 3 | #include <list> |
4 | #include <vector> | 4 | #include <vector> |
5 | #include "malaprop.h" | 5 | #include "histogram.h" |
6 | 6 | ||
7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
9 | 9 | ||
10 | enum tokentype { | 10 | struct word { |
11 | tokentype_literal, | 11 | std::string canon; |
12 | tokentype_hashtag | 12 | histogram<std::string> forms; |
13 | histogram<std::string> terms; | ||
14 | |||
15 | word(std::string canon) : canon(canon) {} | ||
16 | |||
17 | bool operator<(const word& other) const | ||
18 | { | ||
19 | return canon < other.canon; | ||
20 | } | ||
13 | }; | 21 | }; |
14 | 22 | ||
15 | struct token { | 23 | extern word blank_word; |
16 | tokentype type; | 24 | |
17 | std::string canon; | 25 | enum class suffixtype { |
18 | bool terminating; | 26 | none, |
27 | terminating, | ||
28 | comma | ||
29 | }; | ||
30 | |||
31 | enum class parentype { | ||
32 | paren, | ||
33 | square_bracket, | ||
34 | asterisk, | ||
35 | quote | ||
36 | }; | ||
37 | |||
38 | enum class doublestatus { | ||
39 | opening, | ||
40 | closing, | ||
41 | both | ||
42 | }; | ||
43 | |||
44 | struct delimiter { | ||
45 | parentype type; | ||
46 | doublestatus status; | ||
47 | |||
48 | delimiter(parentype type, doublestatus status) : type(type), status(status) {} | ||
19 | 49 | ||
20 | token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} | 50 | bool operator<(const delimiter& other) const |
21 | token(tokentype type) : type(type), canon(""), terminating(false) {} | 51 | { |
52 | return std::tie(type, status) < std::tie(other.type, other.status); | ||
53 | } | ||
54 | }; | ||
55 | |||
56 | struct token { | ||
57 | const word& w; | ||
58 | std::map<delimiter, int> delimiters; | ||
59 | suffixtype suffix; | ||
60 | std::string raw; | ||
61 | |||
62 | token(const word& w) : w(w), suffix(suffixtype::none) {} | ||
22 | 63 | ||
23 | bool operator<(const token& other) const | 64 | bool operator<(const token& other) const |
24 | { | 65 | { |
25 | if (type != other.type) | 66 | return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix); |
26 | { | ||
27 | return type < other.type; | ||
28 | } else if (type == tokentype_literal) | ||
29 | { | ||
30 | if (canon == other.canon) | ||
31 | { | ||
32 | return !terminating && other.terminating; | ||
33 | } else { | ||
34 | return canon < other.canon; | ||
35 | } | ||
36 | } else { | ||
37 | return !terminating && other.terminating; | ||
38 | } | ||
39 | } | 67 | } |
40 | }; | 68 | }; |
41 | 69 | ||
42 | enum querytype { | 70 | enum class querytype { |
43 | querytype_literal, | 71 | literal, |
44 | querytype_sentence | 72 | sentence |
45 | }; | 73 | }; |
46 | 74 | ||
47 | struct query { | 75 | struct query { |
48 | querytype type; | 76 | querytype type; |
49 | token word; | 77 | token tok; |
50 | 78 | ||
51 | query(token word) : word(word), type(querytype_literal) {} | 79 | query(token tok) : tok(tok), type(querytype::literal) {} |
52 | 80 | ||
53 | query(querytype type) : word(""), type(type) {} | 81 | query(querytype type) : tok(blank_word), type(type) {} |
54 | 82 | ||
55 | bool operator<(const query& other) const | 83 | bool operator<(const query& other) const |
56 | { | 84 | { |
57 | if (type == other.type) | 85 | if (type == other.type) |
58 | { | 86 | { |
59 | return word < other.word; | 87 | return tok < other.tok; |
60 | } else { | 88 | } else { |
61 | return type < other.type; | 89 | return type < other.type; |
62 | } | 90 | } |
@@ -65,34 +93,11 @@ struct query { | |||
65 | 93 | ||
66 | typedef std::list<query> kgram; | 94 | typedef std::list<query> kgram; |
67 | 95 | ||
68 | struct termstats { | ||
69 | char terminator; | ||
70 | int occurrences; | ||
71 | |||
72 | termstats() : terminator('.'), occurrences(1) {} | ||
73 | |||
74 | termstats(char terminator, int occurrences) | ||
75 | { | ||
76 | this->terminator = terminator; | ||
77 | this->occurrences = occurrences; | ||
78 | } | ||
79 | |||
80 | bool operator<(const termstats& other) const | ||
81 | { | ||
82 | if (terminator == other.terminator) | ||
83 | { | ||
84 | return occurrences < other.occurrences; | ||
85 | } else { | ||
86 | return terminator < other.terminator; | ||
87 | } | ||
88 | } | ||
89 | }; | ||
90 | |||
91 | class kgramstats | 96 | class kgramstats |
92 | { | 97 | { |
93 | public: | 98 | public: |
94 | kgramstats(std::string corpus, int maxK); | 99 | kgramstats(std::string corpus, int maxK); |
95 | std::vector<std::string> randomSentence(int n); | 100 | std::string randomSentence(int n); |
96 | 101 | ||
97 | private: | 102 | private: |
98 | struct token_data | 103 | struct token_data |
@@ -100,16 +105,15 @@ private: | |||
100 | int all; | 105 | int all; |
101 | int titlecase; | 106 | int titlecase; |
102 | int uppercase; | 107 | int uppercase; |
103 | token word; | 108 | token tok; |
104 | 109 | ||
105 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} | 110 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
106 | }; | 111 | }; |
107 | 112 | ||
108 | int maxK; | 113 | int maxK; |
109 | std::map<kgram, std::map<int, token_data> > stats; | 114 | std::map<kgram, std::map<int, token_data> > stats; |
110 | malaprop mstats; | 115 | word hashtags {"#hashtag"}; |
111 | std::map<token, std::map<int, termstats> > endings; | 116 | std::map<std::string, word> words; |
112 | std::vector<std::string> hashtags; | ||
113 | }; | 117 | }; |
114 | 118 | ||
115 | void printKgram(kgram k); | 119 | void printKgram(kgram k); |