Rewrote quite a bit of kgramstats

The algorithm still treats most tokens literally, but it now groups together tokens that terminate a clause in some way (i.e., tokens that contain any of the characters . ? ! ,), without distinguishing between the different terminating characters. For each word that can terminate a sentence, the algorithm creates a histogram of the terminating characters and the number of occurrences of those characters for that word (the number of occurrences is tracked so that things like um???? and um,,,,, can still be folded down into um.). Then, when the terminating version of that token is invoked, a random terminating string is appended to the token based on the histogram for that word (again, to allow things like the desu-ly use of multiple commas to end clauses). The algorithm now also has a slightly more advanced kgram structure: a special "sentence wildcard" kgram value, which can match any terminating token, is set aside from normal strings of tokens. This kgram value is never printed; it is only ever present in the query kgrams and cannot actually be present in the histograms, because it is of a different datatype. It is used at the beginning of sentence generation to make sure that the first couple of words generated actually form the beginning of a sentence instead of picking up somewhere in the middle of one. It is also used to reset sentence generation on the rare occasion that the end of the corpus is reached.
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-04 23:16:17 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-04 23:16:17 -0500
commit: 9e89002477d1358de9be9cabdc1edba26bd32836 (patch)
tree: 9afb52740fe4f618105d014a816df26b36ed83f6
parent: 0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff)
download: rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz
rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2
rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip
5 files changed, 444 insertions, 501 deletions
diff --git a/ebooks.cpp b/ebooks.cpp
index 27591f4..a24bd8d 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp

@@ -14,174 +14,168 @@
 int main(int argc, char** args)
 {
-        srand(time(NULL));
+  srand(time(NULL));
        
-        YAML::Node config = YAML::LoadFile("config.yml");
+  YAML::Node config = YAML::LoadFile("config.yml");
-    int delay = config["delay"].as<int>();
+  int delay = config["delay"].as<int>();
-        std::ifstream infile(config["corpus"].as<std::string>().c_str());
+  std::ifstream infile(config["corpus"].as<std::string>().c_str());
-        std::string corpus;
+  std::string corpus;
-        std::string line;
+  std::string line;
-        while (getline(infile, line))
+  while (getline(infile, line))
-        {
+  {
-                corpus += line + "\n ";
+    corpus += line + "\n ";
-        }
+  }
-    std::cout << "Preprocessing corpus..." << std::endl;
+  std::cout << "Preprocessing corpus..." << std::endl;
-        kgramstats* stats = new kgramstats(corpus, 4);
+  kgramstats* stats = new kgramstats(corpus, 4);
    
-    std::cout << "Preprocessing freevars..." << std::endl;
+  std::cout << "Preprocessing freevars..." << std::endl;
-    freevars* vars = new freevars();
+  freevars* vars = new freevars();
-    vars->addVar("name", "names.txt");
+  vars->addVar("name", "names.txt");
-    vars->addVar("noun", "nouns.txt");
+  vars->addVar("noun", "nouns.txt");
-    std::cout << "Generating..." << std::endl;
+  std::cout << "Generating..." << std::endl;
-        for (;;)
+  for (;;)
-        {
+  {
-                std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
+    std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
-                std::string hi;
+    std::string hi;
-                for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
+    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-                {
-                        hi += vars->parse(*it) + " ";
-                }
-    size_t firstperiod = hi.find_first_of(".!?");
-    if (firstperiod != std::string::npos)
    {
-      hi = hi.substr(firstperiod+2);
+      hi += vars->parse(*it) + " ";
    }
-    
    hi.resize(140);
-                size_t lastperiod = hi.find_last_of(".!?");
+    size_t lastperiod = hi.find_last_of(".!?,");
-                if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
+    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-                {
+    {
-                        hi = hi.substr(0, lastperiod+1);
+      hi = hi.substr(0, lastperiod+1);
-                }
+    }
-            twitCurl twitterObj;
+    twitCurl twitterObj;
-            std::string tmpStr, tmpStr2;
+    std::string tmpStr, tmpStr2;
-            std::string replyMsg;
+    std::string replyMsg;
-            char tmpBuf[1024];
+    char tmpBuf[1024];
-                std::string username(config["username"].as<std::string>());
+    std::string username(config["username"].as<std::string>());
-                std::string password(config["password"].as<std::string>());
+    std::string password(config["password"].as<std::string>());
-            /* Set twitter username and password */
+    /* Set twitter username and password */
-            twitterObj.setTwitterUsername(username);
+    twitterObj.setTwitterUsername(username);
-            twitterObj.setTwitterPassword(password);
+    twitterObj.setTwitterPassword(password);
        
-            /* OAuth flow begins */
+    /* OAuth flow begins */
-            /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */
+    /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */
-            twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() );
+    twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() );
-            twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() );
+    twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() );
-            /* Step 1: Check if we alredy have OAuth access token from a previous run */
+    /* Step 1: Check if we alredy have OAuth access token from a previous run */
-            std::string myOAuthAccessTokenKey("");
+    std::string myOAuthAccessTokenKey("");
-            std::string myOAuthAccessTokenSecret("");
+    std::string myOAuthAccessTokenSecret("");
-            std::ifstream oAuthTokenKeyIn;
+    std::ifstream oAuthTokenKeyIn;
-            std::ifstream oAuthTokenSecretIn;
+    std::ifstream oAuthTokenSecretIn;
-            oAuthTokenKeyIn.open( "twitterClient_token_key.txt" );
+    oAuthTokenKeyIn.open( "twitterClient_token_key.txt" );
-            oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" );
+    oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" );
-            memset( tmpBuf, 0, 1024 );
+    memset( tmpBuf, 0, 1024 );
-            oAuthTokenKeyIn >> tmpBuf;
+    oAuthTokenKeyIn >> tmpBuf;
-            myOAuthAccessTokenKey = tmpBuf;
+    myOAuthAccessTokenKey = tmpBuf;
-            memset( tmpBuf, 0, 1024 );
+    memset( tmpBuf, 0, 1024 );
-            oAuthTokenSecretIn >> tmpBuf;
+    oAuthTokenSecretIn >> tmpBuf;
-            myOAuthAccessTokenSecret = tmpBuf;
+    myOAuthAccessTokenSecret = tmpBuf;
-            oAuthTokenKeyIn.close();
+    oAuthTokenKeyIn.close();
-            oAuthTokenSecretIn.close();
+    oAuthTokenSecretIn.close();
-            if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() )
+    if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() )
-            {
+    {
-                /* If we already have these keys, then no need to go through auth again */
+      /* If we already have these keys, then no need to go through auth again */
-                printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() );
+      printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() );
-                twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey );
+      twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey );
-                twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret );
+      twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret );
-            }
+    }
-            else
+    else
-            {
+    {
-                /* Step 2: Get request token key and secret */
+      /* Step 2: Get request token key and secret */
-                std::string authUrl;
+      std::string authUrl;
-                twitterObj.oAuthRequestToken( authUrl );
+      twitterObj.oAuthRequestToken( authUrl );
-                /* Step 3: Get PIN  */
+      /* Step 3: Get PIN  */
-                memset( tmpBuf, 0, 1024 );
+      memset( tmpBuf, 0, 1024 );
-                printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " );
+      printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " );
-                gets( tmpBuf );
+      gets( tmpBuf );
-                tmpStr = tmpBuf;
+      tmpStr = tmpBuf;
-                if( std::string::npos != tmpStr.find( "1" ) )
+      if( std::string::npos != tmpStr.find( "1" ) )
-                {
+      {
-                    /* Ask user to visit twitter.com auth page and get PIN */
+        /* Ask user to visit twitter.com auth page and get PIN */
-                    memset( tmpBuf, 0, 1024 );
+        memset( tmpBuf, 0, 1024 );
-                    printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() );
+        printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() );
-                    printf( "\nEnter the PIN provided by twitter: " );
+        printf( "\nEnter the PIN provided by twitter: " );
-                    gets( tmpBuf );
+        gets( tmpBuf );
-                    tmpStr = tmpBuf;
+        tmpStr = tmpBuf;
-                    twitterObj.getOAuth().setOAuthPin( tmpStr );
+        twitterObj.getOAuth().setOAuthPin( tmpStr );
-                }
+      }
-                else
+      else
-                {
+      {
-                    /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */
+        /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */
-                    twitterObj.oAuthHandlePIN( authUrl );
+        twitterObj.oAuthHandlePIN( authUrl );
-                }
+      }
-                /* Step 4: Exchange request token with access token */
+      /* Step 4: Exchange request token with access token */
-                twitterObj.oAuthAccessToken();
+      twitterObj.oAuthAccessToken();
-                /* Step 5: Now, save this access token key and secret for future use without PIN */
+      /* Step 5: Now, save this access token key and secret for future use without PIN */
-                twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey );
+      twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey );
-                twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret );
+      twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret );
-                /* Step 6: Save these keys in a file or wherever */
+      /* Step 6: Save these keys in a file or wherever */
-                std::ofstream oAuthTokenKeyOut;
+      std::ofstream oAuthTokenKeyOut;
-                std::ofstream oAuthTokenSecretOut;
+      std::ofstream oAuthTokenSecretOut;
-                oAuthTokenKeyOut.open( "twitterClient_token_key.txt" );
+      oAuthTokenKeyOut.open( "twitterClient_token_key.txt" );
-                oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" );
+      oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" );
-                oAuthTokenKeyOut.clear();
+      oAuthTokenKeyOut.clear();
-                oAuthTokenSecretOut.clear();
+      oAuthTokenSecretOut.clear();
-                oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str();
+      oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str();
-                oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str();
+      oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str();
-                oAuthTokenKeyOut.close();
+      oAuthTokenKeyOut.close();
-                oAuthTokenSecretOut.close();
+      oAuthTokenSecretOut.close();
-            }
+    }
-            /* OAuth flow ends */
+    /* OAuth flow ends */
-            /* Account credentials verification */
+    /* Account credentials verification */
-            if( twitterObj.accountVerifyCredGet() )
+    if( twitterObj.accountVerifyCredGet() )
-            {
+    {
-                twitterObj.getLastWebResponse( replyMsg );
+      twitterObj.getLastWebResponse( replyMsg );
-                printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() );
+      printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() );
-            }
+    }
-            else
+    else
-            {
+    {
-                twitterObj.getLastCurlError( replyMsg );
+      twitterObj.getLastCurlError( replyMsg );
-                printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() );
+      printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() );
-            }
+    }
        
-            /* Post a new status message */
+    /* Post a new status message */
-            replyMsg = "";
+    replyMsg = "";
-            if( twitterObj.statusUpdate( hi ) )
+    if( twitterObj.statusUpdate( hi ) )
-            {
+    {
-                twitterObj.getLastWebResponse( replyMsg );
+      twitterObj.getLastWebResponse( replyMsg );
-                printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() );
+      printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() );
-            }
+    }
-            else
+    else
-            {
+    {
-                twitterObj.getLastCurlError( replyMsg );
+      twitterObj.getLastCurlError( replyMsg );
-                printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() );
+      printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() );
-            }
+    }
-                sleep(rand() % delay);
+    sleep(rand() % delay);
-        }
+  }
        
-        return 0;
+  return 0;
 }
diff --git a/gen.cpp b/gen.cpp
index 7e47d45..400c0a5 100644
--- a/gen.cpp
+++ b/gen.cpp

@@ -11,72 +11,66 @@
 int main(int argc, char** args)
 {
-        srand(time(NULL));
+  srand(time(NULL));
    
-    if (argc == 1)
+  if (argc == 1)
-    {
+  {
-        std::cout << "rawr-gen, version 1.0" << std::endl;
+    std::cout << "rawr-gen, version 1.0" << std::endl;
-        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
+    std::cout << "Usage: rawr-gen corpus-file" << std::endl;
-        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
+    std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
        
-        return 0;
+    return 0;
-    }
+  }
    
-        std::ifstream infile(args[1]);
+  std::ifstream infile(args[1]);
-    if (!infile)
+  if (!infile)
-    {
+  {
-        std::cout << "rawr-gen, version 1.0" << std::endl;
+    std::cout << "rawr-gen, version 1.0" << std::endl;
-        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
+    std::cout << "Usage: rawr-gen corpus-file" << std::endl;
-        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
+    std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
-        std::cout << std::endl;
+    std::cout << std::endl;
-        std::cout << "The file you specified does not exist." << std::endl;
+    std::cout << "The file you specified does not exist." << std::endl;
        
-        return 0;
+    return 0;
-    }
+  }
    
-        std::string corpus;
+  std::string corpus;
-        std::string line;
+  std::string line;
-        while (getline(infile, line))
+  while (getline(infile, line))
-        {
+  {
-                corpus += line + "\n ";
+    corpus += line + "\n ";
-        }
+  }
        
-    std::cout << "Preprocessing corpus..." << std::endl;
+  std::cout << "Preprocessing corpus..." << std::endl;
-        kgramstats* stats = new kgramstats(corpus, 4);
+  kgramstats* stats = new kgramstats(corpus, 4);
    
-    std::cout << "Preprocessing freevars..." << std::endl;
+  std::cout << "Preprocessing freevars..." << std::endl;
-    freevars* vars = new freevars();
+  freevars* vars = new freevars();
-    vars->addVar("name", "names.txt");
+  vars->addVar("name", "names.txt");
-    vars->addVar("noun", "nouns.txt");
+  vars->addVar("noun", "nouns.txt");
    
-    std::cout << "Generating..." << std::endl;
+  std::cout << "Generating..." << std::endl;
-        for (;;)
+  for (;;)
-        {
+  {
-                std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45);
+    std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
-                std::string hi;
+    std::string hi;
-                for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
+    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-                {
-                        hi += vars->parse(*it) + " ";
-                }
-    
-    size_t firstperiod = hi.find_first_of(".!?");
-    if (firstperiod != std::string::npos)
    {
-      hi = hi.substr(firstperiod+2);
+      hi += vars->parse(*it) + " ";
    }
    
    hi.resize(140);
-                size_t lastperiod = hi.find_last_of(".!?");
+    size_t lastperiod = hi.find_last_of(".!?,");
-                if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
+    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-                {
+    {
-                        hi = hi.substr(0, lastperiod+1);
+      hi = hi.substr(0, lastperiod+1);
-                }
+    }
-                std::cout << hi << std::endl;
+    std::cout << hi << std::endl;
                
-        getc(stdin);
+    getc(stdin);
-        }
+  }
        
-        return 0;
+  return 0;
 }
diff --git a/kgramstats.cpp b/kgramstats.cpp
index b0ec68a..c88d83c 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -5,237 +5,176 @@
 #include <algorithm>
 #include "malaprop.h"
+query wildcardQuery(querytype_sentence);
 std::string canonize(std::string f);
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
 kgramstats::kgramstats(std::string corpus, int maxK)
 {
-        this->maxK = maxK;
+  this->maxK = maxK;
  
  std::vector<std::string> tokens;
-    size_t start = 0;
+  size_t start = 0;
-        int end = 0;
+  int end = 0;
-        while (end != std::string::npos)
+  while (end != std::string::npos)
-        {
+  {
-           end = corpus.find(" ", start);
+    end = corpus.find(" ", start);
-       std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+    std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-       if (token[token.length()-1] == '\n')
+    if (token[token.length()-1] == '\n')
-       {
+    {
-         if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?'))
+      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
-         {
+      {
-           token.insert(token.length()-1, ".");
+        token.insert(token.length()-1, ".");
-         }
+      }
         
-         token.resize(token.length()-1);
+      token.resize(token.length()-1);
-       }
+    }
       
-       if (token.compare("") && token.compare("."))
+    if (token.compare("") && token.compare("."))
-       {
+    {
-         mstats.addWord(token);
+      mstats.addWord(token);
-           tokens.push_back(token);
+      tokens.push_back(token);
-       }
+    }
-           start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
+    start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
-        }
+  }
        
-        std::map<kgram, std::map<std::string, token_data*>* > tstats;
+  std::map<kgram, std::map<token, token_data> > tstats;
-  bool newSentence = true;
+  std::map<token, std::map<termstats, int> > tendings;
-  bool newClause = false;
+  for (int k=1; k<maxK; k++)
-        for (int k=0; k<maxK; k++)
+  {
-        {
+    for (int i=0; i<(tokens.size() - k); i++)
-                for (int i=0; i<(tokens.size() - k); i++)
+    {
-                {
+      std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
-                        kgram seq(tokens.begin()+i, tokens.begin()+i+k);
+      kgram prefix;
-                        std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
-                        std::string f = tokens[i+k];
-                        
-      
-      
-      std::string canonical = canonize(f);
-                        
-                        if (tstats[seq] == NULL)
-                        {
-                                tstats[seq] = new std::map<std::string, token_data*>();
-                        }
-                        
-                        if ((*tstats[seq])[canonical] == NULL)
-                        {
-                                (*tstats[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data));
-                        }
-                        token_data* td = tstats[seq]->at(canonical);
-                        td->token = new std::string(canonical);
-                        td->all++;
      
-      /*if (newSentence)
+      for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
      {
-        kgram newKgram(1, ".");
+        token word(canonize(*it));
-        if (tstats[newKgram] == NULL)
+        
+        if (it->find_first_of(".?!,") != std::string::npos)
        {
-          tstats[newKgram] = new std::map<std::string, token_data*>();
+          word.terminating = true;
        }
        
-        (*tstats[newKgram])[canonical] = td;
+        prefix.push_back(word);
-        
-        newSentence = false;
      }
      
-      if (newClause)
+      std::string f = tokens[i+k];
+                  std::string canonical = canonize(f);
+      
+      token word(canonical);
+      if (f.find_first_of(".?!,") != std::string::npos)
      {
-        kgram commaKgram(1, ",");
+        word.terminating = true;
-        if (tstats[commaKgram] == NULL)
-        {
-          tstats[commaKgram] = new std::map<std::string, token_data*>();
-        }
        
-        (*tstats[commaKgram])[canonical] = td;
+        char terminator = f[f.find_last_of(".?!,")];
+        int occurrences = std::count(f.begin(), f.end(), terminator);
        
-        newClause = false;
+        tendings[word][termstats(terminator, occurrences)]++;
-      }
-      
-      if ((f.length() > 0) && (f[f.length()-1] == '\n'))
-      {
-        td->period++;
-        newSentence = true;
-        f.resize(f.length()-1);
      }
                        
-      if (f.length() > 0)
+      token_data& td = tstats[prefix][word];
+      td.word = word;
+      td.all++;
+      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
      {
-                        if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
+        td.uppercase++;
-                        {
+      } else if (isupper(f[0]))
-          if (!newSentence)
+      {
-          {
+        td.titlecase++;
-                                td->period++;
-            newSentence = true;
-          }
-                                
-          f.resize(f.length()-1);
-                        } else if (f[f.length()-1] == ',')
-        {
-          if (!newSentence)
-          {
-            td->comma++;
-            newClause = true;
-          }
-          
-          f.resize(f.length()-1);
-        }
      }
      
-      if (f.length() > 0)
+      if (prefix.front().word.terminating)
      {
-        if (f[0] == '"')
+        prefix.front() = wildcardQuery;
-        {
-          td->startquote++;
-        }
        
-        if (f[0] == '(')
+        token_data& td2 = tstats[prefix][word];
+        td2.word = word;
+        td2.all++;
+        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
        {
-          td->startparen++;
+          td2.uppercase++;
-        }
+        } else if (isupper(f[0]))
-        
-        if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')'))
        {
-          if (f[f.length()-1] == '"')
+          td2.titlecase++;
-          {
-            td->endquote++;
-          } else if (f[f.length()-1] == ')')
-          {
-            td->endparen++;
-          }
-          
-          f.resize(f.length()-1);
-          
-          if (f.length() > 0)
-          {
-                        if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
-                        {
-              if (!newSentence)
-              {
-                                        td->period++;
-                newSentence = true;
-              }
-                        } else if (f[f.length()-1] == ',')
-            {
-              if (!newSentence && !newClause)
-              {
-                td->comma++;
-                newClause = true;
-              }
-            }
-          }
-        }
-      }*/
-                        
-                        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
-                        {
-                                td->uppercase++;
-                        } else if (isupper(f[0]))
-                        {
-                                td->titlecase++;
-                        }
-      
-      /*if (k != 0)
-      {
-        if (newSentence)
-        {
-          i += k;
        }
-        
+      }
-        newSentence = false;
+    }
-        newClause = false;
+  }
-      }*/
-                }
-        }
        
-        stats = new std::map<kgram, std::map<int, token_data*>* >();
+  for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
-        for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
+  {
-        {
+    kgram klist = it->first;
-                kgram klist = it->first;
+    std::map<token, token_data>& probtable = it->second;
-                std::map<std::string, token_data*>* probtable = it->second;
+    std::map<int, token_data>& distribution = stats[klist];
-                std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
+    int max = 0;
-        int max = 0;
                
-                for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
+    for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
-                {
+    {
-                        max += kt->second->all;
+      max += kt->second.all;
                        
-                        (*distribution)[max] = kt->second;
+      distribution[max] = kt->second;
-                }
+    }
-                
+  }
-                (*stats)[klist] = distribution;
+  
-        }
+  for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
+  {
+    token word = it->first;
+    std::map<termstats, int>& probtable = it->second;
+    std::map<int, termstats>& distribution = endings[word];
+    int max = 0;
+    
+    for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
+    {
+      max += kt->second;
+      
+      distribution[max] = kt->first;
+    }
+  }
 }
 void printKgram(kgram k)
 {
-        for (kgram::iterator it = k.begin(); it != k.end(); it++)
+  for (kgram::iterator it = k.begin(); it != k.end(); it++)
-        {
+  {
-                std::cout << *it << " ";
+    query& q = *it;
-        }
+    if (q.type == querytype_sentence)
+    {
+      std::cout << "#.# ";
+    } else if (q.type == querytype_literal)
+    {
+      if (q.word.terminating)
+      {
+        std::cout << q.word.canon << ". ";
+      } else {
+        std::cout << q.word.canon << " ";
+      }
+    }
+  }
 }
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
 std::vector<std::string> kgramstats::randomSentence(int n)
 {
-        std::vector<std::string> result;
+  std::vector<std::string> result;
-  kgram newKgram(1, ".");
+  kgram cur(1, wildcardQuery);
-  kgram commaKgram(1, ",");
-        std::list<std::string> cur;
  int cuts = 0;
        
-        for (int i=0; i<n; i++)
+  for (int i=0; i<n; i++)
-        {
+  {
-                if (cur.size() == maxK)
+    if (cur.size() == maxK)
-                {
+    {
-                        cur.pop_front();
+      cur.pop_front();
-                }
+    }
    
-    if ((cur.size() > 0) && (cur != newKgram))
+    if (cur.size() > 0)
    {
      if (rand() % (maxK - cur.size() + 1) == 0)
      {
@@ -253,20 +192,19 @@ std::vector<std::string> kgramstats::randomSentence(int n)
      
      cuts++;
    }
+    
+    // Gotta circumvent the last line of the input corpus
+    // https://twitter.com/starla4444/status/684222271339237376
+    if (stats.count(cur) == 0)
+    {
+      cur = kgram(1, wildcardQuery);
+    }
-                std::map<int, token_data*> distribution = *(*stats)[cur];
+    std::map<int, token_data>& distribution = stats[cur];
-                int max = distribution.rbegin()->first;
+    int max = distribution.rbegin()->first;
-                int r = rand() % max;
+    int r = rand() % max;
-                token_data* next = distribution.upper_bound(r)->second;
+    token_data& next = distribution.upper_bound(r)->second;
+    std::string nextToken(next.word.canon);
-                std::string nextToken(*(next->token));
-                int casing = rand() % next->all;
-                /*int period = rand() % next->all;
-    int startparen = rand() % next->all;
-    int endparen = rand() % next->all;
-    int startquote = rand() % next->all;
-    int endquote = rand() % next->all;
-    int comma = rand() % next->all;*/
    
    bool mess = (rand() % 100) == 0;
    if (mess)
@@ -274,114 +212,64 @@ std::vector<std::string> kgramstats::randomSentence(int n)
      nextToken = mstats.alternate(nextToken);
    }
    
-                if (casing < next->uppercase)
+    // Determine the casing of the next token. We randomly make the token all
-                {
+    // caps based on the markov chain. Otherwise, we check if the previous
-                        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    // token is the end of a sentence (terminating token or a wildcard query).
-                }
+    int casing = rand() % next.all;
-    
+    if (casing < next.uppercase)
-    if ((cur == newKgram) && (rand() % 15 > 0))
+    {
+      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    } else if ((((cur.rbegin()->type == querytype_sentence)
+          || ((cur.rbegin()->type == querytype_literal)
+            && (cur.rbegin()->word.terminating)))
+        && (rand() % 2 > 0))
+      || (casing - next.uppercase < next.titlecase))
    {
      nextToken[0] = toupper(nextToken[0]);
    }
-    /*if (startquote < next->startquote)
+    if (next.word.terminating)
-    {
-      nextToken = "\"" + nextToken;
-    } else if (startparen < next->startparen)
    {
-      nextToken = "(" + nextToken;
+      std::map<int, termstats>& ending = endings[next.word];
+      int emax = ending.rbegin()->first;
+      int er = rand() % emax;
+      termstats& nextend = ending.upper_bound(er)->second;
+      
+      nextToken.append(std::string(nextend.occurrences, nextend.terminator));
    }
-        
-                if (period < next->period)
-                {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
-      {
-        nextToken += ")";
-      }
-    
-      int type = rand() % 6;
-    
-      if (type < 3)
-      {
-        nextToken += ".";
-      } else if (type < 5)
-      {
-        nextToken += "!";
-      } else {
-        nextToken += "?";
-      }
-                } else if (comma < next->comma)
-    {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
-      {
-        nextToken += ")";
-      }
-    
-      nextToken += ",";
-    }*/
                
-                /* DEBUG */
+    /* DEBUG */
-                for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
+    printKgram(cur);
-                {
-                        std::cout << *it << " ";
-                }
                
-                std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
+    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
    
    if (mess)
    {
-      std::cout << " mala " << *(next->token);
+      std::cout << " mala " << next.word.canon;
    }
    
    std::cout << std::endl;
-    
-    /*if ((cur == newKgram) || (cur == commaKgram))
+    cur.push_back(next.word);
-    {
-      cur.pop_front();
-    }
-                
-    if (period < next->period)// && ((rand() % 3) != 0))
-    {
-      cur = newKgram;
-    } else if ((comma < next->comma) && ((rand() % 3) == 0))
-    {
-      cur = commaKgram;
-    } else {*/
-      //if (mess && (rand() % 2 == 0))
-      if (false)
-      {
-        // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
-        cur.clear();
-        cur.push_back(nextToken);
-      } else {
-        cur.push_back(*(next->token));
-      }
-      //}
                
-                result.push_back(nextToken);
+    result.push_back(nextToken);
-        }
+  }
        
-        return result;
+  return result;
 }
 bool removeIf(char c)
 {
-  return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n'));
+  // Predicate handed to remove_copy_if by canonize: true for the characters
+  // that get stripped from a token. Only clause-ending punctuation counts;
+  // quotes, parentheses and newlines (stripped by the previous version) are
+  // kept.
+  switch (c)
+  {
+    case '.':
+    case '?':
+    case '!':
+    case ',':
+      return true;
+    
+    default:
+      return false;
+  }
 }
 std::string canonize(std::string f)
 {
-        std::string canonical(f);
+  std::string canonical(f);
-        std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
+  std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
  
  std::string result;
  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
        
-        return canonical;
+  return result;
 }
diff --git a/kgramstats.h b/kgramstats.h
index b01dece..ca61df7 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -7,7 +7,71 @@
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
-typedef std::list<std::string> kgram;
+// A word in canonical form, plus a flag saying whether this occurrence ends a
+// clause (randomSentence appends a terminator string to terminating tokens).
+struct token {
+  std::string canon;
+  bool terminating;
+  
+  // A freshly built token is non-terminating.
+  token(std::string canon) : canon(canon), terminating(false) {}
+  
+  // Orders by canonical text first; for equal text the non-terminating token
+  // sorts before the terminating one.
+  bool operator<(const token& other) const
+  {
+    if (canon != other.canon)
+    {
+      return canon < other.canon;
+    }
+    
+    return other.terminating && !terminating;
+  }
+};
+enum querytype {
+  querytype_literal,  // query for one specific token (what query(token) produces)
+  querytype_sentence  // sentence wildcard; randomSentence treats it like a sentence end when casing the next token
+};
+// One element of a kgram: either a literal token or a non-literal query type
+// such as the sentence wildcard.
+struct query {
+  querytype type;
+  token word;
+  
+  // Literal query for a specific token. Left non-explicit on purpose: callers
+  // rely on the implicit token -> query conversion (e.g. pushing a token onto
+  // a kgram). Initializers are listed in declaration order (type, then word),
+  // which is the order the members are actually initialized in.
+  query(token word) : type(querytype_literal), word(word) {}
+  
+  // Query of the given type; word is set to an empty token.
+  query(querytype type) : type(type), word("") {}
+  
+  // Orders by query type first, then by token within the same type.
+  bool operator<(const query& other) const
+  {
+    if (type == other.type)
+    {
+      return word < other.word;
+    } else {
+      return type < other.type;
+    }
+  }
+};
+typedef std::list<query> kgram;
+// One way a word can end: a terminator character and how many times it is
+// repeated (randomSentence appends `occurrences` copies of `terminator`).
+struct termstats {
+  char terminator;
+  int occurrences;
+  
+  // Default ending: a single period.
+  termstats() : terminator('.'), occurrences(1) {}
+  
+  termstats(char terminator, int occurrences)
+    : terminator(terminator), occurrences(occurrences) {}
+  
+  // Orders by terminator character, then by repeat count.
+  bool operator<(const termstats& other) const
+  {
+    if (terminator != other.terminator)
+    {
+      return terminator < other.terminator;
+    }
+    
+    return occurrences < other.occurrences;
+  }
+};
 class kgramstats
 {
@@ -16,22 +80,20 @@ public:
        std::vector<std::string> randomSentence(int n);
        
 private:
-        typedef struct
+        struct token_data
        {
                int all;
                int titlecase;
                int uppercase;
-                int period;
+    token word;
-    int startquote;
+    
-    int endquote;
+    token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
-    int startparen;
+        };
-    int endparen;
+  
-    int comma;
-                std::string* token;
-        } token_data;
        int maxK;
-        std::map<kgram, std::map<int, token_data*>* >* stats;
+        std::map<kgram, std::map<int, token_data> > stats;
  malaprop mstats;
+  std::map<token, std::map<int, termstats> > endings;
 };
 void printKgram(kgram k);
diff --git a/malaprop.cpp b/malaprop.cpp
index 7fbdb6c..ccdd4c4 100644
--- a/malaprop.cpp
+++ b/malaprop.cpp

@@ -117,6 +117,11 @@ std::string malaprop::alternate(std::string word)
 {
  soundex ex = soundify(word);
  std::set<std::string>& opts = dict[ex];
+  if (opts.size() == 0)
+  {
+    return word;
+  }
+  
  int opt = rand() % opts.size();
  for (std::set<std::string>::iterator it = opts.begin(); it != opts.end(); it++)
  {
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-04 23:16:17 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-04 23:16:17 -0500
commit	9e89002477d1358de9be9cabdc1edba26bd32836 (patch)
tree	9afb52740fe4f618105d014a816df26b36ed83f6
parent	0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff)
download	rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2 rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip