Merge branch 'master' of https://github.com/hatkirby/rawr-ebooks

Conflicts: malaprop.cpp
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-04 23:29:12 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-04 23:29:12 -0500
commit: 53102431c2dc7266a322223f84e286a9aa7c0729 (patch)
tree: 0c9347a6c2b7b2c7d55a5f5fb681e474867046fd
parent: a28dc579f3a0cd53850d5eb10565c27a92d27c55 (diff)
parent: 9e89002477d1358de9be9cabdc1edba26bd32836 (diff)
download: rawr-ebooks-53102431c2dc7266a322223f84e286a9aa7c0729.tar.gz
rawr-ebooks-53102431c2dc7266a322223f84e286a9aa7c0729.tar.bz2
rawr-ebooks-53102431c2dc7266a322223f84e286a9aa7c0729.zip
5 files changed, 439 insertions, 502 deletions
diff --git a/ebooks.cpp b/ebooks.cpp
index 27591f4..a24bd8d 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp

@@ -14,174 +14,168 @@
 int main(int argc, char** args)
 {
-        srand(time(NULL));
+  srand(time(NULL));
        
-        YAML::Node config = YAML::LoadFile("config.yml");
+  YAML::Node config = YAML::LoadFile("config.yml");
-    int delay = config["delay"].as<int>();
+  int delay = config["delay"].as<int>();
-        std::ifstream infile(config["corpus"].as<std::string>().c_str());
+  std::ifstream infile(config["corpus"].as<std::string>().c_str());
-        std::string corpus;
+  std::string corpus;
-        std::string line;
+  std::string line;
-        while (getline(infile, line))
+  while (getline(infile, line))
-        {
+  {
-                corpus += line + "\n ";
+    corpus += line + "\n ";
-        }
+  }
-    std::cout << "Preprocessing corpus..." << std::endl;
+  std::cout << "Preprocessing corpus..." << std::endl;
-        kgramstats* stats = new kgramstats(corpus, 4);
+  kgramstats* stats = new kgramstats(corpus, 4);
    
-    std::cout << "Preprocessing freevars..." << std::endl;
+  std::cout << "Preprocessing freevars..." << std::endl;
-    freevars* vars = new freevars();
+  freevars* vars = new freevars();
-    vars->addVar("name", "names.txt");
+  vars->addVar("name", "names.txt");
-    vars->addVar("noun", "nouns.txt");
+  vars->addVar("noun", "nouns.txt");
-    std::cout << "Generating..." << std::endl;
+  std::cout << "Generating..." << std::endl;
-        for (;;)
+  for (;;)
-        {
+  {
-                std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
+    std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
-                std::string hi;
+    std::string hi;
-                for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
+    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-                {
-                        hi += vars->parse(*it) + " ";
-                }
-    size_t firstperiod = hi.find_first_of(".!?");
-    if (firstperiod != std::string::npos)
    {
-      hi = hi.substr(firstperiod+2);
+      hi += vars->parse(*it) + " ";
    }
-    
    hi.resize(140);
-                size_t lastperiod = hi.find_last_of(".!?");
+    size_t lastperiod = hi.find_last_of(".!?,");
-                if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
+    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-                {
+    {
-                        hi = hi.substr(0, lastperiod+1);
+      hi = hi.substr(0, lastperiod+1);
-                }
+    }
-            twitCurl twitterObj;
+    twitCurl twitterObj;
-            std::string tmpStr, tmpStr2;
+    std::string tmpStr, tmpStr2;
-            std::string replyMsg;
+    std::string replyMsg;
-            char tmpBuf[1024];
+    char tmpBuf[1024];
-                std::string username(config["username"].as<std::string>());
+    std::string username(config["username"].as<std::string>());
-                std::string password(config["password"].as<std::string>());
+    std::string password(config["password"].as<std::string>());
-            /* Set twitter username and password */
+    /* Set twitter username and password */
-            twitterObj.setTwitterUsername(username);
+    twitterObj.setTwitterUsername(username);
-            twitterObj.setTwitterPassword(password);
+    twitterObj.setTwitterPassword(password);
        
-            /* OAuth flow begins */
+    /* OAuth flow begins */
-            /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */
+    /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */
-            twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() );
+    twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() );
-            twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() );
+    twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() );
-            /* Step 1: Check if we alredy have OAuth access token from a previous run */
+    /* Step 1: Check if we alredy have OAuth access token from a previous run */
-            std::string myOAuthAccessTokenKey("");
+    std::string myOAuthAccessTokenKey("");
-            std::string myOAuthAccessTokenSecret("");
+    std::string myOAuthAccessTokenSecret("");
-            std::ifstream oAuthTokenKeyIn;
+    std::ifstream oAuthTokenKeyIn;
-            std::ifstream oAuthTokenSecretIn;
+    std::ifstream oAuthTokenSecretIn;
-            oAuthTokenKeyIn.open( "twitterClient_token_key.txt" );
+    oAuthTokenKeyIn.open( "twitterClient_token_key.txt" );
-            oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" );
+    oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" );
-            memset( tmpBuf, 0, 1024 );
+    memset( tmpBuf, 0, 1024 );
-            oAuthTokenKeyIn >> tmpBuf;
+    oAuthTokenKeyIn >> tmpBuf;
-            myOAuthAccessTokenKey = tmpBuf;
+    myOAuthAccessTokenKey = tmpBuf;
-            memset( tmpBuf, 0, 1024 );
+    memset( tmpBuf, 0, 1024 );
-            oAuthTokenSecretIn >> tmpBuf;
+    oAuthTokenSecretIn >> tmpBuf;
-            myOAuthAccessTokenSecret = tmpBuf;
+    myOAuthAccessTokenSecret = tmpBuf;
-            oAuthTokenKeyIn.close();
+    oAuthTokenKeyIn.close();
-            oAuthTokenSecretIn.close();
+    oAuthTokenSecretIn.close();
-            if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() )
+    if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() )
-            {
+    {
-                /* If we already have these keys, then no need to go through auth again */
+      /* If we already have these keys, then no need to go through auth again */
-                printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() );
+      printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() );
-                twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey );
+      twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey );
-                twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret );
+      twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret );
-            }
+    }
-            else
+    else
-            {
+    {
-                /* Step 2: Get request token key and secret */
+      /* Step 2: Get request token key and secret */
-                std::string authUrl;
+      std::string authUrl;
-                twitterObj.oAuthRequestToken( authUrl );
+      twitterObj.oAuthRequestToken( authUrl );
-                /* Step 3: Get PIN  */
+      /* Step 3: Get PIN  */
-                memset( tmpBuf, 0, 1024 );
+      memset( tmpBuf, 0, 1024 );
-                printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " );
+      printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " );
-                gets( tmpBuf );
+      gets( tmpBuf );
-                tmpStr = tmpBuf;
+      tmpStr = tmpBuf;
-                if( std::string::npos != tmpStr.find( "1" ) )
+      if( std::string::npos != tmpStr.find( "1" ) )
-                {
+      {
-                    /* Ask user to visit twitter.com auth page and get PIN */
+        /* Ask user to visit twitter.com auth page and get PIN */
-                    memset( tmpBuf, 0, 1024 );
+        memset( tmpBuf, 0, 1024 );
-                    printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() );
+        printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() );
-                    printf( "\nEnter the PIN provided by twitter: " );
+        printf( "\nEnter the PIN provided by twitter: " );
-                    gets( tmpBuf );
+        gets( tmpBuf );
-                    tmpStr = tmpBuf;
+        tmpStr = tmpBuf;
-                    twitterObj.getOAuth().setOAuthPin( tmpStr );
+        twitterObj.getOAuth().setOAuthPin( tmpStr );
-                }
+      }
-                else
+      else
-                {
+      {
-                    /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */
+        /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */
-                    twitterObj.oAuthHandlePIN( authUrl );
+        twitterObj.oAuthHandlePIN( authUrl );
-                }
+      }
-                /* Step 4: Exchange request token with access token */
+      /* Step 4: Exchange request token with access token */
-                twitterObj.oAuthAccessToken();
+      twitterObj.oAuthAccessToken();
-                /* Step 5: Now, save this access token key and secret for future use without PIN */
+      /* Step 5: Now, save this access token key and secret for future use without PIN */
-                twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey );
+      twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey );
-                twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret );
+      twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret );
-                /* Step 6: Save these keys in a file or wherever */
+      /* Step 6: Save these keys in a file or wherever */
-                std::ofstream oAuthTokenKeyOut;
+      std::ofstream oAuthTokenKeyOut;
-                std::ofstream oAuthTokenSecretOut;
+      std::ofstream oAuthTokenSecretOut;
-                oAuthTokenKeyOut.open( "twitterClient_token_key.txt" );
+      oAuthTokenKeyOut.open( "twitterClient_token_key.txt" );
-                oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" );
+      oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" );
-                oAuthTokenKeyOut.clear();
+      oAuthTokenKeyOut.clear();
-                oAuthTokenSecretOut.clear();
+      oAuthTokenSecretOut.clear();
-                oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str();
+      oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str();
-                oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str();
+      oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str();
-                oAuthTokenKeyOut.close();
+      oAuthTokenKeyOut.close();
-                oAuthTokenSecretOut.close();
+      oAuthTokenSecretOut.close();
-            }
+    }
-            /* OAuth flow ends */
+    /* OAuth flow ends */
-            /* Account credentials verification */
+    /* Account credentials verification */
-            if( twitterObj.accountVerifyCredGet() )
+    if( twitterObj.accountVerifyCredGet() )
-            {
+    {
-                twitterObj.getLastWebResponse( replyMsg );
+      twitterObj.getLastWebResponse( replyMsg );
-                printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() );
+      printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() );
-            }
+    }
-            else
+    else
-            {
+    {
-                twitterObj.getLastCurlError( replyMsg );
+      twitterObj.getLastCurlError( replyMsg );
-                printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() );
+      printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() );
-            }
+    }
        
-            /* Post a new status message */
+    /* Post a new status message */
-            replyMsg = "";
+    replyMsg = "";
-            if( twitterObj.statusUpdate( hi ) )
+    if( twitterObj.statusUpdate( hi ) )
-            {
+    {
-                twitterObj.getLastWebResponse( replyMsg );
+      twitterObj.getLastWebResponse( replyMsg );
-                printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() );
+      printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() );
-            }
+    }
-            else
+    else
-            {
+    {
-                twitterObj.getLastCurlError( replyMsg );
+      twitterObj.getLastCurlError( replyMsg );
-                printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() );
+      printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() );
-            }
+    }
-                sleep(rand() % delay);
+    sleep(rand() % delay);
-        }
+  }
        
-        return 0;
+  return 0;
 }
diff --git a/gen.cpp b/gen.cpp
index 7e47d45..400c0a5 100644
--- a/gen.cpp
+++ b/gen.cpp

@@ -11,72 +11,66 @@
 int main(int argc, char** args)
 {
-        srand(time(NULL));
+  srand(time(NULL));
    
-    if (argc == 1)
+  if (argc == 1)
-    {
+  {
-        std::cout << "rawr-gen, version 1.0" << std::endl;
+    std::cout << "rawr-gen, version 1.0" << std::endl;
-        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
+    std::cout << "Usage: rawr-gen corpus-file" << std::endl;
-        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
+    std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
        
-        return 0;
+    return 0;
-    }
+  }
    
-        std::ifstream infile(args[1]);
+  std::ifstream infile(args[1]);
-    if (!infile)
+  if (!infile)
-    {
+  {
-        std::cout << "rawr-gen, version 1.0" << std::endl;
+    std::cout << "rawr-gen, version 1.0" << std::endl;
-        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
+    std::cout << "Usage: rawr-gen corpus-file" << std::endl;
-        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
+    std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
-        std::cout << std::endl;
+    std::cout << std::endl;
-        std::cout << "The file you specified does not exist." << std::endl;
+    std::cout << "The file you specified does not exist." << std::endl;
        
-        return 0;
+    return 0;
-    }
+  }
    
-        std::string corpus;
+  std::string corpus;
-        std::string line;
+  std::string line;
-        while (getline(infile, line))
+  while (getline(infile, line))
-        {
+  {
-                corpus += line + "\n ";
+    corpus += line + "\n ";
-        }
+  }
        
-    std::cout << "Preprocessing corpus..." << std::endl;
+  std::cout << "Preprocessing corpus..." << std::endl;
-        kgramstats* stats = new kgramstats(corpus, 4);
+  kgramstats* stats = new kgramstats(corpus, 4);
    
-    std::cout << "Preprocessing freevars..." << std::endl;
+  std::cout << "Preprocessing freevars..." << std::endl;
-    freevars* vars = new freevars();
+  freevars* vars = new freevars();
-    vars->addVar("name", "names.txt");
+  vars->addVar("name", "names.txt");
-    vars->addVar("noun", "nouns.txt");
+  vars->addVar("noun", "nouns.txt");
    
-    std::cout << "Generating..." << std::endl;
+  std::cout << "Generating..." << std::endl;
-        for (;;)
+  for (;;)
-        {
+  {
-                std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45);
+    std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
-                std::string hi;
+    std::string hi;
-                for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
+    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-                {
-                        hi += vars->parse(*it) + " ";
-                }
-    
-    size_t firstperiod = hi.find_first_of(".!?");
-    if (firstperiod != std::string::npos)
    {
-      hi = hi.substr(firstperiod+2);
+      hi += vars->parse(*it) + " ";
    }
    
    hi.resize(140);
-                size_t lastperiod = hi.find_last_of(".!?");
+    size_t lastperiod = hi.find_last_of(".!?,");
-                if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
+    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-                {
+    {
-                        hi = hi.substr(0, lastperiod+1);
+      hi = hi.substr(0, lastperiod+1);
-                }
+    }
-                std::cout << hi << std::endl;
+    std::cout << hi << std::endl;
                
-        getc(stdin);
+    getc(stdin);
-        }
+  }
        
-        return 0;
+  return 0;
 }
diff --git a/kgramstats.cpp b/kgramstats.cpp
index b0ec68a..c88d83c 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -5,237 +5,176 @@
 #include <algorithm>
 #include "malaprop.h"
+query wildcardQuery(querytype_sentence);
 std::string canonize(std::string f);
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
 kgramstats::kgramstats(std::string corpus, int maxK)
 {
-        this->maxK = maxK;
+  this->maxK = maxK;
  
  std::vector<std::string> tokens;
-    size_t start = 0;
+  size_t start = 0;
-        int end = 0;
+  int end = 0;
-        while (end != std::string::npos)
+  while (end != std::string::npos)
-        {
+  {
-           end = corpus.find(" ", start);
+    end = corpus.find(" ", start);
-       std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
+    std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
-       if (token[token.length()-1] == '\n')
+    if (token[token.length()-1] == '\n')
-       {
+    {
-         if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?'))
+      if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
-         {
+      {
-           token.insert(token.length()-1, ".");
+        token.insert(token.length()-1, ".");
-         }
+      }
         
-         token.resize(token.length()-1);
+      token.resize(token.length()-1);
-       }
+    }
       
-       if (token.compare("") && token.compare("."))
+    if (token.compare("") && token.compare("."))
-       {
+    {
-         mstats.addWord(token);
+      mstats.addWord(token);
-           tokens.push_back(token);
+      tokens.push_back(token);
-       }
+    }
-           start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
+    start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
-        }
+  }
        
-        std::map<kgram, std::map<std::string, token_data*>* > tstats;
+  std::map<kgram, std::map<token, token_data> > tstats;
-  bool newSentence = true;
+  std::map<token, std::map<termstats, int> > tendings;
-  bool newClause = false;
+  for (int k=1; k<maxK; k++)
-        for (int k=0; k<maxK; k++)
+  {
-        {
+    for (int i=0; i<(tokens.size() - k); i++)
-                for (int i=0; i<(tokens.size() - k); i++)
+    {
-                {
+      std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
-                        kgram seq(tokens.begin()+i, tokens.begin()+i+k);
+      kgram prefix;
-                        std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
-                        std::string f = tokens[i+k];
-                        
-      
-      
-      std::string canonical = canonize(f);
-                        
-                        if (tstats[seq] == NULL)
-                        {
-                                tstats[seq] = new std::map<std::string, token_data*>();
-                        }
-                        
-                        if ((*tstats[seq])[canonical] == NULL)
-                        {
-                                (*tstats[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data));
-                        }
-                        token_data* td = tstats[seq]->at(canonical);
-                        td->token = new std::string(canonical);
-                        td->all++;
      
-      /*if (newSentence)
+      for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
      {
-        kgram newKgram(1, ".");
+        token word(canonize(*it));
-        if (tstats[newKgram] == NULL)
+        
+        if (it->find_first_of(".?!,") != std::string::npos)
        {
-          tstats[newKgram] = new std::map<std::string, token_data*>();
+          word.terminating = true;
        }
        
-        (*tstats[newKgram])[canonical] = td;
+        prefix.push_back(word);
-        
-        newSentence = false;
      }
      
-      if (newClause)
+      std::string f = tokens[i+k];
+                  std::string canonical = canonize(f);
+      
+      token word(canonical);
+      if (f.find_first_of(".?!,") != std::string::npos)
      {
-        kgram commaKgram(1, ",");
+        word.terminating = true;
-        if (tstats[commaKgram] == NULL)
-        {
-          tstats[commaKgram] = new std::map<std::string, token_data*>();
-        }
        
-        (*tstats[commaKgram])[canonical] = td;
+        char terminator = f[f.find_last_of(".?!,")];
+        int occurrences = std::count(f.begin(), f.end(), terminator);
        
-        newClause = false;
+        tendings[word][termstats(terminator, occurrences)]++;
-      }
-      
-      if ((f.length() > 0) && (f[f.length()-1] == '\n'))
-      {
-        td->period++;
-        newSentence = true;
-        f.resize(f.length()-1);
      }
                        
-      if (f.length() > 0)
+      token_data& td = tstats[prefix][word];
+      td.word = word;
+      td.all++;
+      if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
      {
-                        if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
+        td.uppercase++;
-                        {
+      } else if (isupper(f[0]))
-          if (!newSentence)
+      {
-          {
+        td.titlecase++;
-                                td->period++;
-            newSentence = true;
-          }
-                                
-          f.resize(f.length()-1);
-                        } else if (f[f.length()-1] == ',')
-        {
-          if (!newSentence)
-          {
-            td->comma++;
-            newClause = true;
-          }
-          
-          f.resize(f.length()-1);
-        }
      }
      
-      if (f.length() > 0)
+      if (prefix.front().word.terminating)
      {
-        if (f[0] == '"')
+        prefix.front() = wildcardQuery;
-        {
-          td->startquote++;
-        }
        
-        if (f[0] == '(')
+        token_data& td2 = tstats[prefix][word];
+        td2.word = word;
+        td2.all++;
+        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
        {
-          td->startparen++;
+          td2.uppercase++;
-        }
+        } else if (isupper(f[0]))
-        
-        if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')'))
        {
-          if (f[f.length()-1] == '"')
+          td2.titlecase++;
-          {
-            td->endquote++;
-          } else if (f[f.length()-1] == ')')
-          {
-            td->endparen++;
-          }
-          
-          f.resize(f.length()-1);
-          
-          if (f.length() > 0)
-          {
-                        if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
-                        {
-              if (!newSentence)
-              {
-                                        td->period++;
-                newSentence = true;
-              }
-                        } else if (f[f.length()-1] == ',')
-            {
-              if (!newSentence && !newClause)
-              {
-                td->comma++;
-                newClause = true;
-              }
-            }
-          }
-        }
-      }*/
-                        
-                        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
-                        {
-                                td->uppercase++;
-                        } else if (isupper(f[0]))
-                        {
-                                td->titlecase++;
-                        }
-      
-      /*if (k != 0)
-      {
-        if (newSentence)
-        {
-          i += k;
        }
-        
+      }
-        newSentence = false;
+    }
-        newClause = false;
+  }
-      }*/
-                }
-        }
        
-        stats = new std::map<kgram, std::map<int, token_data*>* >();
+  for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
-        for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
+  {
-        {
+    kgram klist = it->first;
-                kgram klist = it->first;
+    std::map<token, token_data>& probtable = it->second;
-                std::map<std::string, token_data*>* probtable = it->second;
+    std::map<int, token_data>& distribution = stats[klist];
-                std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
+    int max = 0;
-        int max = 0;
                
-                for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
+    for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
-                {
+    {
-                        max += kt->second->all;
+      max += kt->second.all;
                        
-                        (*distribution)[max] = kt->second;
+      distribution[max] = kt->second;
-                }
+    }
-                
+  }
-                (*stats)[klist] = distribution;
+  
-        }
+  for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
+  {
+    token word = it->first;
+    std::map<termstats, int>& probtable = it->second;
+    std::map<int, termstats>& distribution = endings[word];
+    int max = 0;
+    
+    for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
+    {
+      max += kt->second;
+      
+      distribution[max] = kt->first;
+    }
+  }
 }
 void printKgram(kgram k)
 {
-        for (kgram::iterator it = k.begin(); it != k.end(); it++)
+  for (kgram::iterator it = k.begin(); it != k.end(); it++)
-        {
+  {
-                std::cout << *it << " ";
+    query& q = *it;
-        }
+    if (q.type == querytype_sentence)
+    {
+      std::cout << "#.# ";
+    } else if (q.type == querytype_literal)
+    {
+      if (q.word.terminating)
+      {
+        std::cout << q.word.canon << ". ";
+      } else {
+        std::cout << q.word.canon << " ";
+      }
+    }
+  }
 }
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
 std::vector<std::string> kgramstats::randomSentence(int n)
 {
-        std::vector<std::string> result;
+  std::vector<std::string> result;
-  kgram newKgram(1, ".");
+  kgram cur(1, wildcardQuery);
-  kgram commaKgram(1, ",");
-        std::list<std::string> cur;
  int cuts = 0;
        
-        for (int i=0; i<n; i++)
+  for (int i=0; i<n; i++)
-        {
+  {
-                if (cur.size() == maxK)
+    if (cur.size() == maxK)
-                {
+    {
-                        cur.pop_front();
+      cur.pop_front();
-                }
+    }
    
-    if ((cur.size() > 0) && (cur != newKgram))
+    if (cur.size() > 0)
    {
      if (rand() % (maxK - cur.size() + 1) == 0)
      {
@@ -253,20 +192,19 @@ std::vector<std::string> kgramstats::randomSentence(int n)
      
      cuts++;
    }
+    
+    // Gotta circumvent the last line of the input corpus
+    // https://twitter.com/starla4444/status/684222271339237376
+    if (stats.count(cur) == 0)
+    {
+      cur = kgram(1, wildcardQuery);
+    }
-                std::map<int, token_data*> distribution = *(*stats)[cur];
+    std::map<int, token_data>& distribution = stats[cur];
-                int max = distribution.rbegin()->first;
+    int max = distribution.rbegin()->first;
-                int r = rand() % max;
+    int r = rand() % max;
-                token_data* next = distribution.upper_bound(r)->second;
+    token_data& next = distribution.upper_bound(r)->second;
+    std::string nextToken(next.word.canon);
-                std::string nextToken(*(next->token));
-                int casing = rand() % next->all;
-                /*int period = rand() % next->all;
-    int startparen = rand() % next->all;
-    int endparen = rand() % next->all;
-    int startquote = rand() % next->all;
-    int endquote = rand() % next->all;
-    int comma = rand() % next->all;*/
    
    bool mess = (rand() % 100) == 0;
    if (mess)
@@ -274,114 +212,64 @@ std::vector<std::string> kgramstats::randomSentence(int n)
      nextToken = mstats.alternate(nextToken);
    }
    
-                if (casing < next->uppercase)
+    // Determine the casing of the next token. We randomly make the token all
-                {
+    // caps based on the markov chain. Otherwise, we check if the previous
-                        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    // token is the end of a sentence (terminating token or a wildcard query).
-                }
+    int casing = rand() % next.all;
-    
+    if (casing < next.uppercase)
-    if ((cur == newKgram) && (rand() % 15 > 0))
+    {
+      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+    } else if ((((cur.rbegin()->type == querytype_sentence)
+          || ((cur.rbegin()->type == querytype_literal)
+            && (cur.rbegin()->word.terminating)))
+        && (rand() % 2 > 0))
+      || (casing - next.uppercase < next.titlecase))
    {
      nextToken[0] = toupper(nextToken[0]);
    }
-    /*if (startquote < next->startquote)
+    if (next.word.terminating)
-    {
-      nextToken = "\"" + nextToken;
-    } else if (startparen < next->startparen)
    {
-      nextToken = "(" + nextToken;
+      std::map<int, termstats>& ending = endings[next.word];
+      int emax = ending.rbegin()->first;
+      int er = rand() % emax;
+      termstats& nextend = ending.upper_bound(er)->second;
+      
+      nextToken.append(std::string(nextend.occurrences, nextend.terminator));
    }
-        
-                if (period < next->period)
-                {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
-      {
-        nextToken += ")";
-      }
-    
-      int type = rand() % 6;
-    
-      if (type < 3)
-      {
-        nextToken += ".";
-      } else if (type < 5)
-      {
-        nextToken += "!";
-      } else {
-        nextToken += "?";
-      }
-                } else if (comma < next->comma)
-    {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
-      {
-        nextToken += ")";
-      }
-    
-      nextToken += ",";
-    }*/
                
-                /* DEBUG */
+    /* DEBUG */
-                for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
+    printKgram(cur);
-                {
-                        std::cout << *it << " ";
-                }
                
-                std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
+    std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
    
    if (mess)
    {
-      std::cout << " mala " << *(next->token);
+      std::cout << " mala " << next.word.canon;
    }
    
    std::cout << std::endl;
-    
-    /*if ((cur == newKgram) || (cur == commaKgram))
+    cur.push_back(next.word);
-    {
-      cur.pop_front();
-    }
-                
-    if (period < next->period)// && ((rand() % 3) != 0))
-    {
-      cur = newKgram;
-    } else if ((comma < next->comma) && ((rand() % 3) == 0))
-    {
-      cur = commaKgram;
-    } else {*/
-      //if (mess && (rand() % 2 == 0))
-      if (false)
-      {
-        // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
-        cur.clear();
-        cur.push_back(nextToken);
-      } else {
-        cur.push_back(*(next->token));
-      }
-      //}
                
-                result.push_back(nextToken);
+    result.push_back(nextToken);
-        }
+  }
        
-        return result;
+  return result;
 }
 bool removeIf(char c)
 {
-  return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n'));
+  return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
 }
 std::string canonize(std::string f)
 {
-        std::string canonical(f);
+  std::string canonical(f);
-        std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
+  std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
  
  std::string result;
  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
        
-        return canonical;
+  return result;
 }
diff --git a/kgramstats.h b/kgramstats.h
index b01dece..ca61df7 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -7,7 +7,71 @@
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
-typedef std::list<std::string> kgram;
+struct token {  // pairs a canon string with a terminating flag; operator< below lets it serve as a std::map key (see kgramstats::endings)
+  std::string canon;  // presumably the output of canonize() - TODO confirm at the construction sites
+  bool terminating;  // starts false; meaning not visible in this hunk (presumably marks a sentence-ending word - TODO confirm)
+  
+  token(std::string canon) : canon(canon), terminating(false) {}  // non-explicit, so a std::string converts implicitly to a token
+  
+  bool operator<(const token& other) const  // strict ordering: by canon first, then non-terminating before terminating
+  {
+    if (canon == other.canon)
+    {
+      return !terminating && other.terminating;  // equal canon: only a non-terminating token sorts before a terminating one
+    } else {
+      return canon < other.canon;
+    }
+  }
+};
+enum querytype {  // discriminator stored in query::type; enumerator order also drives query::operator<
+  querytype_literal,  // assigned by the query(token) constructor
+  querytype_sentence  // only reachable through query(querytype); meaning not visible here (presumably a sentence-boundary marker - TODO confirm)
+};
+struct query {  // element type of kgram: a querytype tag plus a token
+  querytype type;
+  token word;
+  
+  query(token word) : word(word), type(querytype_literal) {}  // literal query wrapping word; NOTE(review): the initializer list is written word-then-type but members are initialized in declaration order (type, then word)
+  
+  query(querytype type) : word(""), type(type) {}  // query of the given type whose word is a token built from an empty string
+  
+  bool operator<(const query& other) const  // strict ordering: by type first, then by word when the types match
+  {
+    if (type == other.type)
+    {
+      return word < other.word;
+    } else {
+      return type < other.type;
+    }
+  }
+};
+typedef std::list<query> kgram;  // a kgram is a std::list of query elements (the removed typedef used std::list of std::string)
+struct termstats {  // a terminator character paired with an occurrence count; mapped value type of kgramstats::endings
+  char terminator;
+  int occurrences;
+  
+  termstats() : terminator('.'), occurrences(1) {}  // defaults: period terminator, one occurrence
+  
+  termstats(char terminator, int occurrences)  // stores both arguments unchanged
+  {
+    this->terminator = terminator;
+    this->occurrences = occurrences;
+  }
+  
+  bool operator<(const termstats& other) const  // strict ordering: by terminator character first, then by occurrences
+  {
+    if (terminator == other.terminator)
+    {
+      return occurrences < other.occurrences;
+    } else {
+      return terminator < other.terminator;
+    }
+  }
+};
 class kgramstats
 {
@@ -16,22 +80,20 @@ public:
        std::vector<std::string> randomSentence(int n);
        
 private:
-        typedef struct
+        struct token_data
        {
                int all;
                int titlecase;
                int uppercase;
-                int period;
+    token word;
-    int startquote;
+    
-    int endquote;
+    token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
-    int startparen;
+        };
-    int endparen;
+  
-    int comma;
-                std::string* token;
-        } token_data;
        int maxK;
-        std::map<kgram, std::map<int, token_data*>* >* stats;
+        std::map<kgram, std::map<int, token_data> > stats;
  malaprop mstats;
+  std::map<token, std::map<int, termstats> > endings;
 };
 void printKgram(kgram k);
diff --git a/malaprop.cpp b/malaprop.cpp
index c308d34..ff0cb5d 100644
--- a/malaprop.cpp
+++ b/malaprop.cpp

@@ -119,7 +119,6 @@ std::string malaprop::alternate(std::string word)
  std::set<std::string>& opts = dict[ex];
  if (opts.size() == 0)
  {
-    // Not sure why this can even happen but it is happenining????
    return word;
  }
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-04 23:29:12 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-04 23:29:12 -0500
commit	53102431c2dc7266a322223f84e286a9aa7c0729 (patch)
tree	0c9347a6c2b7b2c7d55a5f5fb681e474867046fd
parent	a28dc579f3a0cd53850d5eb10565c27a92d27c55 (diff)
parent	9e89002477d1358de9be9cabdc1edba26bd32836 (diff)
download	rawr-ebooks-53102431c2dc7266a322223f84e286a9aa7c0729.tar.gz rawr-ebooks-53102431c2dc7266a322223f84e286a9aa7c0729.tar.bz2 rawr-ebooks-53102431c2dc7266a322223f84e286a9aa7c0729.zip