Added malapropisms

author: Kelly Rauchenberger <fefferburbia@gmail.com> 2015-11-22 18:49:58 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2015-11-22 18:49:58 -0500
commit: 01746a0e03267b6c082b58436c1370567f7cb7c5 (patch)
tree: e3cfeadb97f93858f326f57958bff4675cd8f9ed
parent: 294fe00911c6ee0dd9853df7612dcdbd63425c05 (diff)
download: rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.tar.gz
rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.tar.bz2
rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.zip
9 files changed, 293 insertions, 117 deletions
diff --git a/Makefile.am b/Makefile.am
index 299dc10..5f6199b 100644
--- a/Makefile.am
+++ b/Makefile.am

@@ -2,7 +2,7 @@ AUTOMAKE_OPTIONS = subdir-objects
 ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
 bin_PROGRAMS = rawr-ebooks rawr-gen
-rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp
+rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp
-rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp
+rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp
 rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS)
 rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS)
 \ No newline at end of file
diff --git a/ebooks.cpp b/ebooks.cpp
index 8e46ee9..27065d9 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp

@@ -12,8 +12,6 @@
 #include <yaml-cpp/yaml.h>
 #include "freevars.h"
-using namespace::std;
 int main(int argc, char** args)
 {
        srand(time(NULL));
diff --git a/freevars.cpp b/freevars.cpp
index 6472fef..8c3eda4 100644
--- a/freevars.cpp
+++ b/freevars.cpp

@@ -4,17 +4,17 @@
 freevars::freevars()
 {
-    vars = new map<string, vector<string>* >();
+    vars = new std::map<std::string, std::vector<std::string>* >();
 }
-void freevars::addVar(string name, string filename)
+void freevars::addVar(std::string name, std::string filename)
 {
-    vector<string>* eltlist = new vector<string>();
+    std::vector<std::string>* eltlist = new std::vector<std::string>();
    
-    ifstream infile(filename.c_str());
+    std::ifstream infile(filename.c_str());
    if (infile)
    {
-        string line;
+        std::string line;
        
        while (getline(infile, line))
        {
@@ -27,18 +27,18 @@ void freevars::addVar(string name, string filename)
    (*vars)[name] = eltlist;
 }
-string freevars::parse(string in)
+std::string freevars::parse(std::string in)
 {
-    string res(in);
+    std::string res(in);
    
-    for (map<string, vector<string>* >::iterator it = vars->begin(); it != vars->end(); it++)
+    for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++)
    {
-        string tofind = "$" + it->first + "$";
+        std::string tofind = "$" + it->first + "$";
        size_t fpos = res.find(tofind);
-        if (fpos != string::npos)
+        if (fpos != std::string::npos)
        {
            int r = rand() % it->second->size();
-            res.replace(fpos, tofind.length(), (*it->second)[r], 0, string::npos);
+            res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos);
        }
    }
    
diff --git a/freevars.h b/freevars.h
index 923f211..c92b9f5 100644
--- a/freevars.h
+++ b/freevars.h

@@ -2,8 +2,6 @@
 #include <string>
 #include <vector>
-using namespace::std;
 #ifndef FREEVARS_H
 #define FREEVARS_H
@@ -11,11 +9,11 @@ class freevars
 {
 public:
    freevars();
-    void addVar(string name, string filename);
+    void addVar(std::string name, std::string filename);
-    string parse(string in);
+    std::string parse(std::string in);
    
 private:
-    map<string, vector<string>* >* vars;
+    std::map<std::string, std::vector<std::string>* >* vars;
 };
 #endif
 \ No newline at end of file
diff --git a/gen.cpp b/gen.cpp
index 31ba4dc..3284ffa 100644
--- a/gen.cpp
+++ b/gen.cpp

@@ -9,65 +9,63 @@
 #include <iostream>
 #include "freevars.h"
-using namespace::std;
 int main(int argc, char** args)
 {
        srand(time(NULL));
    
    if (argc == 1)
    {
-        cout << "rawr-gen, version 1.0" << endl;
+        std::cout << "rawr-gen, version 1.0" << std::endl;
-        cout << "Usage: rawr-gen corpus-file" << endl;
+        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
-        cout << "  where 'corpus-file' is the path to your input" << endl;
+        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
        
        return 0;
    }
    
-        ifstream infile(args[1]);
+        std::ifstream infile(args[1]);
    if (!infile)
    {
-        cout << "rawr-gen, version 1.0" << endl;
+        std::cout << "rawr-gen, version 1.0" << std::endl;
-        cout << "Usage: rawr-gen corpus-file" << endl;
+        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
-        cout << "  where 'corpus-file' is the path to your input" << endl;
+        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
-        cout << endl;
+        std::cout << std::endl;
-        cout << "The file you specified does not exist." << endl;
+        std::cout << "The file you specified does not exist." << std::endl;
        
        return 0;
    }
    
-        string corpus;
+        std::string corpus;
-        string line;
+        std::string line;
        while (getline(infile, line))
        {
                corpus += " " + line;
        }
        
-    cout << "Preprocessing corpus..." << endl;
+    std::cout << "Preprocessing corpus..." << std::endl;
        kgramstats* stats = new kgramstats(corpus, 3);
    
-    cout << "Preprocessing freevars..." << endl;
+    std::cout << "Preprocessing freevars..." << std::endl;
    freevars* vars = new freevars();
    vars->addVar("name", "names.txt");
    vars->addVar("noun", "nouns.txt");
    
-    cout << "Generating..." << endl;
+    std::cout << "Generating..." << std::endl;
        for (;;)
        {
-                vector<string> doc = stats->randomSentence(rand() % 35 + 15);
+                std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
-                string hi;
+                std::string hi;
-                for (vector<string>::iterator it = doc.begin(); it != doc.end(); ++it)
+                for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
                {
                        hi += vars->parse(*it) + " ";
                }
                size_t lastperiod = hi.find_last_of(".");
-                if ((lastperiod != string::npos) && (rand() % 3 > 0))
+                if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
                {
                        hi = hi.substr(0, lastperiod+1);
                }
-                cout << hi << endl;
+                std::cout << hi << std::endl;
                
        getc(stdin);
        }
diff --git a/kgramstats.cpp b/kgramstats.cpp
index b4e68eb..17598de 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -3,31 +3,35 @@
 #include <iostream>
 #include <cstdlib>
 #include <algorithm>
+#include "malaprop.h"
+std::string canonize(std::string f);
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
-kgramstats::kgramstats(string corpus, int maxK)
+kgramstats::kgramstats(std::string corpus, int maxK)
 {
        this->maxK = maxK;
-        
+  
-        vector<string> tokens;
+  std::vector<std::string> tokens;
-    int start = 0;
+    size_t start = 0;
        int end = 0;
-        while (end != string::npos)
+        while (end != std::string::npos)
        {
           end = corpus.find(" ", start);
-       string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start);
+       std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
       if (token.compare(""))
       {
+         mstats.addWord(token);
           tokens.push_back(token);
       }
-           start = ((end > (string::npos - 1) ) ? string::npos : end + 1);
+           start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
        }
        
-        map<kgram, map<string, token_data*>* > tstats;
+        std::map<kgram, std::map<std::string, token_data*>* > tstats;
  bool newSentence = true;
  bool newClause = false;
        for (int k=0; k<=maxK; k++)
@@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK)
                for (int i=0; i<(tokens.size() - k); i++)
                {
                        kgram seq(tokens.begin()+i, tokens.begin()+i+k);
-                        transform(seq.begin(), seq.end(), seq.begin(), canonize);
+                        std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
-                        string f = tokens[i+k];
+                        std::string f = tokens[i+k];
-                        string canonical = canonize(f);
+                        std::string canonical = canonize(f);
                        
                        if (tstats[seq] == NULL)
                        {
-                                tstats[seq] = new map<string, token_data*>();
+                                tstats[seq] = new std::map<std::string, token_data*>();
                        }
                        
                        if ((*tstats[seq])[canonical] == NULL)
@@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK)
                        }
                        token_data* td = tstats[seq]->at(canonical);
-                        td->token = new string(canonical);
+                        td->token = new std::string(canonical);
                        td->all++;
      
      if (newSentence)
@@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK)
        kgram newKgram(1, ".");
        if (tstats[newKgram] == NULL)
        {
-          tstats[newKgram] = new map<string, token_data*>();
+          tstats[newKgram] = new std::map<std::string, token_data*>();
        }
        
        (*tstats[newKgram])[canonical] = td;
@@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK)
        kgram commaKgram(1, ",");
        if (tstats[commaKgram] == NULL)
        {
-          tstats[commaKgram] = new map<string, token_data*>();
+          tstats[commaKgram] = new std::map<std::string, token_data*>();
        }
        
        (*tstats[commaKgram])[canonical] = td;
@@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK)
                }
        }
        
-        stats = new map<kgram, map<int, token_data*>* >();
+        stats = new std::map<kgram, std::map<int, token_data*>* >();
-        for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
+        for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
        {
                kgram klist = it->first;
-                map<string, token_data*>* probtable = it->second;
+                std::map<std::string, token_data*>* probtable = it->second;
-                map<int, token_data*>* distribution = new map<int, token_data*>();
+                std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
        int max = 0;
                
-                for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
+                for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
                {
                        max += kt->second->all;
                        
@@ -187,17 +191,17 @@ void printKgram(kgram k)
 {
        for (kgram::iterator it = k.begin(); it != k.end(); it++)
        {
-                cout << *it << " ";
+                std::cout << *it << " ";
        }
 }
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-vector<string> kgramstats::randomSentence(int n)
+std::vector<std::string> kgramstats::randomSentence(int n)
 {
-        vector<string> result;
+        std::vector<std::string> result;
  kgram newKgram(1, ".");
  kgram commaKgram(1, ",");
-        list<string> cur = newKgram;
+        std::list<std::string> cur = newKgram;
  int cuts = 0;
        
        for (int i=0; i<n; i++)
@@ -221,12 +225,12 @@ vector<string> kgramstats::randomSentence(int n)
      cuts++;
    }
-                map<int, token_data*> distribution = *(*stats)[cur];
+                std::map<int, token_data*> distribution = *(*stats)[cur];
                int max = distribution.rbegin()->first;
                int r = rand() % max;
                token_data* next = distribution.upper_bound(r)->second;
-                string nextToken(*(next->token));
+                std::string nextToken(*(next->token));
                int casing = rand() % next->all;
                int period = rand() % next->all;
    int startparen = rand() % next->all;
@@ -236,7 +240,7 @@ vector<string> kgramstats::randomSentence(int n)
    int comma = rand() % next->all;
                if (casing < next->uppercase)
                {
-                        transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+                        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
                } else if ((casing - next->uppercase) < next->titlecase)
                {
                        nextToken[0] = toupper(nextToken[0]);
@@ -246,49 +250,55 @@ vector<string> kgramstats::randomSentence(int n)
    {
      nextToken[0] = toupper(nextToken[0]);
    }
-    /*
+    
-    if (startquote < next->startquote)
+    bool mess = (rand() % 100) == 0;
-    {
+    if (mess)
-      nextToken = "\"" + nextToken;
-    } else if (startparen < next->startparen)
    {
-      nextToken = "(" + nextToken;
+      nextToken = mstats.alternate(nextToken);
-    }
-                
+      if (startquote < next->startquote)
-                if (period < next->period)
-                {
-      if (endquote < next->endquote)
      {
-        nextToken += "\"";
+        nextToken = "\"" + nextToken;
-      } else if (endparen < next->endparen)
+      } else if (startparen < next->startparen)
      {
-        nextToken += ")";
+        nextToken = "(" + nextToken;
      }
+                
+                if (period < next->period)
+                {
+        if (endquote < next->endquote)
+        {
+          nextToken += "\"";
+        } else if (endparen < next->endparen)
+        {
+          nextToken += ")";
+        }
      
-      int type = rand() % 6;
+        int type = rand() % 6;
      
-      if (type < 3)
+        if (type < 3)
-      {
+        {
-        nextToken += ".";
+          nextToken += ".";
-      } else if (type < 5)
+        } else if (type < 5)
-      {
+        {
-        nextToken += "!";
+          nextToken += "!";
-      } else {
+        } else {
-        nextToken += "?";
+          nextToken += "?";
-      }
+        }
-                } else if (comma < next->comma)
+                } else if (comma < next->comma)
-    {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
      {
-        nextToken += ")";
+        if (endquote < next->endquote)
-      }
+        {
+          nextToken += "\"";
+        } else if (endparen < next->endparen)
+        {
+          nextToken += ")";
+        }
      
-      nextToken += ",";
+        nextToken += ",";
+      }
    }
-*/
+    
                if (cur.size() == maxK)
                {
                        cur.pop_front();
@@ -297,10 +307,17 @@ vector<string> kgramstats::randomSentence(int n)
                /* DEBUG */
                for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
                {
-                        cout << *it << " ";
+                        std::cout << *it << " ";
                }
                
-                cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;
+                std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
+    
+    if (mess)
+    {
+      std::cout << " mala " << *(next->token);
+    }
+    
+    std::cout << std::endl;
    
    if ((cur == newKgram) || (cur == commaKgram))
    {
@@ -314,7 +331,15 @@ vector<string> kgramstats::randomSentence(int n)
    {
      cur = commaKgram;
    } else {
-      cur.push_back(*(next->token));
+      //if (mess && (rand() % 2 == 0))
+      if (false)
+      {
+        // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
+        cur.clear();
+        cur.push_back(nextToken);
+      } else {
+        cur.push_back(*(next->token));
+      }
    }
                
                result.push_back(nextToken);
@@ -330,11 +355,11 @@ bool removeIf(char c)
 std::string canonize(std::string f)
 {
-        string canonical(f);
+        std::string canonical(f);
-        transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
+        std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
  
-  string result;
+  std::string result;
-  remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
+  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
        
        return canonical;
 }
diff --git a/kgramstats.h b/kgramstats.h
index 059eb05..b01dece 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -2,19 +2,18 @@
 #include <map>
 #include <list>
 #include <vector>
+#include "malaprop.h"
-using namespace::std;
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
-typedef list<string> kgram;
+typedef std::list<std::string> kgram;
 class kgramstats
 {
 public:
-        kgramstats(string corpus, int maxK);
+        kgramstats(std::string corpus, int maxK);
-        vector<string> randomSentence(int n);
+        std::vector<std::string> randomSentence(int n);
        
 private:
        typedef struct
@@ -28,13 +27,13 @@ private:
    int startparen;
    int endparen;
    int comma;
-                string* token;
+                std::string* token;
        } token_data;
        int maxK;
-        map<kgram, map<int, token_data*>* >* stats;
+        std::map<kgram, std::map<int, token_data*>* >* stats;
+  malaprop mstats;
 };
 void printKgram(kgram k);
-std::string canonize(std::string f);
 #endif
 \ No newline at end of file
diff --git a/malaprop.cpp b/malaprop.cpp
new file mode 100644
index 0000000..bfea579
--- /dev/null
+++ b/malaprop.cpp

@@ -0,0 +1,127 @@
+#include "malaprop.h"
+#include <algorithm>
+#include <cctype>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <iterator>
+// Predicate for std::remove_copy_if: returns true for every character that is
+// NOT alphabetic, i.e. for the characters that should be dropped.
+bool removeIfM(char c)
+{
+  // The <cctype> classifiers are only defined for values representable as
+  // unsigned char (or EOF). A plain char holding a byte >= 0x80 may be
+  // negative, so convert before classifying.
+  return !std::isalpha(static_cast<unsigned char>(c));
+}
+// Maps a lowercase consonant to the digit character ('1'..'6') of its sound
+// group. Any character that belongs to no group is returned unchanged.
+char soundID(char l)
+{
+  // groups[n] holds the letters that share the digit '1' + n.
+  static const char* const groups[] = {
+    "bfpv",
+    "cgjkqsxz",
+    "dt",
+    "l",
+    "mn",
+    "r"
+  };
+  static const int groupCount = sizeof(groups) / sizeof(groups[0]);
+
+  for (int g = 0; g < groupCount; ++g)
+  {
+    for (const char* member = groups[g]; *member != '\0'; ++member)
+    {
+      if (*member == l)
+      {
+        return static_cast<char>('1' + g);
+      }
+    }
+  }
+
+  return l;
+}
+// Returns a copy of f that is lowercased and stripped of every character
+// rejected by removeIfM (everything that is not a letter).
+std::string canonizetwo(std::string f)
+{
+  std::string result;
+  result.reserve(f.size());
+
+  for (std::string::size_type i = 0; i < f.size(); ++i)
+  {
+    // tolower is only defined for values representable as unsigned char, so
+    // convert first; a plain char holding a byte >= 0x80 may be negative.
+    const char lowered = static_cast<char>(std::tolower(static_cast<unsigned char>(f[i])));
+
+    if (!removeIfM(lowered))
+    {
+      result += lowered;
+    }
+  }
+
+  return result;
+}
+// Computes the Soundex-style key of f.
+//
+// The key's prefix is the first letter of the canonized word ('\0' when the
+// word contains no letters). The code is built from the sound-group digits of
+// the remaining letters, padded with '0' or truncated to exactly three digits
+// and stored as an integer.
+malaprop::soundex malaprop::soundify(std::string f)
+{
+  const std::string letters(canonizetwo(f));
+
+  soundex ex;
+  ex.prefix = letters.empty() ? '\0' : letters[0];
+
+  std::string output;
+
+  for (std::string::size_type i = 1; i < letters.length(); i++)
+  {
+    const char c = soundID(letters[i]);
+    const char prev = letters[i-1];
+
+    // soundID returns a digit only for coded consonants; vowels and h/w/y
+    // come back unchanged and are never emitted.
+    const bool coded = std::isdigit(static_cast<unsigned char>(c)) != 0;
+
+    // Adjacent letters of the same sound group collapse into one digit.
+    const bool repeatsPrevious = (c == soundID(prev));
+
+    // Letters of the same sound group separated only by an h or w also
+    // collapse into one digit.
+    const bool repeatsAcrossHW =
+      (i >= 2)
+      && (prev == 'h' || prev == 'w')
+      && (c == soundID(letters[i-2]));
+
+    if (coded && !repeatsPrevious && !repeatsAcrossHW)
+    {
+      output += c;
+    }
+  }
+
+  // Exactly three digits: pad short codes with '0', drop anything beyond.
+  output.resize(3, '0');
+  ex.code = atoi(output.c_str());
+
+  return ex;
+}
+// Files the canonized form of word in the dictionary bucket that belongs to
+// the word's sound key, creating the bucket if it does not exist yet.
+void malaprop::addWord(std::string word)
+{
+  const soundex key = soundify(word);
+  std::set<std::string>& bucket = dict[key];
+
+  bucket.insert(canonizetwo(word));
+}
+// Debug helper: prints every bucket of the dictionary to stdout as
+// "<prefix><3-digit code> (<word count>): word, word, ..." and then
+// terminates the whole process with exit(0). It never returns to its caller.
+void malaprop::stats()
+{
+  for (std::map<soundex, std::set<std::string> >::iterator it = dict.begin(); it != dict.end(); ++it)
+  {
+    // size() returns an unsigned size type, which does not match %d; convert
+    // explicitly and print with the matching conversion.
+    printf("%c%03d (%lu): ", it->first.prefix, it->first.code, static_cast<unsigned long>(it->second.size()));
+
+    for (std::set<std::string>::iterator jt = it->second.begin(); jt != it->second.end(); ++jt)
+    {
+      std::cout << *jt << ", ";
+    }
+
+    std::cout << std::endl;
+  }
+
+  exit(0);
+}
+// Returns a randomly chosen dictionary word that shares word's sound key.
+// The choice may be the canonized form of word itself. When no word with
+// that key has been added, word is returned unchanged.
+std::string malaprop::alternate(std::string word)
+{
+  // Look the bucket up with find() so that a miss does not insert an empty
+  // bucket into the dictionary.
+  const std::map<soundex, std::set<std::string> >::iterator bucket = dict.find(soundify(word));
+  if (bucket == dict.end() || bucket->second.empty())
+  {
+    // Nothing to choose from; this also avoids a modulo by zero below.
+    return word;
+  }
+
+  const std::set<std::string>& opts = bucket->second;
+
+  std::set<std::string>::const_iterator pick = opts.begin();
+  for (std::set<std::string>::size_type skip = rand() % opts.size(); skip > 0; --skip)
+  {
+    ++pick;
+  }
+
+  return *pick;
+}
diff --git a/malaprop.h b/malaprop.h
new file mode 100644
index 0000000..91a18eb
--- /dev/null
+++ b/malaprop.h

@@ -0,0 +1,31 @@
+#ifndef MALAPROP_H_8F382336
+#define MALAPROP_H_8F382336
+#include <string>
+#include <map>
+#include <set>
+// Dictionary of words grouped by a Soundex-style sound key, used to swap a
+// word for a similar-sounding one.
+class malaprop
+{
+public:
+  // Adds the canonized (lowercase, letters only) form of word to the bucket
+  // of its sound key.
+  void addWord(std::string word);
+
+  // Debug helper: dumps every bucket to stdout, then exits the process.
+  void stats();
+
+  // Returns a randomly chosen stored word that shares word's sound key.
+  std::string alternate(std::string word);
+
+private:
+  // Sound key: first letter of the word plus a three-digit sound code.
+  struct soundex {
+    char prefix;
+    int code;
+
+    // Lexicographic order on (prefix, code). std::map requires a strict weak
+    // ordering from its key comparison; comparing the two fields
+    // independently would let a < b and b < a both hold.
+    bool operator<(const soundex& other) const
+    {
+      if (prefix != other.prefix)
+      {
+        return prefix < other.prefix;
+      }
+
+      return code < other.code;
+    }
+  };
+
+  // Sound key -> set of canonized words with that key.
+  std::map<soundex, std::set<std::string> > dict;
+
+  // Computes the sound key of l.
+  soundex soundify(std::string l);
+};
+#endif /* end of include guard: MALAPROP_H_8F382336 */
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2015-11-22 18:49:58 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2015-11-22 18:49:58 -0500
commit	01746a0e03267b6c082b58436c1370567f7cb7c5 (patch)
tree	e3cfeadb97f93858f326f57958bff4675cd8f9ed
parent	294fe00911c6ee0dd9853df7612dcdbd63425c05 (diff)
download	rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.tar.gz rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.tar.bz2 rawr-ebooks-01746a0e03267b6c082b58436c1370567f7cb7c5.zip