From 01746a0e03267b6c082b58436c1370567f7cb7c5 Mon Sep 17 00:00:00 2001
From: Kelly Rauchenberger <fefferburbia@gmail.com>
Date: Sun, 22 Nov 2015 18:49:58 -0500
Subject: Added malapropisms

---
 Makefile.am    |   4 +-
 ebooks.cpp     |   2 -
 freevars.cpp   |  22 ++++----
 freevars.h     |   8 ++-
 gen.cpp        |  40 +++++++-------
 kgramstats.cpp | 161 +++++++++++++++++++++++++++++++++------------------------
 kgramstats.h   |  15 +++---
 malaprop.cpp   | 127 +++++++++++++++++++++++++++++++++++++++++++++
 malaprop.h     |  31 +++++++++++
 9 files changed, 293 insertions(+), 117 deletions(-)
 create mode 100644 malaprop.cpp
 create mode 100644 malaprop.h
diff --git a/Makefile.am b/Makefile.am
index 299dc10..5f6199b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -2,7 +2,7 @@ AUTOMAKE_OPTIONS = subdir-objects
 ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
 
 bin_PROGRAMS = rawr-ebooks rawr-gen
-rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp
-rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp
+rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp
+rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp
 rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS)
 rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS)
\ No newline at end of file
diff --git a/ebooks.cpp b/ebooks.cpp
index 8e46ee9..27065d9 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp
@@ -12,8 +12,6 @@
 #include <yaml-cpp/yaml.h>
 #include "freevars.h"
 
-using namespace::std;
-
 int main(int argc, char** args)
 {
 	srand(time(NULL));
diff --git a/freevars.cpp b/freevars.cpp
index 6472fef..8c3eda4 100644
--- a/freevars.cpp
+++ b/freevars.cpp
@@ -4,17 +4,17 @@
 
 freevars::freevars()
 {
-    vars = new map<string, vector<string>* >();
+    vars = new std::map<std::string, std::vector<std::string>* >();
 }
 
-void freevars::addVar(string name, string filename)
+void freevars::addVar(std::string name, std::string filename)
 {
-    vector<string>* eltlist = new vector<string>();
+    std::vector<std::string>* eltlist = new std::vector<std::string>();
     
-    ifstream infile(filename.c_str());
+    std::ifstream infile(filename.c_str());
     if (infile)
     {
-        string line;
+        std::string line;
         
         while (getline(infile, line))
         {
@@ -27,18 +27,18 @@ void freevars::addVar(string name, string filename)
     (*vars)[name] = eltlist;
 }
 
-string freevars::parse(string in)
+std::string freevars::parse(std::string in)
 {
-    string res(in);
+    std::string res(in);
     
-    for (map<string, vector<string>* >::iterator it = vars->begin(); it != vars->end(); it++)
+    for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++)
     {
-        string tofind = "$" + it->first + "$";
+        std::string tofind = "$" + it->first + "$";
         size_t fpos = res.find(tofind);
-        if (fpos != string::npos)
+        if (fpos != std::string::npos)
         {
             int r = rand() % it->second->size();
-            res.replace(fpos, tofind.length(), (*it->second)[r], 0, string::npos);
+            res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos);
         }
     }
     
diff --git a/freevars.h b/freevars.h
index 923f211..c92b9f5 100644
--- a/freevars.h
+++ b/freevars.h
@@ -2,8 +2,6 @@
 #include <string>
 #include <vector>
 
-using namespace::std;
-
 #ifndef FREEVARS_H
 #define FREEVARS_H
 
@@ -11,11 +9,11 @@ class freevars
 {
 public:
     freevars();
-    void addVar(string name, string filename);
-    string parse(string in);
+    void addVar(std::string name, std::string filename);
+    std::string parse(std::string in);
     
 private:
-    map<string, vector<string>* >* vars;
+    std::map<std::string, std::vector<std::string>* >* vars;
 };
 
 #endif
\ No newline at end of file
diff --git a/gen.cpp b/gen.cpp
index 31ba4dc..3284ffa 100644
--- a/gen.cpp
+++ b/gen.cpp
@@ -9,65 +9,63 @@
 #include <iostream>
 #include "freevars.h"
 
-using namespace::std;
-
 int main(int argc, char** args)
 {
 	srand(time(NULL));
     
     if (argc == 1)
     {
-        cout << "rawr-gen, version 1.0" << endl;
-        cout << "Usage: rawr-gen corpus-file" << endl;
-        cout << "  where 'corpus-file' is the path to your input" << endl;
+        std::cout << "rawr-gen, version 1.0" << std::endl;
+        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
+        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
         
         return 0;
     }
     
-	ifstream infile(args[1]);
+	std::ifstream infile(args[1]);
     if (!infile)
     {
-        cout << "rawr-gen, version 1.0" << endl;
-        cout << "Usage: rawr-gen corpus-file" << endl;
-        cout << "  where 'corpus-file' is the path to your input" << endl;
-        cout << endl;
-        cout << "The file you specified does not exist." << endl;
+        std::cout << "rawr-gen, version 1.0" << std::endl;
+        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
+        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
+        std::cout << std::endl;
+        std::cout << "The file you specified does not exist." << std::endl;
         
         return 0;
     }
     
-	string corpus;
-	string line;
+	std::string corpus;
+	std::string line;
 	while (getline(infile, line))
 	{
 		corpus += " " + line;
 	}
 	
-    cout << "Preprocessing corpus..." << endl;
+    std::cout << "Preprocessing corpus..." << std::endl;
 	kgramstats* stats = new kgramstats(corpus, 3);
     
-    cout << "Preprocessing freevars..." << endl;
+    std::cout << "Preprocessing freevars..." << std::endl;
     freevars* vars = new freevars();
     vars->addVar("name", "names.txt");
     vars->addVar("noun", "nouns.txt");
     
-    cout << "Generating..." << endl;
+    std::cout << "Generating..." << std::endl;
 	for (;;)
 	{
-		vector<string> doc = stats->randomSentence(rand() % 35 + 15);
-		string hi;
-		for (vector<string>::iterator it = doc.begin(); it != doc.end(); ++it)
+		std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
+		std::string hi;
+		for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
 		{
 			hi += vars->parse(*it) + " ";
 		}
 
 		size_t lastperiod = hi.find_last_of(".");
-		if ((lastperiod != string::npos) && (rand() % 3 > 0))
+		if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
 		{
 			hi = hi.substr(0, lastperiod+1);
 		}
 
-		cout << hi << endl;
+		std::cout << hi << std::endl;
 		
         getc(stdin);
 	}
diff --git a/kgramstats.cpp b/kgramstats.cpp
index b4e68eb..17598de 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -3,31 +3,35 @@
 #include <iostream>
 #include <cstdlib>
 #include <algorithm>
+#include "malaprop.h"
+
+std::string canonize(std::string f);
 
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
-kgramstats::kgramstats(string corpus, int maxK)
+kgramstats::kgramstats(std::string corpus, int maxK)
 {
 	this->maxK = maxK;
-	
-	vector<string> tokens;
-    int start = 0;
+  
+  std::vector<std::string> tokens;
+    size_t start = 0;
 	int end = 0;
 
-	while (end != string::npos)
+	while (end != std::string::npos)
 	{
 	   end = corpus.find(" ", start);
 
-       string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start);
+       std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
        if (token.compare(""))
        {
+         mstats.addWord(token);
            tokens.push_back(token);
        }
 
-	   start = ((end > (string::npos - 1) ) ? string::npos : end + 1);
+	   start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
 	}
 	
-	map<kgram, map<string, token_data*>* > tstats;
+	std::map<kgram, std::map<std::string, token_data*>* > tstats;
   bool newSentence = true;
   bool newClause = false;
 	for (int k=0; k<=maxK; k++)
@@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK)
 		for (int i=0; i<(tokens.size() - k); i++)
 		{
 			kgram seq(tokens.begin()+i, tokens.begin()+i+k);
-			transform(seq.begin(), seq.end(), seq.begin(), canonize);
-			string f = tokens[i+k];
-			string canonical = canonize(f);
+			std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
+			std::string f = tokens[i+k];
+			std::string canonical = canonize(f);
 			
 			if (tstats[seq] == NULL)
 			{
-				tstats[seq] = new map<string, token_data*>();
+				tstats[seq] = new std::map<std::string, token_data*>();
 			}
 			
 			if ((*tstats[seq])[canonical] == NULL)
@@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK)
 			}
 
 			token_data* td = tstats[seq]->at(canonical);
-			td->token = new string(canonical);
+			td->token = new std::string(canonical);
 			td->all++;
       
       if (newSentence)
@@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK)
         kgram newKgram(1, ".");
         if (tstats[newKgram] == NULL)
         {
-          tstats[newKgram] = new map<string, token_data*>();
+          tstats[newKgram] = new std::map<std::string, token_data*>();
         }
         
         (*tstats[newKgram])[canonical] = td;
@@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK)
         kgram commaKgram(1, ",");
         if (tstats[commaKgram] == NULL)
         {
-          tstats[commaKgram] = new map<string, token_data*>();
+          tstats[commaKgram] = new std::map<std::string, token_data*>();
         }
         
         (*tstats[commaKgram])[canonical] = td;
@@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK)
 		}
 	}
 	
-	stats = new map<kgram, map<int, token_data*>* >();
-	for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
+	stats = new std::map<kgram, std::map<int, token_data*>* >();
+	for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
 	{
 		kgram klist = it->first;
-		map<string, token_data*>* probtable = it->second;
-		map<int, token_data*>* distribution = new map<int, token_data*>();
+		std::map<std::string, token_data*>* probtable = it->second;
+		std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
         int max = 0;
 		
-		for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
+		for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
 		{
 			max += kt->second->all;
 			
@@ -187,17 +191,17 @@ void printKgram(kgram k)
 {
 	for (kgram::iterator it = k.begin(); it != k.end(); it++)
 	{
-		cout << *it << " ";
+		std::cout << *it << " ";
 	}
 }
 
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-vector<string> kgramstats::randomSentence(int n)
+std::vector<std::string> kgramstats::randomSentence(int n)
 {
-	vector<string> result;
+	std::vector<std::string> result;
   kgram newKgram(1, ".");
   kgram commaKgram(1, ",");
-	list<string> cur = newKgram;
+	std::list<std::string> cur = newKgram;
   int cuts = 0;
 	
 	for (int i=0; i<n; i++)
@@ -221,12 +225,12 @@ vector<string> kgramstats::randomSentence(int n)
       cuts++;
     }
 
-		map<int, token_data*> distribution = *(*stats)[cur];
+		std::map<int, token_data*> distribution = *(*stats)[cur];
 		int max = distribution.rbegin()->first;
 		int r = rand() % max;
 		token_data* next = distribution.upper_bound(r)->second;
 
-		string nextToken(*(next->token));
+		std::string nextToken(*(next->token));
 		int casing = rand() % next->all;
 		int period = rand() % next->all;
     int startparen = rand() % next->all;
@@ -236,7 +240,7 @@ vector<string> kgramstats::randomSentence(int n)
     int comma = rand() % next->all;
 		if (casing < next->uppercase)
 		{
-			transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+			std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
 		} else if ((casing - next->uppercase) < next->titlecase)
 		{
 			nextToken[0] = toupper(nextToken[0]);
@@ -246,49 +250,55 @@ vector<string> kgramstats::randomSentence(int n)
     {
       nextToken[0] = toupper(nextToken[0]);
     }
-    /*
-    if (startquote < next->startquote)
-    {
-      nextToken = "\"" + nextToken;
-    } else if (startparen < next->startparen)
+    
+    bool mess = (rand() % 100) == 0;
+    if (mess)
     {
-      nextToken = "(" + nextToken;
-    }
-		
-		if (period < next->period)
-		{
-      if (endquote < next->endquote)
+      nextToken = mstats.alternate(nextToken);
+
+      if (startquote < next->startquote)
       {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
+        nextToken = "\"" + nextToken;
+      } else if (startparen < next->startparen)
       {
-        nextToken += ")";
+        nextToken = "(" + nextToken;
       }
+		
+  		if (period < next->period)
+  		{
+        if (endquote < next->endquote)
+        {
+          nextToken += "\"";
+        } else if (endparen < next->endparen)
+        {
+          nextToken += ")";
+        }
       
-      int type = rand() % 6;
+        int type = rand() % 6;
       
-      if (type < 3)
-      {
-        nextToken += ".";
-      } else if (type < 5)
-      {
-        nextToken += "!";
-      } else {
-        nextToken += "?";
-      }
-		} else if (comma < next->comma)
-    {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
+        if (type < 3)
+        {
+          nextToken += ".";
+        } else if (type < 5)
+        {
+          nextToken += "!";
+        } else {
+          nextToken += "?";
+        }
+  		} else if (comma < next->comma)
       {
-        nextToken += ")";
-      }
+        if (endquote < next->endquote)
+        {
+          nextToken += "\"";
+        } else if (endparen < next->endparen)
+        {
+          nextToken += ")";
+        }
       
-      nextToken += ",";
+        nextToken += ",";
+      }
     }
-*/
+    
 		if (cur.size() == maxK)
 		{
 			cur.pop_front();
@@ -297,10 +307,17 @@ vector<string> kgramstats::randomSentence(int n)
 		/* DEBUG */
 		for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
 		{
-			cout << *it << " ";
+			std::cout << *it << " ";
 		}
 		
-		cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;
+		std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
+    
+    if (mess)
+    {
+      std::cout << " mala " << *(next->token);
+    }
+    
+    std::cout << std::endl;
     
     if ((cur == newKgram) || (cur == commaKgram))
     {
@@ -314,7 +331,15 @@ vector<string> kgramstats::randomSentence(int n)
     {
       cur = commaKgram;
     } else {
-      cur.push_back(*(next->token));
+      //if (mess && (rand() % 2 == 0))
+      if (false)
+      {
+        // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
+        cur.clear();
+        cur.push_back(nextToken);
+      } else {
+        cur.push_back(*(next->token));
+      }
     }
 		
 		result.push_back(nextToken);
@@ -330,11 +355,11 @@ bool removeIf(char c)
 
 std::string canonize(std::string f)
 {
-	string canonical(f);
-	transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
+	std::string canonical(f);
+	std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
   
-  string result;
-  remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
+  std::string result;
+  std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
 	
 	return canonical;
 }
diff --git a/kgramstats.h b/kgramstats.h
index 059eb05..b01dece 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -2,19 +2,18 @@
 #include <map>
 #include <list>
 #include <vector>
-
-using namespace::std;
+#include "malaprop.h"
 
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
 
-typedef list<string> kgram;
+typedef std::list<std::string> kgram;
 
 class kgramstats
 {
 public:
-	kgramstats(string corpus, int maxK);
-	vector<string> randomSentence(int n);
+	kgramstats(std::string corpus, int maxK);
+	std::vector<std::string> randomSentence(int n);
 	
 private:
 	typedef struct
@@ -28,13 +27,13 @@ private:
     int startparen;
     int endparen;
     int comma;
-		string* token;
+		std::string* token;
 	} token_data;
 	int maxK;
-	map<kgram, map<int, token_data*>* >* stats;
+	std::map<kgram, std::map<int, token_data*>* >* stats;
+  malaprop mstats;
 };
 
 void printKgram(kgram k);
-std::string canonize(std::string f);
 
 #endif
\ No newline at end of file
diff --git a/malaprop.cpp b/malaprop.cpp
new file mode 100644
index 0000000..bfea579
--- /dev/null
+++ b/malaprop.cpp
@@ -0,0 +1,127 @@
+#include "malaprop.h"
+#include <cstdlib>
+#include <iostream>
+// Predicate for std::remove_copy_if: true for any char isalpha() rejects. The cast keeps negative char values from being UB.
+bool removeIfM(char c)
+{
+  return !isalpha(static_cast<unsigned char>(c));
+}
+
+// Maps a lowercase letter to its Soundex consonant-class digit.
+//
+//   '1': b f p v
+//   '2': c g j k q s x z
+//   '3': d t
+//   '4': l
+//   '5': m n
+//   '6': r
+//
+// Every other character (vowels, h, w, y, uppercase letters, digits,
+// punctuation, NUL) is handed back unchanged.
+// soundify() relies on this: it only keeps results that are digits.
+char soundID(char l)
+{
+  // One string per class; the digit for a class is '1' plus its index.
+  static const char* const classes[] = {
+    "bfpv",
+    "cgjkqsxz",
+    "dt",
+    "l",
+    "mn",
+    "r"
+  };
+  const int classCount = sizeof(classes) / sizeof(classes[0]);
+  for (int idx = 0; idx < classCount; idx++)
+  {
+    for (const char* member = classes[idx]; *member != '\0'; member++)
+    {
+      if (*member == l)
+      {
+        return static_cast<char>('1' + idx);
+      }
+    }
+  }
+
+  return l;
+}
+// Returns f lowercased with every character that isalpha() rejects (digits, punctuation, whitespace, ...) removed.
+std::string canonizetwo(std::string f)
+{
+  std::string result;
+  for (std::string::size_type i = 0; i < f.length(); i++)
+  {
+    const unsigned char c = static_cast<unsigned char>(f[i]); // <cctype> calls are UB on negative char values
+    if (isalpha(c)) result += static_cast<char>(tolower(c));
+  }
+  return result;
+}
+// Soundex-style key for f: the first letter of its canonical form plus up to three consonant-class digits.
+malaprop::soundex malaprop::soundify(std::string f)
+{
+  std::string result(canonizetwo(f));
+  soundex ex;
+  ex.prefix = result.empty() ? '\0' : result[0]; // a word with no letters gets a NUL prefix
+
+  std::string output;
+  for (std::string::size_type i = 1; i < result.length(); i++)
+  {
+    const char c = soundID(result[i]);
+    const char prev = result[i-1];
+
+    // Soundex rule: same-class consonants separated only by h or w are coded once.
+    const bool repeatAcrossHW = (i >= 2) && (prev == 'h' || prev == 'w') && (c == soundID(result[i-2]));
+    if (isdigit(static_cast<unsigned char>(c)) // a coded consonant, not a vowel/h/w/y
+      && (c != soundID(prev))                  // not the same class as the previous letter
+      && !repeatAcrossHW)                      // not a repeat reached across h/w
+    {
+      output += c;
+    }
+  }
+
+  output.resize(3, '0'); // truncate or zero-pad to exactly three digits
+  ex.code = atoi(output.c_str());
+
+  return ex;
+}
+// Stores the canonical form of word in the dict bucket for its soundex key, where alternate() can later find it.
+void malaprop::addWord(std::string word)
+{
+  const std::string canonical = canonizetwo(word);
+  std::set<std::string>& bucket = dict[soundify(word)];
+  bucket.insert(canonical);
+}
+// Prints every soundex bucket to stdout as "<prefix><code> (<count>): word, word, " and then calls exit(0).
+void malaprop::stats()
+{
+  for (std::map<soundex, std::set<std::string> >::iterator it = dict.begin(); it != dict.end(); it++)
+  {
+    // size() yields a size_t; passing that for %d is undefined, so convert it to match the format
+    printf("%c%03d (%d): ", it->first.prefix, it->first.code, static_cast<int>(it->second.size()));
+    for (std::set<std::string>::iterator jt = it->second.begin(); jt != it->second.end(); jt++)
+    {
+      std::cout << *jt << ", ";
+    }
+
+    std::cout << std::endl;
+  }
+
+  exit(0); // NOTE(review): ends the whole program after the dump; presumably debug-only, confirm
+}
+// Picks a random stored word sharing word's soundex key; returns word itself when no such bucket exists.
+std::string malaprop::alternate(std::string word)
+{
+  // find() rather than operator[]: looking up an unseen word must not insert an empty bucket
+  std::map<soundex, std::set<std::string> >::const_iterator entry = dict.find(soundify(word));
+  if (entry == dict.end() || entry->second.empty())
+  {
+    return word; // nothing to choose from, and rand() % 0 would be undefined
+  }
+
+  const std::set<std::string>& opts = entry->second;
+  std::set<std::string>::const_iterator it = opts.begin();
+  for (size_t skip = rand() % opts.size(); skip > 0; skip--)
+  {
+    ++it; // walk to the chosen element; std::set has no random access
+  }
+  return *it;
+}
diff --git a/malaprop.h b/malaprop.h
new file mode 100644
index 0000000..91a18eb
--- /dev/null
+++ b/malaprop.h
@@ -0,0 +1,31 @@
+#ifndef MALAPROP_H_8F382336
+#define MALAPROP_H_8F382336
+
+#include <string>
+#include <map>
+#include <set>
+// Groups corpus words by a Soundex-style key so a word can be swapped for a similar-sounding one.
+class malaprop
+{
+public:
+  void addWord(std::string word);          // file word under its soundex key
+  void stats();                            // print every bucket to stdout, then exit(0)
+  std::string alternate(std::string word); // random stored word sharing word's key
+
+private:
+  struct soundex {
+    char prefix; // first letter of the canonicalized word
+    int code;    // following consonant classes, as a number of up to three digits
+    // Lexicographic on (prefix, code). std::map needs a strict weak ordering; "prefix < || code <" let {a,2} and {b,1} each be less than the other.
+    bool operator<(const soundex& other) const
+    {
+      return (prefix < other.prefix) || ((prefix == other.prefix) && (code < other.code));
+    }
+  };
+
+  std::map<soundex, std::set<std::string> > dict; // key -> canonical words with that key
+
+  soundex soundify(std::string l);
+};
+
+#endif /* end of include guard: MALAPROP_H_8F382336 */
-- 
cgit 1.4.1