From 01746a0e03267b6c082b58436c1370567f7cb7c5 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Sun, 22 Nov 2015 18:49:58 -0500 Subject: Added malapropisms --- Makefile.am | 4 +- ebooks.cpp | 2 - freevars.cpp | 22 ++++---- freevars.h | 8 ++- gen.cpp | 40 +++++++------- kgramstats.cpp | 161 +++++++++++++++++++++++++++++++++------------------------ kgramstats.h | 15 +++--- malaprop.cpp | 127 +++++++++++++++++++++++++++++++++++++++++++++ malaprop.h | 31 +++++++++++ 9 files changed, 293 insertions(+), 117 deletions(-) create mode 100644 malaprop.cpp create mode 100644 malaprop.h diff --git a/Makefile.am b/Makefile.am index 299dc10..5f6199b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -2,7 +2,7 @@ AUTOMAKE_OPTIONS = subdir-objects ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} bin_PROGRAMS = rawr-ebooks rawr-gen -rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp -rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp +rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp +rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file diff --git a/ebooks.cpp b/ebooks.cpp index 8e46ee9..27065d9 100644 --- a/ebooks.cpp +++ b/ebooks.cpp @@ -12,8 +12,6 @@ #include #include "freevars.h" -using namespace::std; - int main(int argc, char** args) { srand(time(NULL)); diff --git a/freevars.cpp b/freevars.cpp index 6472fef..8c3eda4 100644 --- a/freevars.cpp +++ b/freevars.cpp @@ -4,17 +4,17 @@ freevars::freevars() { - vars = new map* >(); + vars = new std::map* >(); } -void freevars::addVar(string name, string filename) +void freevars::addVar(std::string name, std::string filename) { - vector* eltlist = new vector(); + std::vector* eltlist = new std::vector(); - ifstream infile(filename.c_str()); + std::ifstream infile(filename.c_str()); if (infile) { - string line; + std::string line; while (getline(infile, line)) { @@ -27,18 +27,18 @@ void freevars::addVar(string name, string filename) (*vars)[name] = eltlist; } -string freevars::parse(string in) +std::string freevars::parse(std::string in) { - string res(in); + std::string res(in); - for (map* >::iterator it = vars->begin(); it != vars->end(); it++) + for (std::map* >::iterator it = vars->begin(); it != vars->end(); it++) { - string tofind = "$" + it->first + "$"; + std::string tofind = "$" + it->first + "$"; size_t fpos = res.find(tofind); - if (fpos != string::npos) + if (fpos != std::string::npos) { int r = rand() % it->second->size(); - res.replace(fpos, tofind.length(), (*it->second)[r], 0, string::npos); + res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); } } diff --git a/freevars.h b/freevars.h index 923f211..c92b9f5 100644 --- a/freevars.h +++ b/freevars.h @@ -2,8 +2,6 @@ #include #include -using namespace::std; - #ifndef FREEVARS_H #define FREEVARS_H @@ -11,11 +9,11 @@ class freevars { public: freevars(); - void addVar(string name, string filename); - string parse(string in); + void addVar(std::string name, std::string filename); + std::string parse(std::string in); private: - map* >* vars; + std::map* >* vars; }; #endif \ No newline at end of file diff --git a/gen.cpp b/gen.cpp index 31ba4dc..3284ffa 100644 --- a/gen.cpp +++ b/gen.cpp @@ -9,65 +9,63 @@ #include #include "freevars.h" -using namespace::std; - int main(int argc, char** args) { srand(time(NULL)); if (argc == 1) { - cout << "rawr-gen, version 1.0" << endl; - cout << "Usage: rawr-gen corpus-file" << endl; - cout << " where 'corpus-file' is the path to your input" << endl; + std::cout << "rawr-gen, version 1.0" << std::endl; + std::cout << "Usage: rawr-gen corpus-file" << std::endl; + std::cout << " where 'corpus-file' is the path to your input" << std::endl; return 0; } - ifstream infile(args[1]); + std::ifstream infile(args[1]); if (!infile) { - cout << "rawr-gen, version 1.0" << endl; - cout << "Usage: rawr-gen corpus-file" << endl; - cout << " where 'corpus-file' is the path to your input" << endl; - cout << endl; - cout << "The file you specified does not exist." << endl; + std::cout << "rawr-gen, version 1.0" << std::endl; + std::cout << "Usage: rawr-gen corpus-file" << std::endl; + std::cout << " where 'corpus-file' is the path to your input" << std::endl; + std::cout << std::endl; + std::cout << "The file you specified does not exist." << std::endl; return 0; } - string corpus; - string line; + std::string corpus; + std::string line; while (getline(infile, line)) { corpus += " " + line; } - cout << "Preprocessing corpus..." << endl; + std::cout << "Preprocessing corpus..." << std::endl; kgramstats* stats = new kgramstats(corpus, 3); - cout << "Preprocessing freevars..." << endl; + std::cout << "Preprocessing freevars..." << std::endl; freevars* vars = new freevars(); vars->addVar("name", "names.txt"); vars->addVar("noun", "nouns.txt"); - cout << "Generating..." << endl; + std::cout << "Generating..." << std::endl; for (;;) { - vector doc = stats->randomSentence(rand() % 35 + 15); - string hi; - for (vector::iterator it = doc.begin(); it != doc.end(); ++it) + std::vector doc = stats->randomSentence(rand() % 35 + 15); + std::string hi; + for (std::vector::iterator it = doc.begin(); it != doc.end(); ++it) { hi += vars->parse(*it) + " "; } size_t lastperiod = hi.find_last_of("."); - if ((lastperiod != string::npos) && (rand() % 3 > 0)) + if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) { hi = hi.substr(0, lastperiod+1); } - cout << hi << endl; + std::cout << hi << std::endl; getc(stdin); } diff --git a/kgramstats.cpp b/kgramstats.cpp index b4e68eb..17598de 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -3,31 +3,35 @@ #include #include #include +#include "malaprop.h" + +std::string canonize(std::string f); // runs in O(t^2) time where t is the number of tokens in the input corpus // We consider maxK to be fairly constant -kgramstats::kgramstats(string corpus, int maxK) +kgramstats::kgramstats(std::string corpus, int maxK) { this->maxK = maxK; - - vector tokens; - int start = 0; + + std::vector tokens; + size_t start = 0; int end = 0; - while (end != string::npos) + while (end != std::string::npos) { end = corpus.find(" ", start); - string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start); + std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); if (token.compare("")) { + mstats.addWord(token); tokens.push_back(token); } - start = ((end > (string::npos - 1) ) ? string::npos : end + 1); + start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); } - map* > tstats; + std::map* > tstats; bool newSentence = true; bool newClause = false; for (int k=0; k<=maxK; k++) @@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK) for (int i=0; i<(tokens.size() - k); i++) { kgram seq(tokens.begin()+i, tokens.begin()+i+k); - transform(seq.begin(), seq.end(), seq.begin(), canonize); - string f = tokens[i+k]; - string canonical = canonize(f); + std::transform(seq.begin(), seq.end(), seq.begin(), canonize); + std::string f = tokens[i+k]; + std::string canonical = canonize(f); if (tstats[seq] == NULL) { - tstats[seq] = new map(); + tstats[seq] = new std::map(); } if ((*tstats[seq])[canonical] == NULL) @@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK) } token_data* td = tstats[seq]->at(canonical); - td->token = new string(canonical); + td->token = new std::string(canonical); td->all++; if (newSentence) @@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK) kgram newKgram(1, "."); if (tstats[newKgram] == NULL) { - tstats[newKgram] = new map(); + tstats[newKgram] = new std::map(); } (*tstats[newKgram])[canonical] = td; @@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK) kgram commaKgram(1, ","); if (tstats[commaKgram] == NULL) { - tstats[commaKgram] = new map(); + tstats[commaKgram] = new std::map(); } (*tstats[commaKgram])[canonical] = td; @@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK) } } - stats = new map* >(); - for (map* >::iterator it = tstats.begin(); it != tstats.end(); it++) + stats = new std::map* >(); + for (std::map* >::iterator it = tstats.begin(); it != tstats.end(); it++) { kgram klist = it->first; - map* probtable = it->second; - map* distribution = new map(); + std::map* probtable = it->second; + std::map* distribution = new std::map(); int max = 0; - for (map::iterator kt = probtable->begin(); kt != probtable->end(); kt++) + for (std::map::iterator kt = probtable->begin(); kt != probtable->end(); kt++) { max += kt->second->all; @@ -187,17 +191,17 @@ void printKgram(kgram k) { for (kgram::iterator it = k.begin(); it != k.end(); it++) { - cout << *it << " "; + std::cout << *it << " "; } } // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus -vector kgramstats::randomSentence(int n) +std::vector kgramstats::randomSentence(int n) { - vector result; + std::vector result; kgram newKgram(1, "."); kgram commaKgram(1, ","); - list cur = newKgram; + std::list cur = newKgram; int cuts = 0; for (int i=0; i kgramstats::randomSentence(int n) cuts++; } - map distribution = *(*stats)[cur]; + std::map distribution = *(*stats)[cur]; int max = distribution.rbegin()->first; int r = rand() % max; token_data* next = distribution.upper_bound(r)->second; - string nextToken(*(next->token)); + std::string nextToken(*(next->token)); int casing = rand() % next->all; int period = rand() % next->all; int startparen = rand() % next->all; @@ -236,7 +240,7 @@ vector kgramstats::randomSentence(int n) int comma = rand() % next->all; if (casing < next->uppercase) { - transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); + std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); } else if ((casing - next->uppercase) < next->titlecase) { nextToken[0] = toupper(nextToken[0]); @@ -246,49 +250,55 @@ vector kgramstats::randomSentence(int n) { nextToken[0] = toupper(nextToken[0]); } - /* - if (startquote < next->startquote) - { - nextToken = "\"" + nextToken; - } else if (startparen < next->startparen) + + bool mess = (rand() % 100) == 0; + if (mess) { - nextToken = "(" + nextToken; - } - - if (period < next->period) - { - if (endquote < next->endquote) + nextToken = mstats.alternate(nextToken); + + if (startquote < next->startquote) { - nextToken += "\""; - } else if (endparen < next->endparen) + nextToken = "\"" + nextToken; + } else if (startparen < next->startparen) { - nextToken += ")"; + nextToken = "(" + nextToken; } + + if (period < next->period) + { + if (endquote < next->endquote) + { + nextToken += "\""; + } else if (endparen < next->endparen) + { + nextToken += ")"; + } - int type = rand() % 6; + int type = rand() % 6; - if (type < 3) - { - nextToken += "."; - } else if (type < 5) - { - nextToken += "!"; - } else { - nextToken += "?"; - } - } else if (comma < next->comma) - { - if (endquote < next->endquote) - { - nextToken += "\""; - } else if (endparen < next->endparen) + if (type < 3) + { + nextToken += "."; + } else if (type < 5) + { + nextToken += "!"; + } else { + nextToken += "?"; + } + } else if (comma < next->comma) { - nextToken += ")"; - } + if (endquote < next->endquote) + { + nextToken += "\""; + } else if (endparen < next->endparen) + { + nextToken += ")"; + } - nextToken += ","; + nextToken += ","; + } } -*/ + if (cur.size() == maxK) { cur.pop_front(); @@ -297,10 +307,17 @@ vector kgramstats::randomSentence(int n) /* DEBUG */ for (kgram::iterator it = cur.begin(); it != cur.end(); it++) { - cout << *it << " "; + std::cout << *it << " "; } - cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; + std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")"; + + if (mess) + { + std::cout << " mala " << *(next->token); + } + + std::cout << std::endl; if ((cur == newKgram) || (cur == commaKgram)) { @@ -314,7 +331,15 @@ vector kgramstats::randomSentence(int n) { cur = commaKgram; } else { - cur.push_back(*(next->token)); + //if (mess && (rand() % 2 == 0)) + if (false) + { + // This doesn't work because sometimes the alternate token isn't actually present in the original corpus + cur.clear(); + cur.push_back(nextToken); + } else { + cur.push_back(*(next->token)); + } } result.push_back(nextToken); @@ -330,11 +355,11 @@ bool removeIf(char c) std::string canonize(std::string f) { - string canonical(f); - transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); + std::string canonical(f); + std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); - string result; - remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); + std::string result; + std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); return canonical; } diff --git a/kgramstats.h b/kgramstats.h index 059eb05..b01dece 100644 --- a/kgramstats.h +++ b/kgramstats.h @@ -2,19 +2,18 @@ #include #include #include - -using namespace::std; +#include "malaprop.h" #ifndef KGRAMSTATS_H #define KGRAMSTATS_H -typedef list kgram; +typedef std::list kgram; class kgramstats { public: - kgramstats(string corpus, int maxK); - vector randomSentence(int n); + kgramstats(std::string corpus, int maxK); + std::vector randomSentence(int n); private: typedef struct @@ -28,13 +27,13 @@ private: int startparen; int endparen; int comma; - string* token; + std::string* token; } token_data; int maxK; - map* >* stats; + std::map* >* stats; + malaprop mstats; }; void printKgram(kgram k); -std::string canonize(std::string f); #endif \ No newline at end of file diff --git a/malaprop.cpp b/malaprop.cpp new file mode 100644 index 0000000..bfea579 --- /dev/null +++ b/malaprop.cpp @@ -0,0 +1,127 @@ +#include "malaprop.h" +#include +#include + +bool removeIfM(char c) +{ + return !isalpha(c); +} + +char soundID(char l) +{ + switch (l) + { + case 'b': + case 'f': + case 'p': + case 'v': + return '1'; + + case 'c': + case 'g': + case 'j': + case 'k': + case 'q': + case 's': + case 'x': + case 'z': + return '2'; + + case 'd': + case 't': + return '3'; + + case 'l': + return '4'; + + case 'm': + case 'n': + return '5'; + + case 'r': + return '6'; + } + + return l; +} + +std::string canonizetwo(std::string f) +{ + std::string canonical(f); + std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); + + std::string result; + std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIfM); + + return result; +} + +malaprop::soundex malaprop::soundify(std::string f) +{ + std::string result(canonizetwo(f)); + + soundex ex; + ex.prefix = result[0]; + + std::string output; + + for (int i = 1; i >::iterator it = dict.begin(); it != dict.end(); it++) + { + printf("%c%03d (%d): ", it->first.prefix, it->first.code, it->second.size()); + + for (std::set::iterator jt = it->second.begin(); jt != it->second.end(); jt++) + { + std::cout << *jt << ", "; + } + + std::cout << std::endl; + } + + exit(0); +} + +std::string malaprop::alternate(std::string word) +{ + soundex ex = soundify(word); + std::set& opts = dict[ex]; + int opt = rand() % opts.size(); + for (std::set::iterator it = opts.begin(); it != opts.end(); it++) + { + if (opt == 0) + { + return *it; + } + + opt--; + } + + return word; +} diff --git a/malaprop.h b/malaprop.h new file mode 100644 index 0000000..91a18eb --- /dev/null +++ b/malaprop.h @@ -0,0 +1,31 @@ +#ifndef MALAPROP_H_8F382336 +#define MALAPROP_H_8F382336 + +#include +#include +#include + +class malaprop +{ +public: + void addWord(std::string word); + void stats(); + std::string alternate(std::string word); + +private: + struct soundex { + char prefix; + int code; + + bool operator<(const soundex& other) const + { + return (prefix < other.prefix) || (code < other.code); + } + }; + + std::map > dict; + + soundex soundify(std::string l); +}; + +#endif /* end of include guard: MALAPROP_H_8F382336 */ -- cgit 1.4.1