diff options
| -rw-r--r-- | Makefile.am | 4 | ||||
| -rw-r--r-- | ebooks.cpp | 2 | ||||
| -rw-r--r-- | freevars.cpp | 22 | ||||
| -rw-r--r-- | freevars.h | 8 | ||||
| -rw-r--r-- | gen.cpp | 40 | ||||
| -rw-r--r-- | kgramstats.cpp | 161 | ||||
| -rw-r--r-- | kgramstats.h | 15 | ||||
| -rw-r--r-- | malaprop.cpp | 127 | ||||
| -rw-r--r-- | malaprop.h | 31 |
9 files changed, 293 insertions, 117 deletions
| diff --git a/Makefile.am b/Makefile.am index 299dc10..5f6199b 100644 --- a/Makefile.am +++ b/Makefile.am | |||
| @@ -2,7 +2,7 @@ AUTOMAKE_OPTIONS = subdir-objects | |||
| 2 | ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} | 2 | ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} |
| 3 | 3 | ||
| 4 | bin_PROGRAMS = rawr-ebooks rawr-gen | 4 | bin_PROGRAMS = rawr-ebooks rawr-gen |
| 5 | rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp | 5 | rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp |
| 6 | rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp | 6 | rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp |
| 7 | rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) | 7 | rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) |
| 8 | rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file | 8 | rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file |
| diff --git a/ebooks.cpp b/ebooks.cpp index 8e46ee9..27065d9 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
| @@ -12,8 +12,6 @@ | |||
| 12 | #include <yaml-cpp/yaml.h> | 12 | #include <yaml-cpp/yaml.h> |
| 13 | #include "freevars.h" | 13 | #include "freevars.h" |
| 14 | 14 | ||
| 15 | using namespace::std; | ||
| 16 | |||
| 17 | int main(int argc, char** args) | 15 | int main(int argc, char** args) |
| 18 | { | 16 | { |
| 19 | srand(time(NULL)); | 17 | srand(time(NULL)); |
| diff --git a/freevars.cpp b/freevars.cpp index 6472fef..8c3eda4 100644 --- a/freevars.cpp +++ b/freevars.cpp | |||
| @@ -4,17 +4,17 @@ | |||
| 4 | 4 | ||
| 5 | freevars::freevars() | 5 | freevars::freevars() |
| 6 | { | 6 | { |
| 7 | vars = new map<string, vector<string>* >(); | 7 | vars = new std::map<std::string, std::vector<std::string>* >(); |
| 8 | } | 8 | } |
| 9 | 9 | ||
| 10 | void freevars::addVar(string name, string filename) | 10 | void freevars::addVar(std::string name, std::string filename) |
| 11 | { | 11 | { |
| 12 | vector<string>* eltlist = new vector<string>(); | 12 | std::vector<std::string>* eltlist = new std::vector<std::string>(); |
| 13 | 13 | ||
| 14 | ifstream infile(filename.c_str()); | 14 | std::ifstream infile(filename.c_str()); |
| 15 | if (infile) | 15 | if (infile) |
| 16 | { | 16 | { |
| 17 | string line; | 17 | std::string line; |
| 18 | 18 | ||
| 19 | while (getline(infile, line)) | 19 | while (getline(infile, line)) |
| 20 | { | 20 | { |
| @@ -27,18 +27,18 @@ void freevars::addVar(string name, string filename) | |||
| 27 | (*vars)[name] = eltlist; | 27 | (*vars)[name] = eltlist; |
| 28 | } | 28 | } |
| 29 | 29 | ||
| 30 | string freevars::parse(string in) | 30 | std::string freevars::parse(std::string in) |
| 31 | { | 31 | { |
| 32 | string res(in); | 32 | std::string res(in); |
| 33 | 33 | ||
| 34 | for (map<string, vector<string>* >::iterator it = vars->begin(); it != vars->end(); it++) | 34 | for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) |
| 35 | { | 35 | { |
| 36 | string tofind = "$" + it->first + "$"; | 36 | std::string tofind = "$" + it->first + "$"; |
| 37 | size_t fpos = res.find(tofind); | 37 | size_t fpos = res.find(tofind); |
| 38 | if (fpos != string::npos) | 38 | if (fpos != std::string::npos) |
| 39 | { | 39 | { |
| 40 | int r = rand() % it->second->size(); | 40 | int r = rand() % it->second->size(); |
| 41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, string::npos); | 41 | res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos); |
| 42 | } | 42 | } |
| 43 | } | 43 | } |
| 44 | 44 | ||
| diff --git a/freevars.h b/freevars.h index 923f211..c92b9f5 100644 --- a/freevars.h +++ b/freevars.h | |||
| @@ -2,8 +2,6 @@ | |||
| 2 | #include <string> | 2 | #include <string> |
| 3 | #include <vector> | 3 | #include <vector> |
| 4 | 4 | ||
| 5 | using namespace::std; | ||
| 6 | |||
| 7 | #ifndef FREEVARS_H | 5 | #ifndef FREEVARS_H |
| 8 | #define FREEVARS_H | 6 | #define FREEVARS_H |
| 9 | 7 | ||
| @@ -11,11 +9,11 @@ class freevars | |||
| 11 | { | 9 | { |
| 12 | public: | 10 | public: |
| 13 | freevars(); | 11 | freevars(); |
| 14 | void addVar(string name, string filename); | 12 | void addVar(std::string name, std::string filename); |
| 15 | string parse(string in); | 13 | std::string parse(std::string in); |
| 16 | 14 | ||
| 17 | private: | 15 | private: |
| 18 | map<string, vector<string>* >* vars; | 16 | std::map<std::string, std::vector<std::string>* >* vars; |
| 19 | }; | 17 | }; |
| 20 | 18 | ||
| 21 | #endif \ No newline at end of file | 19 | #endif \ No newline at end of file |
| diff --git a/gen.cpp b/gen.cpp index 31ba4dc..3284ffa 100644 --- a/gen.cpp +++ b/gen.cpp | |||
| @@ -9,65 +9,63 @@ | |||
| 9 | #include <iostream> | 9 | #include <iostream> |
| 10 | #include "freevars.h" | 10 | #include "freevars.h" |
| 11 | 11 | ||
| 12 | using namespace::std; | ||
| 13 | |||
| 14 | int main(int argc, char** args) | 12 | int main(int argc, char** args) |
| 15 | { | 13 | { |
| 16 | srand(time(NULL)); | 14 | srand(time(NULL)); |
| 17 | 15 | ||
| 18 | if (argc == 1) | 16 | if (argc == 1) |
| 19 | { | 17 | { |
| 20 | cout << "rawr-gen, version 1.0" << endl; | 18 | std::cout << "rawr-gen, version 1.0" << std::endl; |
| 21 | cout << "Usage: rawr-gen corpus-file" << endl; | 19 | std::cout << "Usage: rawr-gen corpus-file" << std::endl; |
| 22 | cout << " where 'corpus-file' is the path to your input" << endl; | 20 | std::cout << " where 'corpus-file' is the path to your input" << std::endl; |
| 23 | 21 | ||
| 24 | return 0; | 22 | return 0; |
| 25 | } | 23 | } |
| 26 | 24 | ||
| 27 | ifstream infile(args[1]); | 25 | std::ifstream infile(args[1]); |
| 28 | if (!infile) | 26 | if (!infile) |
| 29 | { | 27 | { |
| 30 | cout << "rawr-gen, version 1.0" << endl; | 28 | std::cout << "rawr-gen, version 1.0" << std::endl; |
| 31 | cout << "Usage: rawr-gen corpus-file" << endl; | 29 | std::cout << "Usage: rawr-gen corpus-file" << std::endl; |
| 32 | cout << " where 'corpus-file' is the path to your input" << endl; | 30 | std::cout << " where 'corpus-file' is the path to your input" << std::endl; |
| 33 | cout << endl; | 31 | std::cout << std::endl; |
| 34 | cout << "The file you specified does not exist." << endl; | 32 | std::cout << "The file you specified does not exist." << std::endl; |
| 35 | 33 | ||
| 36 | return 0; | 34 | return 0; |
| 37 | } | 35 | } |
| 38 | 36 | ||
| 39 | string corpus; | 37 | std::string corpus; |
| 40 | string line; | 38 | std::string line; |
| 41 | while (getline(infile, line)) | 39 | while (getline(infile, line)) |
| 42 | { | 40 | { |
| 43 | corpus += " " + line; | 41 | corpus += " " + line; |
| 44 | } | 42 | } |
| 45 | 43 | ||
| 46 | cout << "Preprocessing corpus..." << endl; | 44 | std::cout << "Preprocessing corpus..." << std::endl; |
| 47 | kgramstats* stats = new kgramstats(corpus, 3); | 45 | kgramstats* stats = new kgramstats(corpus, 3); |
| 48 | 46 | ||
| 49 | cout << "Preprocessing freevars..." << endl; | 47 | std::cout << "Preprocessing freevars..." << std::endl; |
| 50 | freevars* vars = new freevars(); | 48 | freevars* vars = new freevars(); |
| 51 | vars->addVar("name", "names.txt"); | 49 | vars->addVar("name", "names.txt"); |
| 52 | vars->addVar("noun", "nouns.txt"); | 50 | vars->addVar("noun", "nouns.txt"); |
| 53 | 51 | ||
| 54 | cout << "Generating..." << endl; | 52 | std::cout << "Generating..." << std::endl; |
| 55 | for (;;) | 53 | for (;;) |
| 56 | { | 54 | { |
| 57 | vector<string> doc = stats->randomSentence(rand() % 35 + 15); | 55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); |
| 58 | string hi; | 56 | std::string hi; |
| 59 | for (vector<string>::iterator it = doc.begin(); it != doc.end(); ++it) | 57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) |
| 60 | { | 58 | { |
| 61 | hi += vars->parse(*it) + " "; | 59 | hi += vars->parse(*it) + " "; |
| 62 | } | 60 | } |
| 63 | 61 | ||
| 64 | size_t lastperiod = hi.find_last_of("."); | 62 | size_t lastperiod = hi.find_last_of("."); |
| 65 | if ((lastperiod != string::npos) && (rand() % 3 > 0)) | 63 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) |
| 66 | { | 64 | { |
| 67 | hi = hi.substr(0, lastperiod+1); | 65 | hi = hi.substr(0, lastperiod+1); |
| 68 | } | 66 | } |
| 69 | 67 | ||
| 70 | cout << hi << endl; | 68 | std::cout << hi << std::endl; |
| 71 | 69 | ||
| 72 | getc(stdin); | 70 | getc(stdin); |
| 73 | } | 71 | } |
| diff --git a/kgramstats.cpp b/kgramstats.cpp index b4e68eb..17598de 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -3,31 +3,35 @@ | |||
| 3 | #include <iostream> | 3 | #include <iostream> |
| 4 | #include <cstdlib> | 4 | #include <cstdlib> |
| 5 | #include <algorithm> | 5 | #include <algorithm> |
| 6 | #include "malaprop.h" | ||
| 7 | |||
| 8 | std::string canonize(std::string f); | ||
| 6 | 9 | ||
| 7 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 10 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
| 8 | // We consider maxK to be fairly constant | 11 | // We consider maxK to be fairly constant |
| 9 | kgramstats::kgramstats(string corpus, int maxK) | 12 | kgramstats::kgramstats(std::string corpus, int maxK) |
| 10 | { | 13 | { |
| 11 | this->maxK = maxK; | 14 | this->maxK = maxK; |
| 12 | 15 | ||
| 13 | vector<string> tokens; | 16 | std::vector<std::string> tokens; |
| 14 | int start = 0; | 17 | size_t start = 0; |
| 15 | int end = 0; | 18 | int end = 0; |
| 16 | 19 | ||
| 17 | while (end != string::npos) | 20 | while (end != std::string::npos) |
| 18 | { | 21 | { |
| 19 | end = corpus.find(" ", start); | 22 | end = corpus.find(" ", start); |
| 20 | 23 | ||
| 21 | string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start); | 24 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
| 22 | if (token.compare("")) | 25 | if (token.compare("")) |
| 23 | { | 26 | { |
| 27 | mstats.addWord(token); | ||
| 24 | tokens.push_back(token); | 28 | tokens.push_back(token); |
| 25 | } | 29 | } |
| 26 | 30 | ||
| 27 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); | 31 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
| 28 | } | 32 | } |
| 29 | 33 | ||
| 30 | map<kgram, map<string, token_data*>* > tstats; | 34 | std::map<kgram, std::map<std::string, token_data*>* > tstats; |
| 31 | bool newSentence = true; | 35 | bool newSentence = true; |
| 32 | bool newClause = false; | 36 | bool newClause = false; |
| 33 | for (int k=0; k<=maxK; k++) | 37 | for (int k=0; k<=maxK; k++) |
| @@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 35 | for (int i=0; i<(tokens.size() - k); i++) | 39 | for (int i=0; i<(tokens.size() - k); i++) |
| 36 | { | 40 | { |
| 37 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 41 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); |
| 38 | transform(seq.begin(), seq.end(), seq.begin(), canonize); | 42 | std::transform(seq.begin(), seq.end(), seq.begin(), canonize); |
| 39 | string f = tokens[i+k]; | 43 | std::string f = tokens[i+k]; |
| 40 | string canonical = canonize(f); | 44 | std::string canonical = canonize(f); |
| 41 | 45 | ||
| 42 | if (tstats[seq] == NULL) | 46 | if (tstats[seq] == NULL) |
| 43 | { | 47 | { |
| 44 | tstats[seq] = new map<string, token_data*>(); | 48 | tstats[seq] = new std::map<std::string, token_data*>(); |
| 45 | } | 49 | } |
| 46 | 50 | ||
| 47 | if ((*tstats[seq])[canonical] == NULL) | 51 | if ((*tstats[seq])[canonical] == NULL) |
| @@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 50 | } | 54 | } |
| 51 | 55 | ||
| 52 | token_data* td = tstats[seq]->at(canonical); | 56 | token_data* td = tstats[seq]->at(canonical); |
| 53 | td->token = new string(canonical); | 57 | td->token = new std::string(canonical); |
| 54 | td->all++; | 58 | td->all++; |
| 55 | 59 | ||
| 56 | if (newSentence) | 60 | if (newSentence) |
| @@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 58 | kgram newKgram(1, "."); | 62 | kgram newKgram(1, "."); |
| 59 | if (tstats[newKgram] == NULL) | 63 | if (tstats[newKgram] == NULL) |
| 60 | { | 64 | { |
| 61 | tstats[newKgram] = new map<string, token_data*>(); | 65 | tstats[newKgram] = new std::map<std::string, token_data*>(); |
| 62 | } | 66 | } |
| 63 | 67 | ||
| 64 | (*tstats[newKgram])[canonical] = td; | 68 | (*tstats[newKgram])[canonical] = td; |
| @@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 71 | kgram commaKgram(1, ","); | 75 | kgram commaKgram(1, ","); |
| 72 | if (tstats[commaKgram] == NULL) | 76 | if (tstats[commaKgram] == NULL) |
| 73 | { | 77 | { |
| 74 | tstats[commaKgram] = new map<string, token_data*>(); | 78 | tstats[commaKgram] = new std::map<std::string, token_data*>(); |
| 75 | } | 79 | } |
| 76 | 80 | ||
| 77 | (*tstats[commaKgram])[canonical] = td; | 81 | (*tstats[commaKgram])[canonical] = td; |
| @@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 164 | } | 168 | } |
| 165 | } | 169 | } |
| 166 | 170 | ||
| 167 | stats = new map<kgram, map<int, token_data*>* >(); | 171 | stats = new std::map<kgram, std::map<int, token_data*>* >(); |
| 168 | for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) | 172 | for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) |
| 169 | { | 173 | { |
| 170 | kgram klist = it->first; | 174 | kgram klist = it->first; |
| 171 | map<string, token_data*>* probtable = it->second; | 175 | std::map<std::string, token_data*>* probtable = it->second; |
| 172 | map<int, token_data*>* distribution = new map<int, token_data*>(); | 176 | std::map<int, token_data*>* distribution = new std::map<int, token_data*>(); |
| 173 | int max = 0; | 177 | int max = 0; |
| 174 | 178 | ||
| 175 | for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) | 179 | for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) |
| 176 | { | 180 | { |
| 177 | max += kt->second->all; | 181 | max += kt->second->all; |
| 178 | 182 | ||
| @@ -187,17 +191,17 @@ void printKgram(kgram k) | |||
| 187 | { | 191 | { |
| 188 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 192 | for (kgram::iterator it = k.begin(); it != k.end(); it++) |
| 189 | { | 193 | { |
| 190 | cout << *it << " "; | 194 | std::cout << *it << " "; |
| 191 | } | 195 | } |
| 192 | } | 196 | } |
| 193 | 197 | ||
| 194 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 198 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
| 195 | vector<string> kgramstats::randomSentence(int n) | 199 | std::vector<std::string> kgramstats::randomSentence(int n) |
| 196 | { | 200 | { |
| 197 | vector<string> result; | 201 | std::vector<std::string> result; |
| 198 | kgram newKgram(1, "."); | 202 | kgram newKgram(1, "."); |
| 199 | kgram commaKgram(1, ","); | 203 | kgram commaKgram(1, ","); |
| 200 | list<string> cur = newKgram; | 204 | std::list<std::string> cur = newKgram; |
| 201 | int cuts = 0; | 205 | int cuts = 0; |
| 202 | 206 | ||
| 203 | for (int i=0; i<n; i++) | 207 | for (int i=0; i<n; i++) |
| @@ -221,12 +225,12 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 221 | cuts++; | 225 | cuts++; |
| 222 | } | 226 | } |
| 223 | 227 | ||
| 224 | map<int, token_data*> distribution = *(*stats)[cur]; | 228 | std::map<int, token_data*> distribution = *(*stats)[cur]; |
| 225 | int max = distribution.rbegin()->first; | 229 | int max = distribution.rbegin()->first; |
| 226 | int r = rand() % max; | 230 | int r = rand() % max; |
| 227 | token_data* next = distribution.upper_bound(r)->second; | 231 | token_data* next = distribution.upper_bound(r)->second; |
| 228 | 232 | ||
| 229 | string nextToken(*(next->token)); | 233 | std::string nextToken(*(next->token)); |
| 230 | int casing = rand() % next->all; | 234 | int casing = rand() % next->all; |
| 231 | int period = rand() % next->all; | 235 | int period = rand() % next->all; |
| 232 | int startparen = rand() % next->all; | 236 | int startparen = rand() % next->all; |
| @@ -236,7 +240,7 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 236 | int comma = rand() % next->all; | 240 | int comma = rand() % next->all; |
| 237 | if (casing < next->uppercase) | 241 | if (casing < next->uppercase) |
| 238 | { | 242 | { |
| 239 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 243 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); |
| 240 | } else if ((casing - next->uppercase) < next->titlecase) | 244 | } else if ((casing - next->uppercase) < next->titlecase) |
| 241 | { | 245 | { |
| 242 | nextToken[0] = toupper(nextToken[0]); | 246 | nextToken[0] = toupper(nextToken[0]); |
| @@ -246,49 +250,55 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 246 | { | 250 | { |
| 247 | nextToken[0] = toupper(nextToken[0]); | 251 | nextToken[0] = toupper(nextToken[0]); |
| 248 | } | 252 | } |
| 249 | /* | 253 | |
| 250 | if (startquote < next->startquote) | 254 | bool mess = (rand() % 100) == 0; |
| 251 | { | 255 | if (mess) |
| 252 | nextToken = "\"" + nextToken; | ||
| 253 | } else if (startparen < next->startparen) | ||
| 254 | { | 256 | { |
| 255 | nextToken = "(" + nextToken; | 257 | nextToken = mstats.alternate(nextToken); |
| 256 | } | 258 | |
| 257 | 259 | if (startquote < next->startquote) | |
| 258 | if (period < next->period) | ||
| 259 | { | ||
| 260 | if (endquote < next->endquote) | ||
| 261 | { | 260 | { |
| 262 | nextToken += "\""; | 261 | nextToken = "\"" + nextToken; |
| 263 | } else if (endparen < next->endparen) | 262 | } else if (startparen < next->startparen) |
| 264 | { | 263 | { |
| 265 | nextToken += ")"; | 264 | nextToken = "(" + nextToken; |
| 266 | } | 265 | } |
| 266 | |||
| 267 | if (period < next->period) | ||
| 268 | { | ||
| 269 | if (endquote < next->endquote) | ||
| 270 | { | ||
| 271 | nextToken += "\""; | ||
| 272 | } else if (endparen < next->endparen) | ||
| 273 | { | ||
| 274 | nextToken += ")"; | ||
| 275 | } | ||
| 267 | 276 | ||
| 268 | int type = rand() % 6; | 277 | int type = rand() % 6; |
| 269 | 278 | ||
| 270 | if (type < 3) | 279 | if (type < 3) |
| 271 | { | 280 | { |
| 272 | nextToken += "."; | 281 | nextToken += "."; |
| 273 | } else if (type < 5) | 282 | } else if (type < 5) |
| 274 | { | 283 | { |
| 275 | nextToken += "!"; | 284 | nextToken += "!"; |
| 276 | } else { | 285 | } else { |
| 277 | nextToken += "?"; | 286 | nextToken += "?"; |
| 278 | } | 287 | } |
| 279 | } else if (comma < next->comma) | 288 | } else if (comma < next->comma) |
| 280 | { | ||
| 281 | if (endquote < next->endquote) | ||
| 282 | { | ||
| 283 | nextToken += "\""; | ||
| 284 | } else if (endparen < next->endparen) | ||
| 285 | { | 289 | { |
| 286 | nextToken += ")"; | 290 | if (endquote < next->endquote) |
| 287 | } | 291 | { |
| 292 | nextToken += "\""; | ||
| 293 | } else if (endparen < next->endparen) | ||
| 294 | { | ||
| 295 | nextToken += ")"; | ||
| 296 | } | ||
| 288 | 297 | ||
| 289 | nextToken += ","; | 298 | nextToken += ","; |
| 299 | } | ||
| 290 | } | 300 | } |
| 291 | */ | 301 | |
| 292 | if (cur.size() == maxK) | 302 | if (cur.size() == maxK) |
| 293 | { | 303 | { |
| 294 | cur.pop_front(); | 304 | cur.pop_front(); |
| @@ -297,10 +307,17 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 297 | /* DEBUG */ | 307 | /* DEBUG */ |
| 298 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) | 308 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) |
| 299 | { | 309 | { |
| 300 | cout << *it << " "; | 310 | std::cout << *it << " "; |
| 301 | } | 311 | } |
| 302 | 312 | ||
| 303 | cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; | 313 | std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")"; |
| 314 | |||
| 315 | if (mess) | ||
| 316 | { | ||
| 317 | std::cout << " mala " << *(next->token); | ||
| 318 | } | ||
| 319 | |||
| 320 | std::cout << std::endl; | ||
| 304 | 321 | ||
| 305 | if ((cur == newKgram) || (cur == commaKgram)) | 322 | if ((cur == newKgram) || (cur == commaKgram)) |
| 306 | { | 323 | { |
| @@ -314,7 +331,15 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 314 | { | 331 | { |
| 315 | cur = commaKgram; | 332 | cur = commaKgram; |
| 316 | } else { | 333 | } else { |
| 317 | cur.push_back(*(next->token)); | 334 | //if (mess && (rand() % 2 == 0)) |
| 335 | if (false) | ||
| 336 | { | ||
| 337 | // This doesn't work because sometimes the alternate token isn't actually present in the original corpus | ||
| 338 | cur.clear(); | ||
| 339 | cur.push_back(nextToken); | ||
| 340 | } else { | ||
| 341 | cur.push_back(*(next->token)); | ||
| 342 | } | ||
| 318 | } | 343 | } |
| 319 | 344 | ||
| 320 | result.push_back(nextToken); | 345 | result.push_back(nextToken); |
| @@ -330,11 +355,11 @@ bool removeIf(char c) | |||
| 330 | 355 | ||
// Returns the canonical form of a token: the same characters, lowercased.
//
// f: the raw token (taken by value, so it is lowercased in place).
// Returns the lowercased copy; punctuation is left untouched.
//
// NOTE(review): the previous body also built a copy filtered through
// removeIf() and then discarded it, returning the unfiltered lowercase
// string. That dead work is dropped here and the returned value is unchanged.
// If stripping characters was the intent, callers keyed on the current
// canonical form need to be reviewed before changing the return value.
std::string canonize(std::string f)
{
  for (std::string::size_type i = 0; i < f.size(); ++i)
  {
    // The <cctype> functions are only defined for values representable as
    // unsigned char (or EOF); a negative plain char is undefined behaviour.
    f[i] = static_cast<char>(::tolower(static_cast<unsigned char>(f[i])));
  }

  return f;
}
| diff --git a/kgramstats.h b/kgramstats.h index 059eb05..b01dece 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -2,19 +2,18 @@ | |||
| 2 | #include <map> | 2 | #include <map> |
| 3 | #include <list> | 3 | #include <list> |
| 4 | #include <vector> | 4 | #include <vector> |
| 5 | 5 | #include "malaprop.h" | |
| 6 | using namespace::std; | ||
| 7 | 6 | ||
| 8 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
| 9 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
| 10 | 9 | ||
| 11 | typedef list<string> kgram; | 10 | typedef std::list<std::string> kgram; |
| 12 | 11 | ||
| 13 | class kgramstats | 12 | class kgramstats |
| 14 | { | 13 | { |
| 15 | public: | 14 | public: |
| 16 | kgramstats(string corpus, int maxK); | 15 | kgramstats(std::string corpus, int maxK); |
| 17 | vector<string> randomSentence(int n); | 16 | std::vector<std::string> randomSentence(int n); |
| 18 | 17 | ||
| 19 | private: | 18 | private: |
| 20 | typedef struct | 19 | typedef struct |
| @@ -28,13 +27,13 @@ private: | |||
| 28 | int startparen; | 27 | int startparen; |
| 29 | int endparen; | 28 | int endparen; |
| 30 | int comma; | 29 | int comma; |
| 31 | string* token; | 30 | std::string* token; |
| 32 | } token_data; | 31 | } token_data; |
| 33 | int maxK; | 32 | int maxK; |
| 34 | map<kgram, map<int, token_data*>* >* stats; | 33 | std::map<kgram, std::map<int, token_data*>* >* stats; |
| 34 | malaprop mstats; | ||
| 35 | }; | 35 | }; |
| 36 | 36 | ||
| 37 | void printKgram(kgram k); | 37 | void printKgram(kgram k); |
| 38 | std::string canonize(std::string f); | ||
| 39 | 38 | ||
| 40 | #endif \ No newline at end of file | 39 | #endif \ No newline at end of file |
| diff --git a/malaprop.cpp b/malaprop.cpp new file mode 100644 index 0000000..bfea579 --- /dev/null +++ b/malaprop.cpp | |||
| @@ -0,0 +1,127 @@ | |||
| 1 | #include "malaprop.h" | ||
| 2 | #include <cstdlib> | ||
| 3 | #include <iostream> | ||
| 4 | |||
// Predicate for std::remove_copy_if: returns true for every character that
// is NOT an alphabetic letter, i.e. every character that should be dropped.
bool removeIfM(char c)
{
  // The <cctype> classifiers are only defined for values representable as
  // unsigned char (or EOF); a negative plain char (bytes >= 0x80 where char
  // is signed) would be undefined behaviour.
  return !isalpha(static_cast<unsigned char>(c));
}
| 9 | |||
// Maps a lowercase consonant to the digit character of its sound group
// ('1' through '6'). Any other character - vowels, 'h', 'w', 'y', uppercase
// letters, non-letters - is returned unchanged.
char soundID(char l)
{
  struct sound_group
  {
    const char* letters;
    char code;
  };

  static const sound_group groups[] = {
    { "bfpv",     '1' },
    { "cgjkqsxz", '2' },
    { "dt",       '3' },
    { "l",        '4' },
    { "mn",       '5' },
    { "r",        '6' }
  };

  const unsigned int groupCount = sizeof(groups) / sizeof(groups[0]);
  for (unsigned int g = 0; g < groupCount; ++g)
  {
    // Stops at the terminator, so a '\0' input never matches a group.
    for (const char* member = groups[g].letters; *member != '\0'; ++member)
    {
      if (*member == l)
      {
        return groups[g].code;
      }
    }
  }

  return l;
}
| 47 | |||
// Reduces a token to its letters only, lowercased: every character is
// lowercased first and then kept only if it is alphabetic.
//
// f: the raw token.
// Returns the filtered string; empty when f contains no letters.
std::string canonizetwo(std::string f)
{
  std::string result;
  result.reserve(f.size());

  for (std::string::size_type i = 0; i < f.size(); ++i)
  {
    // Go through unsigned char: the <cctype> functions are undefined for
    // negative values, which plain char produces for bytes >= 0x80 on
    // platforms where char is signed.
    const int lowered = tolower(static_cast<unsigned char>(f[i]));
    if (isalpha(lowered))
    {
      result += static_cast<char>(lowered);
    }
  }

  return result;
}
| 58 | |||
| 59 | malaprop::soundex malaprop::soundify(std::string f) | ||
| 60 | { | ||
| 61 | std::string result(canonizetwo(f)); | ||
| 62 | |||
| 63 | soundex ex; | ||
| 64 | ex.prefix = result[0]; | ||
| 65 | |||
| 66 | std::string output; | ||
| 67 | |||
| 68 | for (int i = 1; i<result.length(); i++) | ||
| 69 | { | ||
| 70 | int c = soundID(result[i]); | ||
| 71 | if ( | ||
| 72 | (isdigit(c)) // Not a vowel | ||
| 73 | && (c != soundID(result[i-1])) // Not the same as the previous character | ||
| 74 | && ((i < 2) || ((result[i-1] = 'h' || result[i-1] == 'w') && (c != soundID(result[i-2])))) // Not same as before h/w | ||
| 75 | ) | ||
| 76 | { | ||
| 77 | output += c; | ||
| 78 | } | ||
| 79 | } | ||
| 80 | |||
| 81 | output.resize(3, '0'); | ||
| 82 | ex.code = atoi(output.c_str()); | ||
| 83 | |||
| 84 | return ex; | ||
| 85 | } | ||
| 86 | |||
| 87 | void malaprop::addWord(std::string word) | ||
| 88 | { | ||
| 89 | soundex ex = soundify(word); | ||
| 90 | |||
| 91 | dict[ex].insert(canonizetwo(word)); | ||
| 92 | } | ||
| 93 | |||
| 94 | void malaprop::stats() | ||
| 95 | { | ||
| 96 | for (std::map<soundex, std::set<std::string> >::iterator it = dict.begin(); it != dict.end(); it++) | ||
| 97 | { | ||
| 98 | printf("%c%03d (%d): ", it->first.prefix, it->first.code, it->second.size()); | ||
| 99 | |||
| 100 | for (std::set<std::string>::iterator jt = it->second.begin(); jt != it->second.end(); jt++) | ||
| 101 | { | ||
| 102 | std::cout << *jt << ", "; | ||
| 103 | } | ||
| 104 | |||
| 105 | std::cout << std::endl; | ||
| 106 | } | ||
| 107 | |||
| 108 | exit(0); | ||
| 109 | } | ||
| 110 | |||
| 111 | std::string malaprop::alternate(std::string word) | ||
| 112 | { | ||
| 113 | soundex ex = soundify(word); | ||
| 114 | std::set<std::string>& opts = dict[ex]; | ||
| 115 | int opt = rand() % opts.size(); | ||
| 116 | for (std::set<std::string>::iterator it = opts.begin(); it != opts.end(); it++) | ||
| 117 | { | ||
| 118 | if (opt == 0) | ||
| 119 | { | ||
| 120 | return *it; | ||
| 121 | } | ||
| 122 | |||
| 123 | opt--; | ||
| 124 | } | ||
| 125 | |||
| 126 | return word; | ||
| 127 | } | ||
| diff --git a/malaprop.h b/malaprop.h new file mode 100644 index 0000000..91a18eb --- /dev/null +++ b/malaprop.h | |||
| @@ -0,0 +1,31 @@ | |||
| 1 | #ifndef MALAPROP_H_8F382336 | ||
| 2 | #define MALAPROP_H_8F382336 | ||
| 3 | |||
| 4 | #include <string> | ||
| 5 | #include <map> | ||
| 6 | #include <set> | ||
| 7 | |||
// Dictionary of words grouped by a sound-alike key, used to swap a word for
// another one stored under the same key.
class malaprop
{
public:
  // Adds a word to the dictionary under its sound key.
  void addWord(std::string word);

  // Prints the dictionary contents (debugging aid).
  void stats();

  // Returns a word stored under the same sound key as `word`.
  std::string alternate(std::string word);

private:
  // Sound key: a prefix character plus a numeric code.
  struct soundex {
    char prefix;
    int code;

    // Lexicographic order on (prefix, code). std::map requires a strict weak
    // ordering; the previous `(prefix < o.prefix) || (code < o.code)` could
    // report both a < b and b < a (e.g. {'a', 2} and {'b', 1}), which
    // corrupts lookups and insertions.
    bool operator<(const soundex& other) const
    {
      if (prefix != other.prefix)
      {
        return prefix < other.prefix;
      }

      return code < other.code;
    }
  };

  // Words that have been added, grouped by their sound key.
  std::map<soundex, std::set<std::string> > dict;

  // Computes the sound key of a word.
  soundex soundify(std::string l);
};
| 30 | |||
| 31 | #endif /* end of include guard: MALAPROP_H_8F382336 */ | ||
