about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--Makefile.am4
-rw-r--r--ebooks.cpp2
-rw-r--r--freevars.cpp22
-rw-r--r--freevars.h8
-rw-r--r--gen.cpp40
-rw-r--r--kgramstats.cpp161
-rw-r--r--kgramstats.h15
-rw-r--r--malaprop.cpp127
-rw-r--r--malaprop.h31
9 files changed, 293 insertions, 117 deletions
diff --git a/Makefile.am b/Makefile.am index 299dc10..5f6199b 100644 --- a/Makefile.am +++ b/Makefile.am
@@ -2,7 +2,7 @@ AUTOMAKE_OPTIONS = subdir-objects
2ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} 2ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
3 3
4bin_PROGRAMS = rawr-ebooks rawr-gen 4bin_PROGRAMS = rawr-ebooks rawr-gen
5rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp 5rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp malaprop.cpp
6rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp 6rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp malaprop.cpp
7rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) 7rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS)
8rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file 8rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file
diff --git a/ebooks.cpp b/ebooks.cpp index 8e46ee9..27065d9 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -12,8 +12,6 @@
12#include <yaml-cpp/yaml.h> 12#include <yaml-cpp/yaml.h>
13#include "freevars.h" 13#include "freevars.h"
14 14
15using namespace::std;
16
17int main(int argc, char** args) 15int main(int argc, char** args)
18{ 16{
19 srand(time(NULL)); 17 srand(time(NULL));
diff --git a/freevars.cpp b/freevars.cpp index 6472fef..8c3eda4 100644 --- a/freevars.cpp +++ b/freevars.cpp
@@ -4,17 +4,17 @@
4 4
5freevars::freevars() 5freevars::freevars()
6{ 6{
7 vars = new map<string, vector<string>* >(); 7 vars = new std::map<std::string, std::vector<std::string>* >();
8} 8}
9 9
10void freevars::addVar(string name, string filename) 10void freevars::addVar(std::string name, std::string filename)
11{ 11{
12 vector<string>* eltlist = new vector<string>(); 12 std::vector<std::string>* eltlist = new std::vector<std::string>();
13 13
14 ifstream infile(filename.c_str()); 14 std::ifstream infile(filename.c_str());
15 if (infile) 15 if (infile)
16 { 16 {
17 string line; 17 std::string line;
18 18
19 while (getline(infile, line)) 19 while (getline(infile, line))
20 { 20 {
@@ -27,18 +27,18 @@ void freevars::addVar(string name, string filename)
27 (*vars)[name] = eltlist; 27 (*vars)[name] = eltlist;
28} 28}
29 29
30string freevars::parse(string in) 30std::string freevars::parse(std::string in)
31{ 31{
32 string res(in); 32 std::string res(in);
33 33
34 for (map<string, vector<string>* >::iterator it = vars->begin(); it != vars->end(); it++) 34 for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++)
35 { 35 {
36 string tofind = "$" + it->first + "$"; 36 std::string tofind = "$" + it->first + "$";
37 size_t fpos = res.find(tofind); 37 size_t fpos = res.find(tofind);
38 if (fpos != string::npos) 38 if (fpos != std::string::npos)
39 { 39 {
40 int r = rand() % it->second->size(); 40 int r = rand() % it->second->size();
41 res.replace(fpos, tofind.length(), (*it->second)[r], 0, string::npos); 41 res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos);
42 } 42 }
43 } 43 }
44 44
diff --git a/freevars.h b/freevars.h index 923f211..c92b9f5 100644 --- a/freevars.h +++ b/freevars.h
@@ -2,8 +2,6 @@
2#include <string> 2#include <string>
3#include <vector> 3#include <vector>
4 4
5using namespace::std;
6
7#ifndef FREEVARS_H 5#ifndef FREEVARS_H
8#define FREEVARS_H 6#define FREEVARS_H
9 7
@@ -11,11 +9,11 @@ class freevars
11{ 9{
12public: 10public:
13 freevars(); 11 freevars();
14 void addVar(string name, string filename); 12 void addVar(std::string name, std::string filename);
15 string parse(string in); 13 std::string parse(std::string in);
16 14
17private: 15private:
18 map<string, vector<string>* >* vars; 16 std::map<std::string, std::vector<std::string>* >* vars;
19}; 17};
20 18
21#endif \ No newline at end of file 19#endif \ No newline at end of file
diff --git a/gen.cpp b/gen.cpp index 31ba4dc..3284ffa 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -9,65 +9,63 @@
9#include <iostream> 9#include <iostream>
10#include "freevars.h" 10#include "freevars.h"
11 11
12using namespace::std;
13
14int main(int argc, char** args) 12int main(int argc, char** args)
15{ 13{
16 srand(time(NULL)); 14 srand(time(NULL));
17 15
18 if (argc == 1) 16 if (argc == 1)
19 { 17 {
20 cout << "rawr-gen, version 1.0" << endl; 18 std::cout << "rawr-gen, version 1.0" << std::endl;
21 cout << "Usage: rawr-gen corpus-file" << endl; 19 std::cout << "Usage: rawr-gen corpus-file" << std::endl;
22 cout << " where 'corpus-file' is the path to your input" << endl; 20 std::cout << " where 'corpus-file' is the path to your input" << std::endl;
23 21
24 return 0; 22 return 0;
25 } 23 }
26 24
27 ifstream infile(args[1]); 25 std::ifstream infile(args[1]);
28 if (!infile) 26 if (!infile)
29 { 27 {
30 cout << "rawr-gen, version 1.0" << endl; 28 std::cout << "rawr-gen, version 1.0" << std::endl;
31 cout << "Usage: rawr-gen corpus-file" << endl; 29 std::cout << "Usage: rawr-gen corpus-file" << std::endl;
32 cout << " where 'corpus-file' is the path to your input" << endl; 30 std::cout << " where 'corpus-file' is the path to your input" << std::endl;
33 cout << endl; 31 std::cout << std::endl;
34 cout << "The file you specified does not exist." << endl; 32 std::cout << "The file you specified does not exist." << std::endl;
35 33
36 return 0; 34 return 0;
37 } 35 }
38 36
39 string corpus; 37 std::string corpus;
40 string line; 38 std::string line;
41 while (getline(infile, line)) 39 while (getline(infile, line))
42 { 40 {
43 corpus += " " + line; 41 corpus += " " + line;
44 } 42 }
45 43
46 cout << "Preprocessing corpus..." << endl; 44 std::cout << "Preprocessing corpus..." << std::endl;
47 kgramstats* stats = new kgramstats(corpus, 3); 45 kgramstats* stats = new kgramstats(corpus, 3);
48 46
49 cout << "Preprocessing freevars..." << endl; 47 std::cout << "Preprocessing freevars..." << std::endl;
50 freevars* vars = new freevars(); 48 freevars* vars = new freevars();
51 vars->addVar("name", "names.txt"); 49 vars->addVar("name", "names.txt");
52 vars->addVar("noun", "nouns.txt"); 50 vars->addVar("noun", "nouns.txt");
53 51
54 cout << "Generating..." << endl; 52 std::cout << "Generating..." << std::endl;
55 for (;;) 53 for (;;)
56 { 54 {
57 vector<string> doc = stats->randomSentence(rand() % 35 + 15); 55 std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
58 string hi; 56 std::string hi;
59 for (vector<string>::iterator it = doc.begin(); it != doc.end(); ++it) 57 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
60 { 58 {
61 hi += vars->parse(*it) + " "; 59 hi += vars->parse(*it) + " ";
62 } 60 }
63 61
64 size_t lastperiod = hi.find_last_of("."); 62 size_t lastperiod = hi.find_last_of(".");
65 if ((lastperiod != string::npos) && (rand() % 3 > 0)) 63 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
66 { 64 {
67 hi = hi.substr(0, lastperiod+1); 65 hi = hi.substr(0, lastperiod+1);
68 } 66 }
69 67
70 cout << hi << endl; 68 std::cout << hi << std::endl;
71 69
72 getc(stdin); 70 getc(stdin);
73 } 71 }
diff --git a/kgramstats.cpp b/kgramstats.cpp index b4e68eb..17598de 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -3,31 +3,35 @@
3#include <iostream> 3#include <iostream>
4#include <cstdlib> 4#include <cstdlib>
5#include <algorithm> 5#include <algorithm>
6#include "malaprop.h"
7
8std::string canonize(std::string f);
6 9
7// runs in O(t^2) time where t is the number of tokens in the input corpus 10// runs in O(t^2) time where t is the number of tokens in the input corpus
8// We consider maxK to be fairly constant 11// We consider maxK to be fairly constant
9kgramstats::kgramstats(string corpus, int maxK) 12kgramstats::kgramstats(std::string corpus, int maxK)
10{ 13{
11 this->maxK = maxK; 14 this->maxK = maxK;
12 15
13 vector<string> tokens; 16 std::vector<std::string> tokens;
14 int start = 0; 17 size_t start = 0;
15 int end = 0; 18 int end = 0;
16 19
17 while (end != string::npos) 20 while (end != std::string::npos)
18 { 21 {
19 end = corpus.find(" ", start); 22 end = corpus.find(" ", start);
20 23
21 string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start); 24 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
22 if (token.compare("")) 25 if (token.compare(""))
23 { 26 {
27 mstats.addWord(token);
24 tokens.push_back(token); 28 tokens.push_back(token);
25 } 29 }
26 30
27 start = ((end > (string::npos - 1) ) ? string::npos : end + 1); 31 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
28 } 32 }
29 33
30 map<kgram, map<string, token_data*>* > tstats; 34 std::map<kgram, std::map<std::string, token_data*>* > tstats;
31 bool newSentence = true; 35 bool newSentence = true;
32 bool newClause = false; 36 bool newClause = false;
33 for (int k=0; k<=maxK; k++) 37 for (int k=0; k<=maxK; k++)
@@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK)
35 for (int i=0; i<(tokens.size() - k); i++) 39 for (int i=0; i<(tokens.size() - k); i++)
36 { 40 {
37 kgram seq(tokens.begin()+i, tokens.begin()+i+k); 41 kgram seq(tokens.begin()+i, tokens.begin()+i+k);
38 transform(seq.begin(), seq.end(), seq.begin(), canonize); 42 std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
39 string f = tokens[i+k]; 43 std::string f = tokens[i+k];
40 string canonical = canonize(f); 44 std::string canonical = canonize(f);
41 45
42 if (tstats[seq] == NULL) 46 if (tstats[seq] == NULL)
43 { 47 {
44 tstats[seq] = new map<string, token_data*>(); 48 tstats[seq] = new std::map<std::string, token_data*>();
45 } 49 }
46 50
47 if ((*tstats[seq])[canonical] == NULL) 51 if ((*tstats[seq])[canonical] == NULL)
@@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK)
50 } 54 }
51 55
52 token_data* td = tstats[seq]->at(canonical); 56 token_data* td = tstats[seq]->at(canonical);
53 td->token = new string(canonical); 57 td->token = new std::string(canonical);
54 td->all++; 58 td->all++;
55 59
56 if (newSentence) 60 if (newSentence)
@@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK)
58 kgram newKgram(1, "."); 62 kgram newKgram(1, ".");
59 if (tstats[newKgram] == NULL) 63 if (tstats[newKgram] == NULL)
60 { 64 {
61 tstats[newKgram] = new map<string, token_data*>(); 65 tstats[newKgram] = new std::map<std::string, token_data*>();
62 } 66 }
63 67
64 (*tstats[newKgram])[canonical] = td; 68 (*tstats[newKgram])[canonical] = td;
@@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK)
71 kgram commaKgram(1, ","); 75 kgram commaKgram(1, ",");
72 if (tstats[commaKgram] == NULL) 76 if (tstats[commaKgram] == NULL)
73 { 77 {
74 tstats[commaKgram] = new map<string, token_data*>(); 78 tstats[commaKgram] = new std::map<std::string, token_data*>();
75 } 79 }
76 80
77 (*tstats[commaKgram])[canonical] = td; 81 (*tstats[commaKgram])[canonical] = td;
@@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK)
164 } 168 }
165 } 169 }
166 170
167 stats = new map<kgram, map<int, token_data*>* >(); 171 stats = new std::map<kgram, std::map<int, token_data*>* >();
168 for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) 172 for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
169 { 173 {
170 kgram klist = it->first; 174 kgram klist = it->first;
171 map<string, token_data*>* probtable = it->second; 175 std::map<std::string, token_data*>* probtable = it->second;
172 map<int, token_data*>* distribution = new map<int, token_data*>(); 176 std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
173 int max = 0; 177 int max = 0;
174 178
175 for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) 179 for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
176 { 180 {
177 max += kt->second->all; 181 max += kt->second->all;
178 182
@@ -187,17 +191,17 @@ void printKgram(kgram k)
187{ 191{
188 for (kgram::iterator it = k.begin(); it != k.end(); it++) 192 for (kgram::iterator it = k.begin(); it != k.end(); it++)
189 { 193 {
190 cout << *it << " "; 194 std::cout << *it << " ";
191 } 195 }
192} 196}
193 197
194// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 198// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
195vector<string> kgramstats::randomSentence(int n) 199std::vector<std::string> kgramstats::randomSentence(int n)
196{ 200{
197 vector<string> result; 201 std::vector<std::string> result;
198 kgram newKgram(1, "."); 202 kgram newKgram(1, ".");
199 kgram commaKgram(1, ","); 203 kgram commaKgram(1, ",");
200 list<string> cur = newKgram; 204 std::list<std::string> cur = newKgram;
201 int cuts = 0; 205 int cuts = 0;
202 206
203 for (int i=0; i<n; i++) 207 for (int i=0; i<n; i++)
@@ -221,12 +225,12 @@ vector<string> kgramstats::randomSentence(int n)
221 cuts++; 225 cuts++;
222 } 226 }
223 227
224 map<int, token_data*> distribution = *(*stats)[cur]; 228 std::map<int, token_data*> distribution = *(*stats)[cur];
225 int max = distribution.rbegin()->first; 229 int max = distribution.rbegin()->first;
226 int r = rand() % max; 230 int r = rand() % max;
227 token_data* next = distribution.upper_bound(r)->second; 231 token_data* next = distribution.upper_bound(r)->second;
228 232
229 string nextToken(*(next->token)); 233 std::string nextToken(*(next->token));
230 int casing = rand() % next->all; 234 int casing = rand() % next->all;
231 int period = rand() % next->all; 235 int period = rand() % next->all;
232 int startparen = rand() % next->all; 236 int startparen = rand() % next->all;
@@ -236,7 +240,7 @@ vector<string> kgramstats::randomSentence(int n)
236 int comma = rand() % next->all; 240 int comma = rand() % next->all;
237 if (casing < next->uppercase) 241 if (casing < next->uppercase)
238 { 242 {
239 transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 243 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
240 } else if ((casing - next->uppercase) < next->titlecase) 244 } else if ((casing - next->uppercase) < next->titlecase)
241 { 245 {
242 nextToken[0] = toupper(nextToken[0]); 246 nextToken[0] = toupper(nextToken[0]);
@@ -246,49 +250,55 @@ vector<string> kgramstats::randomSentence(int n)
246 { 250 {
247 nextToken[0] = toupper(nextToken[0]); 251 nextToken[0] = toupper(nextToken[0]);
248 } 252 }
249 /* 253
250 if (startquote < next->startquote) 254 bool mess = (rand() % 100) == 0;
251 { 255 if (mess)
252 nextToken = "\"" + nextToken;
253 } else if (startparen < next->startparen)
254 { 256 {
255 nextToken = "(" + nextToken; 257 nextToken = mstats.alternate(nextToken);
256 } 258
257 259 if (startquote < next->startquote)
258 if (period < next->period)
259 {
260 if (endquote < next->endquote)
261 { 260 {
262 nextToken += "\""; 261 nextToken = "\"" + nextToken;
263 } else if (endparen < next->endparen) 262 } else if (startparen < next->startparen)
264 { 263 {
265 nextToken += ")"; 264 nextToken = "(" + nextToken;
266 } 265 }
266
267 if (period < next->period)
268 {
269 if (endquote < next->endquote)
270 {
271 nextToken += "\"";
272 } else if (endparen < next->endparen)
273 {
274 nextToken += ")";
275 }
267 276
268 int type = rand() % 6; 277 int type = rand() % 6;
269 278
270 if (type < 3) 279 if (type < 3)
271 { 280 {
272 nextToken += "."; 281 nextToken += ".";
273 } else if (type < 5) 282 } else if (type < 5)
274 { 283 {
275 nextToken += "!"; 284 nextToken += "!";
276 } else { 285 } else {
277 nextToken += "?"; 286 nextToken += "?";
278 } 287 }
279 } else if (comma < next->comma) 288 } else if (comma < next->comma)
280 {
281 if (endquote < next->endquote)
282 {
283 nextToken += "\"";
284 } else if (endparen < next->endparen)
285 { 289 {
286 nextToken += ")"; 290 if (endquote < next->endquote)
287 } 291 {
292 nextToken += "\"";
293 } else if (endparen < next->endparen)
294 {
295 nextToken += ")";
296 }
288 297
289 nextToken += ","; 298 nextToken += ",";
299 }
290 } 300 }
291*/ 301
292 if (cur.size() == maxK) 302 if (cur.size() == maxK)
293 { 303 {
294 cur.pop_front(); 304 cur.pop_front();
@@ -297,10 +307,17 @@ vector<string> kgramstats::randomSentence(int n)
297 /* DEBUG */ 307 /* DEBUG */
298 for (kgram::iterator it = cur.begin(); it != cur.end(); it++) 308 for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
299 { 309 {
300 cout << *it << " "; 310 std::cout << *it << " ";
301 } 311 }
302 312
303 cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; 313 std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
314
315 if (mess)
316 {
317 std::cout << " mala " << *(next->token);
318 }
319
320 std::cout << std::endl;
304 321
305 if ((cur == newKgram) || (cur == commaKgram)) 322 if ((cur == newKgram) || (cur == commaKgram))
306 { 323 {
@@ -314,7 +331,15 @@ vector<string> kgramstats::randomSentence(int n)
314 { 331 {
315 cur = commaKgram; 332 cur = commaKgram;
316 } else { 333 } else {
317 cur.push_back(*(next->token)); 334 //if (mess && (rand() % 2 == 0))
335 if (false)
336 {
337 // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
338 cur.clear();
339 cur.push_back(nextToken);
340 } else {
341 cur.push_back(*(next->token));
342 }
318 } 343 }
319 344
320 result.push_back(nextToken); 345 result.push_back(nextToken);
@@ -330,11 +355,11 @@ bool removeIf(char c)
330 355
331std::string canonize(std::string f) 356std::string canonize(std::string f)
332{ 357{
333 string canonical(f); 358 std::string canonical(f);
334 transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); 359 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
335 360
336 string result; 361 std::string result;
337 remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 362 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
338 363
339 return canonical; 364 return canonical;
340} 365}
diff --git a/kgramstats.h b/kgramstats.h index 059eb05..b01dece 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -2,19 +2,18 @@
2#include <map> 2#include <map>
3#include <list> 3#include <list>
4#include <vector> 4#include <vector>
5 5#include "malaprop.h"
6using namespace::std;
7 6
8#ifndef KGRAMSTATS_H 7#ifndef KGRAMSTATS_H
9#define KGRAMSTATS_H 8#define KGRAMSTATS_H
10 9
11typedef list<string> kgram; 10typedef std::list<std::string> kgram;
12 11
13class kgramstats 12class kgramstats
14{ 13{
15public: 14public:
16 kgramstats(string corpus, int maxK); 15 kgramstats(std::string corpus, int maxK);
17 vector<string> randomSentence(int n); 16 std::vector<std::string> randomSentence(int n);
18 17
19private: 18private:
20 typedef struct 19 typedef struct
@@ -28,13 +27,13 @@ private:
28 int startparen; 27 int startparen;
29 int endparen; 28 int endparen;
30 int comma; 29 int comma;
31 string* token; 30 std::string* token;
32 } token_data; 31 } token_data;
33 int maxK; 32 int maxK;
34 map<kgram, map<int, token_data*>* >* stats; 33 std::map<kgram, std::map<int, token_data*>* >* stats;
34 malaprop mstats;
35}; 35};
36 36
37void printKgram(kgram k); 37void printKgram(kgram k);
38std::string canonize(std::string f);
39 38
40#endif \ No newline at end of file 39#endif \ No newline at end of file
diff --git a/malaprop.cpp b/malaprop.cpp new file mode 100644 index 0000000..bfea579 --- /dev/null +++ b/malaprop.cpp
@@ -0,0 +1,127 @@
1#include "malaprop.h"
2#include <cstdlib>
3#include <iostream>
4
// Predicate for std::remove_copy_if: returns true for every character that is
// not an alphabetic letter, i.e. for every character that should be dropped.
bool removeIfM(char c)
{
  // The <cctype> classifiers require a value representable as unsigned char
  // (or EOF). A plain char holding a non-ASCII byte may be negative, which
  // would be undefined behaviour, so convert before classifying.
  return !isalpha(static_cast<unsigned char>(c));
}
9
// Maps a lowercase consonant to the digit ('1'..'6') of the sound group it
// belongs to. Any character that is not listed in a group is returned
// unchanged.
char soundID(char l)
{
  struct sound_group
  {
    const char* letters;
    char digit;
  };

  static const sound_group groups[] = {
    { "bfpv",     '1' },
    { "cgjkqsxz", '2' },
    { "dt",       '3' },
    { "l",        '4' },
    { "mn",       '5' },
    { "r",        '6' }
  };

  const int groupCount = sizeof(groups) / sizeof(groups[0]);
  for (int g = 0; g < groupCount; g++)
  {
    // Walk the group's letters up to (not including) the terminating NUL.
    for (const char* member = groups[g].letters; *member != '\0'; ++member)
    {
      if (*member == l)
      {
        return groups[g].digit;
      }
    }
  }

  // Not a grouped consonant: hand the character back as-is.
  return l;
}
47
// Returns a copy of `f` lowercased and reduced to its alphabetic characters;
// digits, punctuation, whitespace and any other byte are dropped.
std::string canonizetwo(std::string f)
{
  std::string result;
  result.reserve(f.size());

  for (std::string::size_type i = 0; i < f.size(); i++)
  {
    // Convert through unsigned char first: handing a negative plain char
    // (any non-ASCII byte) to tolower/isalpha is undefined behaviour.
    const int lowered = tolower(static_cast<unsigned char>(f[i]));
    if (isalpha(lowered))
    {
      result += static_cast<char>(lowered);
    }
  }

  return result;
}
58
59malaprop::soundex malaprop::soundify(std::string f)
60{
61 std::string result(canonizetwo(f));
62
63 soundex ex;
64 ex.prefix = result[0];
65
66 std::string output;
67
68 for (int i = 1; i<result.length(); i++)
69 {
70 int c = soundID(result[i]);
71 if (
72 (isdigit(c)) // Not a vowel
73 && (c != soundID(result[i-1])) // Not the same as the previous character
74 && ((i < 2) || ((result[i-1] = 'h' || result[i-1] == 'w') && (c != soundID(result[i-2])))) // Not same as before h/w
75 )
76 {
77 output += c;
78 }
79 }
80
81 output.resize(3, '0');
82 ex.code = atoi(output.c_str());
83
84 return ex;
85}
86
87void malaprop::addWord(std::string word)
88{
89 soundex ex = soundify(word);
90
91 dict[ex].insert(canonizetwo(word));
92}
93
94void malaprop::stats()
95{
96 for (std::map<soundex, std::set<std::string> >::iterator it = dict.begin(); it != dict.end(); it++)
97 {
98 printf("%c%03d (%d): ", it->first.prefix, it->first.code, it->second.size());
99
100 for (std::set<std::string>::iterator jt = it->second.begin(); jt != it->second.end(); jt++)
101 {
102 std::cout << *jt << ", ";
103 }
104
105 std::cout << std::endl;
106 }
107
108 exit(0);
109}
110
111std::string malaprop::alternate(std::string word)
112{
113 soundex ex = soundify(word);
114 std::set<std::string>& opts = dict[ex];
115 int opt = rand() % opts.size();
116 for (std::set<std::string>::iterator it = opts.begin(); it != opts.end(); it++)
117 {
118 if (opt == 0)
119 {
120 return *it;
121 }
122
123 opt--;
124 }
125
126 return word;
127}
diff --git a/malaprop.h b/malaprop.h new file mode 100644 index 0000000..91a18eb --- /dev/null +++ b/malaprop.h
@@ -0,0 +1,31 @@
1#ifndef MALAPROP_H_8F382336
2#define MALAPROP_H_8F382336
3
4#include <string>
5#include <map>
6#include <set>
7
// Dictionary of words grouped by a sound key (a leading letter plus a numeric
// code), used to replace a word with another one stored under the same key.
class malaprop
{
public:
  // Stores `word` under its sound key.
  void addWord(std::string word);

  // Prints the contents of the dictionary and exits the process.
  void stats();

  // Returns a random word stored under the same sound key as `word`.
  std::string alternate(std::string word);

private:
  // Sound key: first letter of the word plus a numeric code for the rest.
  struct soundex {
    char prefix;
    int code;

    // Lexicographic order on (prefix, code). std::map requires a strict weak
    // ordering; comparing the two fields independently with || does not
    // provide one (two keys could each compare less than the other).
    bool operator<(const soundex& other) const
    {
      if (prefix != other.prefix)
      {
        return prefix < other.prefix;
      }

      return code < other.code;
    }
  };

  // Words stored so far, grouped by sound key.
  std::map<soundex, std::set<std::string> > dict;

  // Computes the sound key of `l`.
  soundex soundify(std::string l);
};
30
31#endif /* end of include guard: MALAPROP_H_8F382336 */