about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorKelly Rauchenberger <fefferburbia@gmail.com>2016-01-04 23:16:17 -0500
committerKelly Rauchenberger <fefferburbia@gmail.com>2016-01-04 23:16:17 -0500
commit9e89002477d1358de9be9cabdc1edba26bd32836 (patch)
tree9afb52740fe4f618105d014a816df26b36ed83f6
parent0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff)
downloadrawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz
rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2
rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip
Rewrote quite a bit of kgramstats
The algorithm still treats most tokens literally, but now groups together tokens that terminate a clause somehow (so, contain .?!,), without distinguishing between the different terminating characters. For each word that can terminate a sentence, the algorithm creates a histogram of the terminating characters and number of occurrences of those characters for that word (number of occurrences is to allow things like um???? and um,,,,, to still be folded down into um.). Then, when the terminating version of that token is invoked, a random terminating string is added to that token based on the histogram for that word (again, to allow things like the desu-ly use of multiple commas to end clauses).

The algorithm now also has a slightly advanced kgram structure; a special "sentence wildcard" kgram value is set aside from normal strings of tokens that can match any terminating token. This kgram value is never printed (it is only ever present in the query kgrams and cannot actually be present in the histograms (it is of a different datatype)) and is used at the beginning of sentence generation to make sure that the first couple of words generated actually form the beginning of a sentence instead of picking up somewhere in the middle of a sentence. It is also used to reset sentence generation in the rare occasion that the end of the corpus is reached.
-rw-r--r--ebooks.cpp316
-rw-r--r--gen.cpp98
-rw-r--r--kgramstats.cpp442
-rw-r--r--kgramstats.h84
-rw-r--r--malaprop.cpp5
5 files changed, 444 insertions, 501 deletions
diff --git a/ebooks.cpp b/ebooks.cpp index 27591f4..a24bd8d 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -14,174 +14,168 @@
14 14
15int main(int argc, char** args) 15int main(int argc, char** args)
16{ 16{
17 srand(time(NULL)); 17 srand(time(NULL));
18 18
19 YAML::Node config = YAML::LoadFile("config.yml"); 19 YAML::Node config = YAML::LoadFile("config.yml");
20 int delay = config["delay"].as<int>(); 20 int delay = config["delay"].as<int>();
21 21
22 std::ifstream infile(config["corpus"].as<std::string>().c_str()); 22 std::ifstream infile(config["corpus"].as<std::string>().c_str());
23 std::string corpus; 23 std::string corpus;
24 std::string line; 24 std::string line;
25 while (getline(infile, line)) 25 while (getline(infile, line))
26 { 26 {
27 corpus += line + "\n "; 27 corpus += line + "\n ";
28 } 28 }
29 29
30 std::cout << "Preprocessing corpus..." << std::endl; 30 std::cout << "Preprocessing corpus..." << std::endl;
31 kgramstats* stats = new kgramstats(corpus, 4); 31 kgramstats* stats = new kgramstats(corpus, 4);
32 32
33 std::cout << "Preprocessing freevars..." << std::endl; 33 std::cout << "Preprocessing freevars..." << std::endl;
34 freevars* vars = new freevars(); 34 freevars* vars = new freevars();
35 vars->addVar("name", "names.txt"); 35 vars->addVar("name", "names.txt");
36 vars->addVar("noun", "nouns.txt"); 36 vars->addVar("noun", "nouns.txt");
37 37
38 std::cout << "Generating..." << std::endl; 38 std::cout << "Generating..." << std::endl;
39 for (;;) 39 for (;;)
40 { 40 {
41 std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); 41 std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
42 std::string hi; 42 std::string hi;
43 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) 43 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
44 {
45 hi += vars->parse(*it) + " ";
46 }
47
48 size_t firstperiod = hi.find_first_of(".!?");
49 if (firstperiod != std::string::npos)
50 { 44 {
51 hi = hi.substr(firstperiod+2); 45 hi += vars->parse(*it) + " ";
52 } 46 }
53 47
54 hi.resize(140); 48 hi.resize(140);
55 49
56 size_t lastperiod = hi.find_last_of(".!?"); 50 size_t lastperiod = hi.find_last_of(".!?,");
57 if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) 51 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
58 { 52 {
59 hi = hi.substr(0, lastperiod+1); 53 hi = hi.substr(0, lastperiod+1);
60 } 54 }
61 55
62 twitCurl twitterObj; 56 twitCurl twitterObj;
63 std::string tmpStr, tmpStr2; 57 std::string tmpStr, tmpStr2;
64 std::string replyMsg; 58 std::string replyMsg;
65 char tmpBuf[1024]; 59 char tmpBuf[1024];
66 std::string username(config["username"].as<std::string>()); 60 std::string username(config["username"].as<std::string>());
67 std::string password(config["password"].as<std::string>()); 61 std::string password(config["password"].as<std::string>());
68 62
69 /* Set twitter username and password */ 63 /* Set twitter username and password */
70 twitterObj.setTwitterUsername(username); 64 twitterObj.setTwitterUsername(username);
71 twitterObj.setTwitterPassword(password); 65 twitterObj.setTwitterPassword(password);
72 66
73 /* OAuth flow begins */ 67 /* OAuth flow begins */
74 /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */ 68 /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */
75 twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() ); 69 twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() );
76 twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() ); 70 twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() );
77 71
78 /* Step 1: Check if we alredy have OAuth access token from a previous run */ 72 /* Step 1: Check if we alredy have OAuth access token from a previous run */
79 std::string myOAuthAccessTokenKey(""); 73 std::string myOAuthAccessTokenKey("");
80 std::string myOAuthAccessTokenSecret(""); 74 std::string myOAuthAccessTokenSecret("");
81 std::ifstream oAuthTokenKeyIn; 75 std::ifstream oAuthTokenKeyIn;
82 std::ifstream oAuthTokenSecretIn; 76 std::ifstream oAuthTokenSecretIn;
83 77
84 oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); 78 oAuthTokenKeyIn.open( "twitterClient_token_key.txt" );
85 oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); 79 oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" );
86 80
87 memset( tmpBuf, 0, 1024 ); 81 memset( tmpBuf, 0, 1024 );
88 oAuthTokenKeyIn >> tmpBuf; 82 oAuthTokenKeyIn >> tmpBuf;
89 myOAuthAccessTokenKey = tmpBuf; 83 myOAuthAccessTokenKey = tmpBuf;
90 84
91 memset( tmpBuf, 0, 1024 ); 85 memset( tmpBuf, 0, 1024 );
92 oAuthTokenSecretIn >> tmpBuf; 86 oAuthTokenSecretIn >> tmpBuf;
93 myOAuthAccessTokenSecret = tmpBuf; 87 myOAuthAccessTokenSecret = tmpBuf;
94 88
95 oAuthTokenKeyIn.close(); 89 oAuthTokenKeyIn.close();
96 oAuthTokenSecretIn.close(); 90 oAuthTokenSecretIn.close();
97 91
98 if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) 92 if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() )
99 { 93 {
100 /* If we already have these keys, then no need to go through auth again */ 94 /* If we already have these keys, then no need to go through auth again */
101 printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); 95 printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() );
102 96
103 twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); 97 twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey );
104 twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); 98 twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret );
105 } 99 }
106 else 100 else
107 { 101 {
108 /* Step 2: Get request token key and secret */ 102 /* Step 2: Get request token key and secret */
109 std::string authUrl; 103 std::string authUrl;
110 twitterObj.oAuthRequestToken( authUrl ); 104 twitterObj.oAuthRequestToken( authUrl );
111 105
112 /* Step 3: Get PIN */ 106 /* Step 3: Get PIN */
113 memset( tmpBuf, 0, 1024 ); 107 memset( tmpBuf, 0, 1024 );
114 printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); 108 printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " );
115 gets( tmpBuf ); 109 gets( tmpBuf );
116 tmpStr = tmpBuf; 110 tmpStr = tmpBuf;
117 if( std::string::npos != tmpStr.find( "1" ) ) 111 if( std::string::npos != tmpStr.find( "1" ) )
118 { 112 {
119 /* Ask user to visit twitter.com auth page and get PIN */ 113 /* Ask user to visit twitter.com auth page and get PIN */
120 memset( tmpBuf, 0, 1024 ); 114 memset( tmpBuf, 0, 1024 );
121 printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); 115 printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() );
122 printf( "\nEnter the PIN provided by twitter: " ); 116 printf( "\nEnter the PIN provided by twitter: " );
123 gets( tmpBuf ); 117 gets( tmpBuf );
124 tmpStr = tmpBuf; 118 tmpStr = tmpBuf;
125 twitterObj.getOAuth().setOAuthPin( tmpStr ); 119 twitterObj.getOAuth().setOAuthPin( tmpStr );
126 } 120 }
127 else 121 else
128 { 122 {
129 /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */ 123 /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */
130 twitterObj.oAuthHandlePIN( authUrl ); 124 twitterObj.oAuthHandlePIN( authUrl );
131 } 125 }
132 126
133 /* Step 4: Exchange request token with access token */ 127 /* Step 4: Exchange request token with access token */
134 twitterObj.oAuthAccessToken(); 128 twitterObj.oAuthAccessToken();
135 129
136 /* Step 5: Now, save this access token key and secret for future use without PIN */ 130 /* Step 5: Now, save this access token key and secret for future use without PIN */
137 twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); 131 twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey );
138 twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); 132 twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret );
139 133
140 /* Step 6: Save these keys in a file or wherever */ 134 /* Step 6: Save these keys in a file or wherever */
141 std::ofstream oAuthTokenKeyOut; 135 std::ofstream oAuthTokenKeyOut;
142 std::ofstream oAuthTokenSecretOut; 136 std::ofstream oAuthTokenSecretOut;
143 137
144 oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); 138 oAuthTokenKeyOut.open( "twitterClient_token_key.txt" );
145 oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); 139 oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" );
146 140
147 oAuthTokenKeyOut.clear(); 141 oAuthTokenKeyOut.clear();
148 oAuthTokenSecretOut.clear(); 142 oAuthTokenSecretOut.clear();
149 143
150 oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); 144 oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str();
151 oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); 145 oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str();
152 146
153 oAuthTokenKeyOut.close(); 147 oAuthTokenKeyOut.close();
154 oAuthTokenSecretOut.close(); 148 oAuthTokenSecretOut.close();
155 } 149 }
156 /* OAuth flow ends */ 150 /* OAuth flow ends */
157 151
158 /* Account credentials verification */ 152 /* Account credentials verification */
159 if( twitterObj.accountVerifyCredGet() ) 153 if( twitterObj.accountVerifyCredGet() )
160 { 154 {
161 twitterObj.getLastWebResponse( replyMsg ); 155 twitterObj.getLastWebResponse( replyMsg );
162 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); 156 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() );
163 } 157 }
164 else 158 else
165 { 159 {
166 twitterObj.getLastCurlError( replyMsg ); 160 twitterObj.getLastCurlError( replyMsg );
167 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); 161 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() );
168 } 162 }
169 163
170 /* Post a new status message */ 164 /* Post a new status message */
171 replyMsg = ""; 165 replyMsg = "";
172 if( twitterObj.statusUpdate( hi ) ) 166 if( twitterObj.statusUpdate( hi ) )
173 { 167 {
174 twitterObj.getLastWebResponse( replyMsg ); 168 twitterObj.getLastWebResponse( replyMsg );
175 printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); 169 printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() );
176 } 170 }
177 else 171 else
178 { 172 {
179 twitterObj.getLastCurlError( replyMsg ); 173 twitterObj.getLastCurlError( replyMsg );
180 printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); 174 printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() );
181 } 175 }
182 176
183 sleep(rand() % delay); 177 sleep(rand() % delay);
184 } 178 }
185 179
186 return 0; 180 return 0;
187} 181}
diff --git a/gen.cpp b/gen.cpp index 7e47d45..400c0a5 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -11,72 +11,66 @@
11 11
12int main(int argc, char** args) 12int main(int argc, char** args)
13{ 13{
14 srand(time(NULL)); 14 srand(time(NULL));
15 15
16 if (argc == 1) 16 if (argc == 1)
17 { 17 {
18 std::cout << "rawr-gen, version 1.0" << std::endl; 18 std::cout << "rawr-gen, version 1.0" << std::endl;
19 std::cout << "Usage: rawr-gen corpus-file" << std::endl; 19 std::cout << "Usage: rawr-gen corpus-file" << std::endl;
20 std::cout << " where 'corpus-file' is the path to your input" << std::endl; 20 std::cout << " where 'corpus-file' is the path to your input" << std::endl;
21 21
22 return 0; 22 return 0;
23 } 23 }
24 24
25 std::ifstream infile(args[1]); 25 std::ifstream infile(args[1]);
26 if (!infile) 26 if (!infile)
27 { 27 {
28 std::cout << "rawr-gen, version 1.0" << std::endl; 28 std::cout << "rawr-gen, version 1.0" << std::endl;
29 std::cout << "Usage: rawr-gen corpus-file" << std::endl; 29 std::cout << "Usage: rawr-gen corpus-file" << std::endl;
30 std::cout << " where 'corpus-file' is the path to your input" << std::endl; 30 std::cout << " where 'corpus-file' is the path to your input" << std::endl;
31 std::cout << std::endl; 31 std::cout << std::endl;
32 std::cout << "The file you specified does not exist." << std::endl; 32 std::cout << "The file you specified does not exist." << std::endl;
33 33
34 return 0; 34 return 0;
35 } 35 }
36 36
37 std::string corpus; 37 std::string corpus;
38 std::string line; 38 std::string line;
39 while (getline(infile, line)) 39 while (getline(infile, line))
40 { 40 {
41 corpus += line + "\n "; 41 corpus += line + "\n ";
42 } 42 }
43 43
44 std::cout << "Preprocessing corpus..." << std::endl; 44 std::cout << "Preprocessing corpus..." << std::endl;
45 kgramstats* stats = new kgramstats(corpus, 4); 45 kgramstats* stats = new kgramstats(corpus, 4);
46 46
47 std::cout << "Preprocessing freevars..." << std::endl; 47 std::cout << "Preprocessing freevars..." << std::endl;
48 freevars* vars = new freevars(); 48 freevars* vars = new freevars();
49 vars->addVar("name", "names.txt"); 49 vars->addVar("name", "names.txt");
50 vars->addVar("noun", "nouns.txt"); 50 vars->addVar("noun", "nouns.txt");
51 51
52 std::cout << "Generating..." << std::endl; 52 std::cout << "Generating..." << std::endl;
53 for (;;) 53 for (;;)
54 { 54 {
55 std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45); 55 std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
56 std::string hi; 56 std::string hi;
57 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) 57 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
58 {
59 hi += vars->parse(*it) + " ";
60 }
61
62 size_t firstperiod = hi.find_first_of(".!?");
63 if (firstperiod != std::string::npos)
64 { 58 {
65 hi = hi.substr(firstperiod+2); 59 hi += vars->parse(*it) + " ";
66 } 60 }
67 61
68 hi.resize(140); 62 hi.resize(140);
69 63
70 size_t lastperiod = hi.find_last_of(".!?"); 64 size_t lastperiod = hi.find_last_of(".!?,");
71 if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) 65 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
72 { 66 {
73 hi = hi.substr(0, lastperiod+1); 67 hi = hi.substr(0, lastperiod+1);
74 } 68 }
75 69
76 std::cout << hi << std::endl; 70 std::cout << hi << std::endl;
77 71
78 getc(stdin); 72 getc(stdin);
79 } 73 }
80 74
81 return 0; 75 return 0;
82} 76}
diff --git a/kgramstats.cpp b/kgramstats.cpp index b0ec68a..c88d83c 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -5,237 +5,176 @@
5#include <algorithm> 5#include <algorithm>
6#include "malaprop.h" 6#include "malaprop.h"
7 7
8query wildcardQuery(querytype_sentence);
9
8std::string canonize(std::string f); 10std::string canonize(std::string f);
9 11
10// runs in O(t^2) time where t is the number of tokens in the input corpus 12// runs in O(t^2) time where t is the number of tokens in the input corpus
11// We consider maxK to be fairly constant 13// We consider maxK to be fairly constant
12kgramstats::kgramstats(std::string corpus, int maxK) 14kgramstats::kgramstats(std::string corpus, int maxK)
13{ 15{
14 this->maxK = maxK; 16 this->maxK = maxK;
15 17
16 std::vector<std::string> tokens; 18 std::vector<std::string> tokens;
17 size_t start = 0; 19 size_t start = 0;
18 int end = 0; 20 int end = 0;
19 21
20 while (end != std::string::npos) 22 while (end != std::string::npos)
21 { 23 {
22 end = corpus.find(" ", start); 24 end = corpus.find(" ", start);
23 25
24 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 26 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
25 if (token[token.length()-1] == '\n') 27 if (token[token.length()-1] == '\n')
26 { 28 {
27 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?')) 29 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
28 { 30 {
29 token.insert(token.length()-1, "."); 31 token.insert(token.length()-1, ".");
30 } 32 }
31 33
32 token.resize(token.length()-1); 34 token.resize(token.length()-1);
33 } 35 }
34 36
35 if (token.compare("") && token.compare(".")) 37 if (token.compare("") && token.compare("."))
36 { 38 {
37 mstats.addWord(token); 39 mstats.addWord(token);
38 tokens.push_back(token); 40 tokens.push_back(token);
39 } 41 }
40 42
41 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 43 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
42 } 44 }
43 45
44 std::map<kgram, std::map<std::string, token_data*>* > tstats; 46 std::map<kgram, std::map<token, token_data> > tstats;
45 bool newSentence = true; 47 std::map<token, std::map<termstats, int> > tendings;
46 bool newClause = false; 48 for (int k=1; k<maxK; k++)
47 for (int k=0; k<maxK; k++) 49 {
48 { 50 for (int i=0; i<(tokens.size() - k); i++)
49 for (int i=0; i<(tokens.size() - k); i++) 51 {
50 { 52 std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
51 kgram seq(tokens.begin()+i, tokens.begin()+i+k); 53 kgram prefix;
52 std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
53 std::string f = tokens[i+k];
54
55
56
57 std::string canonical = canonize(f);
58
59 if (tstats[seq] == NULL)
60 {
61 tstats[seq] = new std::map<std::string, token_data*>();
62 }
63
64 if ((*tstats[seq])[canonical] == NULL)
65 {
66 (*tstats[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data));
67 }
68
69 token_data* td = tstats[seq]->at(canonical);
70 td->token = new std::string(canonical);
71 td->all++;
72 54
73 /*if (newSentence) 55 for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
74 { 56 {
75 kgram newKgram(1, "."); 57 token word(canonize(*it));
76 if (tstats[newKgram] == NULL) 58
59 if (it->find_first_of(".?!,") != std::string::npos)
77 { 60 {
78 tstats[newKgram] = new std::map<std::string, token_data*>(); 61 word.terminating = true;
79 } 62 }
80 63
81 (*tstats[newKgram])[canonical] = td; 64 prefix.push_back(word);
82
83 newSentence = false;
84 } 65 }
85 66
86 if (newClause) 67 std::string f = tokens[i+k];
68 std::string canonical = canonize(f);
69
70 token word(canonical);
71 if (f.find_first_of(".?!,") != std::string::npos)
87 { 72 {
88 kgram commaKgram(1, ","); 73 word.terminating = true;
89 if (tstats[commaKgram] == NULL)
90 {
91 tstats[commaKgram] = new std::map<std::string, token_data*>();
92 }
93 74
94 (*tstats[commaKgram])[canonical] = td; 75 char terminator = f[f.find_last_of(".?!,")];
76 int occurrences = std::count(f.begin(), f.end(), terminator);
95 77
96 newClause = false; 78 tendings[word][termstats(terminator, occurrences)]++;
97 }
98
99 if ((f.length() > 0) && (f[f.length()-1] == '\n'))
100 {
101 td->period++;
102 newSentence = true;
103 f.resize(f.length()-1);
104 } 79 }
105 80
106 if (f.length() > 0) 81 token_data& td = tstats[prefix][word];
82 td.word = word;
83 td.all++;
84
85 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
107 { 86 {
108 if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) 87 td.uppercase++;
109 { 88 } else if (isupper(f[0]))
110 if (!newSentence) 89 {
111 { 90 td.titlecase++;
112 td->period++;
113 newSentence = true;
114 }
115
116 f.resize(f.length()-1);
117 } else if (f[f.length()-1] == ',')
118 {
119 if (!newSentence)
120 {
121 td->comma++;
122 newClause = true;
123 }
124
125 f.resize(f.length()-1);
126 }
127 } 91 }
128 92
129 if (f.length() > 0) 93 if (prefix.front().word.terminating)
130 { 94 {
131 if (f[0] == '"') 95 prefix.front() = wildcardQuery;
132 {
133 td->startquote++;
134 }
135 96
136 if (f[0] == '(') 97 token_data& td2 = tstats[prefix][word];
98 td2.word = word;
99 td2.all++;
100
101 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
137 { 102 {
138 td->startparen++; 103 td2.uppercase++;
139 } 104 } else if (isupper(f[0]))
140
141 if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')'))
142 { 105 {
143 if (f[f.length()-1] == '"') 106 td2.titlecase++;
144 {
145 td->endquote++;
146 } else if (f[f.length()-1] == ')')
147 {
148 td->endparen++;
149 }
150
151 f.resize(f.length()-1);
152
153 if (f.length() > 0)
154 {
155 if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
156 {
157 if (!newSentence)
158 {
159 td->period++;
160 newSentence = true;
161 }
162 } else if (f[f.length()-1] == ',')
163 {
164 if (!newSentence && !newClause)
165 {
166 td->comma++;
167 newClause = true;
168 }
169 }
170 }
171 }
172 }*/
173
174 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
175 {
176 td->uppercase++;
177 } else if (isupper(f[0]))
178 {
179 td->titlecase++;
180 }
181
182 /*if (k != 0)
183 {
184 if (newSentence)
185 {
186 i += k;
187 } 107 }
188 108 }
189 newSentence = false; 109 }
190 newClause = false; 110 }
191 }*/
192 }
193 }
194 111
195 stats = new std::map<kgram, std::map<int, token_data*>* >(); 112 for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
196 for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) 113 {
197 { 114 kgram klist = it->first;
198 kgram klist = it->first; 115 std::map<token, token_data>& probtable = it->second;
199 std::map<std::string, token_data*>* probtable = it->second; 116 std::map<int, token_data>& distribution = stats[klist];
200 std::map<int, token_data*>* distribution = new std::map<int, token_data*>(); 117 int max = 0;
201 int max = 0;
202 118
203 for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) 119 for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
204 { 120 {
205 max += kt->second->all; 121 max += kt->second.all;
206 122
207 (*distribution)[max] = kt->second; 123 distribution[max] = kt->second;
208 } 124 }
209 125 }
210 (*stats)[klist] = distribution; 126
211 } 127 for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
128 {
129 token word = it->first;
130 std::map<termstats, int>& probtable = it->second;
131 std::map<int, termstats>& distribution = endings[word];
132 int max = 0;
133
134 for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
135 {
136 max += kt->second;
137
138 distribution[max] = kt->first;
139 }
140 }
212} 141}
213 142
214void printKgram(kgram k) 143void printKgram(kgram k)
215{ 144{
216 for (kgram::iterator it = k.begin(); it != k.end(); it++) 145 for (kgram::iterator it = k.begin(); it != k.end(); it++)
217 { 146 {
218 std::cout << *it << " "; 147 query& q = *it;
219 } 148 if (q.type == querytype_sentence)
149 {
150 std::cout << "#.# ";
151 } else if (q.type == querytype_literal)
152 {
153 if (q.word.terminating)
154 {
155 std::cout << q.word.canon << ". ";
156 } else {
157 std::cout << q.word.canon << " ";
158 }
159 }
160 }
220} 161}
221 162
222// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 163// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
223std::vector<std::string> kgramstats::randomSentence(int n) 164std::vector<std::string> kgramstats::randomSentence(int n)
224{ 165{
225 std::vector<std::string> result; 166 std::vector<std::string> result;
226 kgram newKgram(1, "."); 167 kgram cur(1, wildcardQuery);
227 kgram commaKgram(1, ",");
228 std::list<std::string> cur;
229 int cuts = 0; 168 int cuts = 0;
230 169
231 for (int i=0; i<n; i++) 170 for (int i=0; i<n; i++)
232 { 171 {
233 if (cur.size() == maxK) 172 if (cur.size() == maxK)
234 { 173 {
235 cur.pop_front(); 174 cur.pop_front();
236 } 175 }
237 176
238 if ((cur.size() > 0) && (cur != newKgram)) 177 if (cur.size() > 0)
239 { 178 {
240 if (rand() % (maxK - cur.size() + 1) == 0) 179 if (rand() % (maxK - cur.size() + 1) == 0)
241 { 180 {
@@ -253,20 +192,19 @@ std::vector<std::string> kgramstats::randomSentence(int n)
253 192
254 cuts++; 193 cuts++;
255 } 194 }
195
196 // Gotta circumvent the last line of the input corpus
197 // https://twitter.com/starla4444/status/684222271339237376
198 if (stats.count(cur) == 0)
199 {
200 cur = kgram(1, wildcardQuery);
201 }
256 202
257 std::map<int, token_data*> distribution = *(*stats)[cur]; 203 std::map<int, token_data>& distribution = stats[cur];
258 int max = distribution.rbegin()->first; 204 int max = distribution.rbegin()->first;
259 int r = rand() % max; 205 int r = rand() % max;
260 token_data* next = distribution.upper_bound(r)->second; 206 token_data& next = distribution.upper_bound(r)->second;
261 207 std::string nextToken(next.word.canon);
262 std::string nextToken(*(next->token));
263 int casing = rand() % next->all;
264 /*int period = rand() % next->all;
265 int startparen = rand() % next->all;
266 int endparen = rand() % next->all;
267 int startquote = rand() % next->all;
268 int endquote = rand() % next->all;
269 int comma = rand() % next->all;*/
270 208
271 bool mess = (rand() % 100) == 0; 209 bool mess = (rand() % 100) == 0;
272 if (mess) 210 if (mess)
@@ -274,114 +212,64 @@ std::vector<std::string> kgramstats::randomSentence(int n)
274 nextToken = mstats.alternate(nextToken); 212 nextToken = mstats.alternate(nextToken);
275 } 213 }
276 214
277 if (casing < next->uppercase) 215 // Determine the casing of the next token. We randomly make the token all
278 { 216 // caps based on the markov chain. Otherwise, we check if the previous
279 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 217 // token is the end of a sentence (terminating token or a wildcard query).
280 } 218 int casing = rand() % next.all;
281 219 if (casing < next.uppercase)
282 if ((cur == newKgram) && (rand() % 15 > 0)) 220 {
221 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
222 } else if ((((cur.rbegin()->type == querytype_sentence)
223 || ((cur.rbegin()->type == querytype_literal)
224 && (cur.rbegin()->word.terminating)))
225 && (rand() % 2 > 0))
226 || (casing - next.uppercase < next.titlecase))
283 { 227 {
284 nextToken[0] = toupper(nextToken[0]); 228 nextToken[0] = toupper(nextToken[0]);
285 } 229 }
286 230
287 /*if (startquote < next->startquote) 231 if (next.word.terminating)
288 {
289 nextToken = "\"" + nextToken;
290 } else if (startparen < next->startparen)
291 { 232 {
292 nextToken = "(" + nextToken; 233 std::map<int, termstats>& ending = endings[next.word];
234 int emax = ending.rbegin()->first;
235 int er = rand() % emax;
236 termstats& nextend = ending.upper_bound(er)->second;
237
238 nextToken.append(std::string(nextend.occurrences, nextend.terminator));
293 } 239 }
294
295 if (period < next->period)
296 {
297 if (endquote < next->endquote)
298 {
299 nextToken += "\"";
300 } else if (endparen < next->endparen)
301 {
302 nextToken += ")";
303 }
304
305 int type = rand() % 6;
306
307 if (type < 3)
308 {
309 nextToken += ".";
310 } else if (type < 5)
311 {
312 nextToken += "!";
313 } else {
314 nextToken += "?";
315 }
316 } else if (comma < next->comma)
317 {
318 if (endquote < next->endquote)
319 {
320 nextToken += "\"";
321 } else if (endparen < next->endparen)
322 {
323 nextToken += ")";
324 }
325
326 nextToken += ",";
327 }*/
328 240
329 /* DEBUG */ 241 /* DEBUG */
330 for (kgram::iterator it = cur.begin(); it != cur.end(); it++) 242 printKgram(cur);
331 {
332 std::cout << *it << " ";
333 }
334 243
335 std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")"; 244 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
336 245
337 if (mess) 246 if (mess)
338 { 247 {
339 std::cout << " mala " << *(next->token); 248 std::cout << " mala " << next.word.canon;
340 } 249 }
341 250
342 std::cout << std::endl; 251 std::cout << std::endl;
343 252
344 /*if ((cur == newKgram) || (cur == commaKgram)) 253 cur.push_back(next.word);
345 {
346 cur.pop_front();
347 }
348
349 if (period < next->period)// && ((rand() % 3) != 0))
350 {
351 cur = newKgram;
352 } else if ((comma < next->comma) && ((rand() % 3) == 0))
353 {
354 cur = commaKgram;
355 } else {*/
356 //if (mess && (rand() % 2 == 0))
357 if (false)
358 {
359 // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
360 cur.clear();
361 cur.push_back(nextToken);
362 } else {
363 cur.push_back(*(next->token));
364 }
365 //}
366 254
367 result.push_back(nextToken); 255 result.push_back(nextToken);
368 } 256 }
369 257
370 return result; 258 return result;
371} 259}
372 260
373bool removeIf(char c) 261bool removeIf(char c)
374{ 262{
375 return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n')); 263 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
376} 264}
377 265
378std::string canonize(std::string f) 266std::string canonize(std::string f)
379{ 267{
380 std::string canonical(f); 268 std::string canonical(f);
381 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); 269 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
382 270
383 std::string result; 271 std::string result;
384 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 272 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
385 273
386 return canonical; 274 return result;
387} 275}
diff --git a/kgramstats.h b/kgramstats.h index b01dece..ca61df7 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -7,7 +7,71 @@
7#ifndef KGRAMSTATS_H 7#ifndef KGRAMSTATS_H
8#define KGRAMSTATS_H 8#define KGRAMSTATS_H
9 9
10typedef std::list<std::string> kgram; 10struct token {
11 std::string canon;
12 bool terminating;
13
14 token(std::string canon) : canon(canon), terminating(false) {}
15
16 bool operator<(const token& other) const
17 {
18 if (canon == other.canon)
19 {
20 return !terminating && other.terminating;
21 } else {
22 return canon < other.canon;
23 }
24 }
25};
26
27enum querytype {
28 querytype_literal,
29 querytype_sentence
30};
31
32struct query {
33 querytype type;
34 token word;
35
36 query(token word) : word(word), type(querytype_literal) {}
37
38 query(querytype type) : word(""), type(type) {}
39
40 bool operator<(const query& other) const
41 {
42 if (type == other.type)
43 {
44 return word < other.word;
45 } else {
46 return type < other.type;
47 }
48 }
49};
50
51typedef std::list<query> kgram;
52
53struct termstats {
54 char terminator;
55 int occurrences;
56
57 termstats() : terminator('.'), occurrences(1) {}
58
59 termstats(char terminator, int occurrences)
60 {
61 this->terminator = terminator;
62 this->occurrences = occurrences;
63 }
64
65 bool operator<(const termstats& other) const
66 {
67 if (terminator == other.terminator)
68 {
69 return occurrences < other.occurrences;
70 } else {
71 return terminator < other.terminator;
72 }
73 }
74};
11 75
12class kgramstats 76class kgramstats
13{ 77{
@@ -16,22 +80,20 @@ public:
16 std::vector<std::string> randomSentence(int n); 80 std::vector<std::string> randomSentence(int n);
17 81
18private: 82private:
19 typedef struct 83 struct token_data
20 { 84 {
21 int all; 85 int all;
22 int titlecase; 86 int titlecase;
23 int uppercase; 87 int uppercase;
24 int period; 88 token word;
25 int startquote; 89
26 int endquote; 90 token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
27 int startparen; 91 };
28 int endparen; 92
29 int comma;
30 std::string* token;
31 } token_data;
32 int maxK; 93 int maxK;
33 std::map<kgram, std::map<int, token_data*>* >* stats; 94 std::map<kgram, std::map<int, token_data> > stats;
34 malaprop mstats; 95 malaprop mstats;
96 std::map<token, std::map<int, termstats> > endings;
35}; 97};
36 98
37void printKgram(kgram k); 99void printKgram(kgram k);
diff --git a/malaprop.cpp b/malaprop.cpp index 7fbdb6c..ccdd4c4 100644 --- a/malaprop.cpp +++ b/malaprop.cpp
@@ -117,6 +117,11 @@ std::string malaprop::alternate(std::string word)
117{ 117{
118 soundex ex = soundify(word); 118 soundex ex = soundify(word);
119 std::set<std::string>& opts = dict[ex]; 119 std::set<std::string>& opts = dict[ex];
120 if (opts.size() == 0)
121 {
122 return word;
123 }
124
120 int opt = rand() % opts.size(); 125 int opt = rand() % opts.size();
121 for (std::set<std::string>::iterator it = opts.begin(); it != opts.end(); it++) 126 for (std::set<std::string>::iterator it = opts.begin(); it != opts.end(); it++)
122 { 127 {