about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--ebooks.cpp316
-rw-r--r--gen.cpp98
-rw-r--r--kgramstats.cpp442
-rw-r--r--kgramstats.h84
-rw-r--r--malaprop.cpp1
5 files changed, 439 insertions, 502 deletions
diff --git a/ebooks.cpp b/ebooks.cpp index 27591f4..a24bd8d 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -14,174 +14,168 @@
14 14
15int main(int argc, char** args) 15int main(int argc, char** args)
16{ 16{
17 srand(time(NULL)); 17 srand(time(NULL));
18 18
19 YAML::Node config = YAML::LoadFile("config.yml"); 19 YAML::Node config = YAML::LoadFile("config.yml");
20 int delay = config["delay"].as<int>(); 20 int delay = config["delay"].as<int>();
21 21
22 std::ifstream infile(config["corpus"].as<std::string>().c_str()); 22 std::ifstream infile(config["corpus"].as<std::string>().c_str());
23 std::string corpus; 23 std::string corpus;
24 std::string line; 24 std::string line;
25 while (getline(infile, line)) 25 while (getline(infile, line))
26 { 26 {
27 corpus += line + "\n "; 27 corpus += line + "\n ";
28 } 28 }
29 29
30 std::cout << "Preprocessing corpus..." << std::endl; 30 std::cout << "Preprocessing corpus..." << std::endl;
31 kgramstats* stats = new kgramstats(corpus, 4); 31 kgramstats* stats = new kgramstats(corpus, 4);
32 32
33 std::cout << "Preprocessing freevars..." << std::endl; 33 std::cout << "Preprocessing freevars..." << std::endl;
34 freevars* vars = new freevars(); 34 freevars* vars = new freevars();
35 vars->addVar("name", "names.txt"); 35 vars->addVar("name", "names.txt");
36 vars->addVar("noun", "nouns.txt"); 36 vars->addVar("noun", "nouns.txt");
37 37
38 std::cout << "Generating..." << std::endl; 38 std::cout << "Generating..." << std::endl;
39 for (;;) 39 for (;;)
40 { 40 {
41 std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); 41 std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
42 std::string hi; 42 std::string hi;
43 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) 43 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
44 {
45 hi += vars->parse(*it) + " ";
46 }
47
48 size_t firstperiod = hi.find_first_of(".!?");
49 if (firstperiod != std::string::npos)
50 { 44 {
51 hi = hi.substr(firstperiod+2); 45 hi += vars->parse(*it) + " ";
52 } 46 }
53 47
54 hi.resize(140); 48 hi.resize(140);
55 49
56 size_t lastperiod = hi.find_last_of(".!?"); 50 size_t lastperiod = hi.find_last_of(".!?,");
57 if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) 51 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
58 { 52 {
59 hi = hi.substr(0, lastperiod+1); 53 hi = hi.substr(0, lastperiod+1);
60 } 54 }
61 55
62 twitCurl twitterObj; 56 twitCurl twitterObj;
63 std::string tmpStr, tmpStr2; 57 std::string tmpStr, tmpStr2;
64 std::string replyMsg; 58 std::string replyMsg;
65 char tmpBuf[1024]; 59 char tmpBuf[1024];
66 std::string username(config["username"].as<std::string>()); 60 std::string username(config["username"].as<std::string>());
67 std::string password(config["password"].as<std::string>()); 61 std::string password(config["password"].as<std::string>());
68 62
69 /* Set twitter username and password */ 63 /* Set twitter username and password */
70 twitterObj.setTwitterUsername(username); 64 twitterObj.setTwitterUsername(username);
71 twitterObj.setTwitterPassword(password); 65 twitterObj.setTwitterPassword(password);
72 66
73 /* OAuth flow begins */ 67 /* OAuth flow begins */
74 /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */ 68 /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */
75 twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() ); 69 twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() );
76 twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() ); 70 twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() );
77 71
78 /* Step 1: Check if we alredy have OAuth access token from a previous run */ 72 /* Step 1: Check if we alredy have OAuth access token from a previous run */
79 std::string myOAuthAccessTokenKey(""); 73 std::string myOAuthAccessTokenKey("");
80 std::string myOAuthAccessTokenSecret(""); 74 std::string myOAuthAccessTokenSecret("");
81 std::ifstream oAuthTokenKeyIn; 75 std::ifstream oAuthTokenKeyIn;
82 std::ifstream oAuthTokenSecretIn; 76 std::ifstream oAuthTokenSecretIn;
83 77
84 oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); 78 oAuthTokenKeyIn.open( "twitterClient_token_key.txt" );
85 oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); 79 oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" );
86 80
87 memset( tmpBuf, 0, 1024 ); 81 memset( tmpBuf, 0, 1024 );
88 oAuthTokenKeyIn >> tmpBuf; 82 oAuthTokenKeyIn >> tmpBuf;
89 myOAuthAccessTokenKey = tmpBuf; 83 myOAuthAccessTokenKey = tmpBuf;
90 84
91 memset( tmpBuf, 0, 1024 ); 85 memset( tmpBuf, 0, 1024 );
92 oAuthTokenSecretIn >> tmpBuf; 86 oAuthTokenSecretIn >> tmpBuf;
93 myOAuthAccessTokenSecret = tmpBuf; 87 myOAuthAccessTokenSecret = tmpBuf;
94 88
95 oAuthTokenKeyIn.close(); 89 oAuthTokenKeyIn.close();
96 oAuthTokenSecretIn.close(); 90 oAuthTokenSecretIn.close();
97 91
98 if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) 92 if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() )
99 { 93 {
100 /* If we already have these keys, then no need to go through auth again */ 94 /* If we already have these keys, then no need to go through auth again */
101 printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); 95 printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() );
102 96
103 twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); 97 twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey );
104 twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); 98 twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret );
105 } 99 }
106 else 100 else
107 { 101 {
108 /* Step 2: Get request token key and secret */ 102 /* Step 2: Get request token key and secret */
109 std::string authUrl; 103 std::string authUrl;
110 twitterObj.oAuthRequestToken( authUrl ); 104 twitterObj.oAuthRequestToken( authUrl );
111 105
112 /* Step 3: Get PIN */ 106 /* Step 3: Get PIN */
113 memset( tmpBuf, 0, 1024 ); 107 memset( tmpBuf, 0, 1024 );
114 printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); 108 printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " );
115 gets( tmpBuf ); 109 gets( tmpBuf );
116 tmpStr = tmpBuf; 110 tmpStr = tmpBuf;
117 if( std::string::npos != tmpStr.find( "1" ) ) 111 if( std::string::npos != tmpStr.find( "1" ) )
118 { 112 {
119 /* Ask user to visit twitter.com auth page and get PIN */ 113 /* Ask user to visit twitter.com auth page and get PIN */
120 memset( tmpBuf, 0, 1024 ); 114 memset( tmpBuf, 0, 1024 );
121 printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); 115 printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() );
122 printf( "\nEnter the PIN provided by twitter: " ); 116 printf( "\nEnter the PIN provided by twitter: " );
123 gets( tmpBuf ); 117 gets( tmpBuf );
124 tmpStr = tmpBuf; 118 tmpStr = tmpBuf;
125 twitterObj.getOAuth().setOAuthPin( tmpStr ); 119 twitterObj.getOAuth().setOAuthPin( tmpStr );
126 } 120 }
127 else 121 else
128 { 122 {
129 /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */ 123 /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */
130 twitterObj.oAuthHandlePIN( authUrl ); 124 twitterObj.oAuthHandlePIN( authUrl );
131 } 125 }
132 126
133 /* Step 4: Exchange request token with access token */ 127 /* Step 4: Exchange request token with access token */
134 twitterObj.oAuthAccessToken(); 128 twitterObj.oAuthAccessToken();
135 129
136 /* Step 5: Now, save this access token key and secret for future use without PIN */ 130 /* Step 5: Now, save this access token key and secret for future use without PIN */
137 twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); 131 twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey );
138 twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); 132 twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret );
139 133
140 /* Step 6: Save these keys in a file or wherever */ 134 /* Step 6: Save these keys in a file or wherever */
141 std::ofstream oAuthTokenKeyOut; 135 std::ofstream oAuthTokenKeyOut;
142 std::ofstream oAuthTokenSecretOut; 136 std::ofstream oAuthTokenSecretOut;
143 137
144 oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); 138 oAuthTokenKeyOut.open( "twitterClient_token_key.txt" );
145 oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); 139 oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" );
146 140
147 oAuthTokenKeyOut.clear(); 141 oAuthTokenKeyOut.clear();
148 oAuthTokenSecretOut.clear(); 142 oAuthTokenSecretOut.clear();
149 143
150 oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); 144 oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str();
151 oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); 145 oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str();
152 146
153 oAuthTokenKeyOut.close(); 147 oAuthTokenKeyOut.close();
154 oAuthTokenSecretOut.close(); 148 oAuthTokenSecretOut.close();
155 } 149 }
156 /* OAuth flow ends */ 150 /* OAuth flow ends */
157 151
158 /* Account credentials verification */ 152 /* Account credentials verification */
159 if( twitterObj.accountVerifyCredGet() ) 153 if( twitterObj.accountVerifyCredGet() )
160 { 154 {
161 twitterObj.getLastWebResponse( replyMsg ); 155 twitterObj.getLastWebResponse( replyMsg );
162 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); 156 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() );
163 } 157 }
164 else 158 else
165 { 159 {
166 twitterObj.getLastCurlError( replyMsg ); 160 twitterObj.getLastCurlError( replyMsg );
167 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); 161 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() );
168 } 162 }
169 163
170 /* Post a new status message */ 164 /* Post a new status message */
171 replyMsg = ""; 165 replyMsg = "";
172 if( twitterObj.statusUpdate( hi ) ) 166 if( twitterObj.statusUpdate( hi ) )
173 { 167 {
174 twitterObj.getLastWebResponse( replyMsg ); 168 twitterObj.getLastWebResponse( replyMsg );
175 printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); 169 printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() );
176 } 170 }
177 else 171 else
178 { 172 {
179 twitterObj.getLastCurlError( replyMsg ); 173 twitterObj.getLastCurlError( replyMsg );
180 printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); 174 printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() );
181 } 175 }
182 176
183 sleep(rand() % delay); 177 sleep(rand() % delay);
184 } 178 }
185 179
186 return 0; 180 return 0;
187} 181}
diff --git a/gen.cpp b/gen.cpp index 7e47d45..400c0a5 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -11,72 +11,66 @@
11 11
12int main(int argc, char** args) 12int main(int argc, char** args)
13{ 13{
14 srand(time(NULL)); 14 srand(time(NULL));
15 15
16 if (argc == 1) 16 if (argc == 1)
17 { 17 {
18 std::cout << "rawr-gen, version 1.0" << std::endl; 18 std::cout << "rawr-gen, version 1.0" << std::endl;
19 std::cout << "Usage: rawr-gen corpus-file" << std::endl; 19 std::cout << "Usage: rawr-gen corpus-file" << std::endl;
20 std::cout << " where 'corpus-file' is the path to your input" << std::endl; 20 std::cout << " where 'corpus-file' is the path to your input" << std::endl;
21 21
22 return 0; 22 return 0;
23 } 23 }
24 24
25 std::ifstream infile(args[1]); 25 std::ifstream infile(args[1]);
26 if (!infile) 26 if (!infile)
27 { 27 {
28 std::cout << "rawr-gen, version 1.0" << std::endl; 28 std::cout << "rawr-gen, version 1.0" << std::endl;
29 std::cout << "Usage: rawr-gen corpus-file" << std::endl; 29 std::cout << "Usage: rawr-gen corpus-file" << std::endl;
30 std::cout << " where 'corpus-file' is the path to your input" << std::endl; 30 std::cout << " where 'corpus-file' is the path to your input" << std::endl;
31 std::cout << std::endl; 31 std::cout << std::endl;
32 std::cout << "The file you specified does not exist." << std::endl; 32 std::cout << "The file you specified does not exist." << std::endl;
33 33
34 return 0; 34 return 0;
35 } 35 }
36 36
37 std::string corpus; 37 std::string corpus;
38 std::string line; 38 std::string line;
39 while (getline(infile, line)) 39 while (getline(infile, line))
40 { 40 {
41 corpus += line + "\n "; 41 corpus += line + "\n ";
42 } 42 }
43 43
44 std::cout << "Preprocessing corpus..." << std::endl; 44 std::cout << "Preprocessing corpus..." << std::endl;
45 kgramstats* stats = new kgramstats(corpus, 4); 45 kgramstats* stats = new kgramstats(corpus, 4);
46 46
47 std::cout << "Preprocessing freevars..." << std::endl; 47 std::cout << "Preprocessing freevars..." << std::endl;
48 freevars* vars = new freevars(); 48 freevars* vars = new freevars();
49 vars->addVar("name", "names.txt"); 49 vars->addVar("name", "names.txt");
50 vars->addVar("noun", "nouns.txt"); 50 vars->addVar("noun", "nouns.txt");
51 51
52 std::cout << "Generating..." << std::endl; 52 std::cout << "Generating..." << std::endl;
53 for (;;) 53 for (;;)
54 { 54 {
55 std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45); 55 std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
56 std::string hi; 56 std::string hi;
57 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) 57 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
58 {
59 hi += vars->parse(*it) + " ";
60 }
61
62 size_t firstperiod = hi.find_first_of(".!?");
63 if (firstperiod != std::string::npos)
64 { 58 {
65 hi = hi.substr(firstperiod+2); 59 hi += vars->parse(*it) + " ";
66 } 60 }
67 61
68 hi.resize(140); 62 hi.resize(140);
69 63
70 size_t lastperiod = hi.find_last_of(".!?"); 64 size_t lastperiod = hi.find_last_of(".!?,");
71 if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) 65 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
72 { 66 {
73 hi = hi.substr(0, lastperiod+1); 67 hi = hi.substr(0, lastperiod+1);
74 } 68 }
75 69
76 std::cout << hi << std::endl; 70 std::cout << hi << std::endl;
77 71
78 getc(stdin); 72 getc(stdin);
79 } 73 }
80 74
81 return 0; 75 return 0;
82} 76}
diff --git a/kgramstats.cpp b/kgramstats.cpp index b0ec68a..c88d83c 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -5,237 +5,176 @@
5#include <algorithm> 5#include <algorithm>
6#include "malaprop.h" 6#include "malaprop.h"
7 7
8query wildcardQuery(querytype_sentence);
9
8std::string canonize(std::string f); 10std::string canonize(std::string f);
9 11
10// runs in O(t^2) time where t is the number of tokens in the input corpus 12// runs in O(t^2) time where t is the number of tokens in the input corpus
11// We consider maxK to be fairly constant 13// We consider maxK to be fairly constant
12kgramstats::kgramstats(std::string corpus, int maxK) 14kgramstats::kgramstats(std::string corpus, int maxK)
13{ 15{
14 this->maxK = maxK; 16 this->maxK = maxK;
15 17
16 std::vector<std::string> tokens; 18 std::vector<std::string> tokens;
17 size_t start = 0; 19 size_t start = 0;
18 int end = 0; 20 int end = 0;
19 21
20 while (end != std::string::npos) 22 while (end != std::string::npos)
21 { 23 {
22 end = corpus.find(" ", start); 24 end = corpus.find(" ", start);
23 25
24 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 26 std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
25 if (token[token.length()-1] == '\n') 27 if (token[token.length()-1] == '\n')
26 { 28 {
27 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?')) 29 if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ','))
28 { 30 {
29 token.insert(token.length()-1, "."); 31 token.insert(token.length()-1, ".");
30 } 32 }
31 33
32 token.resize(token.length()-1); 34 token.resize(token.length()-1);
33 } 35 }
34 36
35 if (token.compare("") && token.compare(".")) 37 if (token.compare("") && token.compare("."))
36 { 38 {
37 mstats.addWord(token); 39 mstats.addWord(token);
38 tokens.push_back(token); 40 tokens.push_back(token);
39 } 41 }
40 42
41 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 43 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
42 } 44 }
43 45
44 std::map<kgram, std::map<std::string, token_data*>* > tstats; 46 std::map<kgram, std::map<token, token_data> > tstats;
45 bool newSentence = true; 47 std::map<token, std::map<termstats, int> > tendings;
46 bool newClause = false; 48 for (int k=1; k<maxK; k++)
47 for (int k=0; k<maxK; k++) 49 {
48 { 50 for (int i=0; i<(tokens.size() - k); i++)
49 for (int i=0; i<(tokens.size() - k); i++) 51 {
50 { 52 std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k);
51 kgram seq(tokens.begin()+i, tokens.begin()+i+k); 53 kgram prefix;
52 std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
53 std::string f = tokens[i+k];
54
55
56
57 std::string canonical = canonize(f);
58
59 if (tstats[seq] == NULL)
60 {
61 tstats[seq] = new std::map<std::string, token_data*>();
62 }
63
64 if ((*tstats[seq])[canonical] == NULL)
65 {
66 (*tstats[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data));
67 }
68
69 token_data* td = tstats[seq]->at(canonical);
70 td->token = new std::string(canonical);
71 td->all++;
72 54
73 /*if (newSentence) 55 for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
74 { 56 {
75 kgram newKgram(1, "."); 57 token word(canonize(*it));
76 if (tstats[newKgram] == NULL) 58
59 if (it->find_first_of(".?!,") != std::string::npos)
77 { 60 {
78 tstats[newKgram] = new std::map<std::string, token_data*>(); 61 word.terminating = true;
79 } 62 }
80 63
81 (*tstats[newKgram])[canonical] = td; 64 prefix.push_back(word);
82
83 newSentence = false;
84 } 65 }
85 66
86 if (newClause) 67 std::string f = tokens[i+k];
68 std::string canonical = canonize(f);
69
70 token word(canonical);
71 if (f.find_first_of(".?!,") != std::string::npos)
87 { 72 {
88 kgram commaKgram(1, ","); 73 word.terminating = true;
89 if (tstats[commaKgram] == NULL)
90 {
91 tstats[commaKgram] = new std::map<std::string, token_data*>();
92 }
93 74
94 (*tstats[commaKgram])[canonical] = td; 75 char terminator = f[f.find_last_of(".?!,")];
76 int occurrences = std::count(f.begin(), f.end(), terminator);
95 77
96 newClause = false; 78 tendings[word][termstats(terminator, occurrences)]++;
97 }
98
99 if ((f.length() > 0) && (f[f.length()-1] == '\n'))
100 {
101 td->period++;
102 newSentence = true;
103 f.resize(f.length()-1);
104 } 79 }
105 80
106 if (f.length() > 0) 81 token_data& td = tstats[prefix][word];
82 td.word = word;
83 td.all++;
84
85 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
107 { 86 {
108 if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) 87 td.uppercase++;
109 { 88 } else if (isupper(f[0]))
110 if (!newSentence) 89 {
111 { 90 td.titlecase++;
112 td->period++;
113 newSentence = true;
114 }
115
116 f.resize(f.length()-1);
117 } else if (f[f.length()-1] == ',')
118 {
119 if (!newSentence)
120 {
121 td->comma++;
122 newClause = true;
123 }
124
125 f.resize(f.length()-1);
126 }
127 } 91 }
128 92
129 if (f.length() > 0) 93 if (prefix.front().word.terminating)
130 { 94 {
131 if (f[0] == '"') 95 prefix.front() = wildcardQuery;
132 {
133 td->startquote++;
134 }
135 96
136 if (f[0] == '(') 97 token_data& td2 = tstats[prefix][word];
98 td2.word = word;
99 td2.all++;
100
101 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
137 { 102 {
138 td->startparen++; 103 td2.uppercase++;
139 } 104 } else if (isupper(f[0]))
140
141 if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')'))
142 { 105 {
143 if (f[f.length()-1] == '"') 106 td2.titlecase++;
144 {
145 td->endquote++;
146 } else if (f[f.length()-1] == ')')
147 {
148 td->endparen++;
149 }
150
151 f.resize(f.length()-1);
152
153 if (f.length() > 0)
154 {
155 if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
156 {
157 if (!newSentence)
158 {
159 td->period++;
160 newSentence = true;
161 }
162 } else if (f[f.length()-1] == ',')
163 {
164 if (!newSentence && !newClause)
165 {
166 td->comma++;
167 newClause = true;
168 }
169 }
170 }
171 }
172 }*/
173
174 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
175 {
176 td->uppercase++;
177 } else if (isupper(f[0]))
178 {
179 td->titlecase++;
180 }
181
182 /*if (k != 0)
183 {
184 if (newSentence)
185 {
186 i += k;
187 } 107 }
188 108 }
189 newSentence = false; 109 }
190 newClause = false; 110 }
191 }*/
192 }
193 }
194 111
195 stats = new std::map<kgram, std::map<int, token_data*>* >(); 112 for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++)
196 for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) 113 {
197 { 114 kgram klist = it->first;
198 kgram klist = it->first; 115 std::map<token, token_data>& probtable = it->second;
199 std::map<std::string, token_data*>* probtable = it->second; 116 std::map<int, token_data>& distribution = stats[klist];
200 std::map<int, token_data*>* distribution = new std::map<int, token_data*>(); 117 int max = 0;
201 int max = 0;
202 118
203 for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) 119 for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
204 { 120 {
205 max += kt->second->all; 121 max += kt->second.all;
206 122
207 (*distribution)[max] = kt->second; 123 distribution[max] = kt->second;
208 } 124 }
209 125 }
210 (*stats)[klist] = distribution; 126
211 } 127 for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++)
128 {
129 token word = it->first;
130 std::map<termstats, int>& probtable = it->second;
131 std::map<int, termstats>& distribution = endings[word];
132 int max = 0;
133
134 for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++)
135 {
136 max += kt->second;
137
138 distribution[max] = kt->first;
139 }
140 }
212} 141}
213 142
214void printKgram(kgram k) 143void printKgram(kgram k)
215{ 144{
216 for (kgram::iterator it = k.begin(); it != k.end(); it++) 145 for (kgram::iterator it = k.begin(); it != k.end(); it++)
217 { 146 {
218 std::cout << *it << " "; 147 query& q = *it;
219 } 148 if (q.type == querytype_sentence)
149 {
150 std::cout << "#.# ";
151 } else if (q.type == querytype_literal)
152 {
153 if (q.word.terminating)
154 {
155 std::cout << q.word.canon << ". ";
156 } else {
157 std::cout << q.word.canon << " ";
158 }
159 }
160 }
220} 161}
221 162
222// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 163// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
223std::vector<std::string> kgramstats::randomSentence(int n) 164std::vector<std::string> kgramstats::randomSentence(int n)
224{ 165{
225 std::vector<std::string> result; 166 std::vector<std::string> result;
226 kgram newKgram(1, "."); 167 kgram cur(1, wildcardQuery);
227 kgram commaKgram(1, ",");
228 std::list<std::string> cur;
229 int cuts = 0; 168 int cuts = 0;
230 169
231 for (int i=0; i<n; i++) 170 for (int i=0; i<n; i++)
232 { 171 {
233 if (cur.size() == maxK) 172 if (cur.size() == maxK)
234 { 173 {
235 cur.pop_front(); 174 cur.pop_front();
236 } 175 }
237 176
238 if ((cur.size() > 0) && (cur != newKgram)) 177 if (cur.size() > 0)
239 { 178 {
240 if (rand() % (maxK - cur.size() + 1) == 0) 179 if (rand() % (maxK - cur.size() + 1) == 0)
241 { 180 {
@@ -253,20 +192,19 @@ std::vector<std::string> kgramstats::randomSentence(int n)
253 192
254 cuts++; 193 cuts++;
255 } 194 }
195
196 // Gotta circumvent the last line of the input corpus
197 // https://twitter.com/starla4444/status/684222271339237376
198 if (stats.count(cur) == 0)
199 {
200 cur = kgram(1, wildcardQuery);
201 }
256 202
257 std::map<int, token_data*> distribution = *(*stats)[cur]; 203 std::map<int, token_data>& distribution = stats[cur];
258 int max = distribution.rbegin()->first; 204 int max = distribution.rbegin()->first;
259 int r = rand() % max; 205 int r = rand() % max;
260 token_data* next = distribution.upper_bound(r)->second; 206 token_data& next = distribution.upper_bound(r)->second;
261 207 std::string nextToken(next.word.canon);
262 std::string nextToken(*(next->token));
263 int casing = rand() % next->all;
264 /*int period = rand() % next->all;
265 int startparen = rand() % next->all;
266 int endparen = rand() % next->all;
267 int startquote = rand() % next->all;
268 int endquote = rand() % next->all;
269 int comma = rand() % next->all;*/
270 208
271 bool mess = (rand() % 100) == 0; 209 bool mess = (rand() % 100) == 0;
272 if (mess) 210 if (mess)
@@ -274,114 +212,64 @@ std::vector<std::string> kgramstats::randomSentence(int n)
274 nextToken = mstats.alternate(nextToken); 212 nextToken = mstats.alternate(nextToken);
275 } 213 }
276 214
277 if (casing < next->uppercase) 215 // Determine the casing of the next token. We randomly make the token all
278 { 216 // caps based on the markov chain. Otherwise, we check if the previous
279 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 217 // token is the end of a sentence (terminating token or a wildcard query).
280 } 218 int casing = rand() % next.all;
281 219 if (casing < next.uppercase)
282 if ((cur == newKgram) && (rand() % 15 > 0)) 220 {
221 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
222 } else if ((((cur.rbegin()->type == querytype_sentence)
223 || ((cur.rbegin()->type == querytype_literal)
224 && (cur.rbegin()->word.terminating)))
225 && (rand() % 2 > 0))
226 || (casing - next.uppercase < next.titlecase))
283 { 227 {
284 nextToken[0] = toupper(nextToken[0]); 228 nextToken[0] = toupper(nextToken[0]);
285 } 229 }
286 230
287 /*if (startquote < next->startquote) 231 if (next.word.terminating)
288 {
289 nextToken = "\"" + nextToken;
290 } else if (startparen < next->startparen)
291 { 232 {
292 nextToken = "(" + nextToken; 233 std::map<int, termstats>& ending = endings[next.word];
234 int emax = ending.rbegin()->first;
235 int er = rand() % emax;
236 termstats& nextend = ending.upper_bound(er)->second;
237
238 nextToken.append(std::string(nextend.occurrences, nextend.terminator));
293 } 239 }
294
295 if (period < next->period)
296 {
297 if (endquote < next->endquote)
298 {
299 nextToken += "\"";
300 } else if (endparen < next->endparen)
301 {
302 nextToken += ")";
303 }
304
305 int type = rand() % 6;
306
307 if (type < 3)
308 {
309 nextToken += ".";
310 } else if (type < 5)
311 {
312 nextToken += "!";
313 } else {
314 nextToken += "?";
315 }
316 } else if (comma < next->comma)
317 {
318 if (endquote < next->endquote)
319 {
320 nextToken += "\"";
321 } else if (endparen < next->endparen)
322 {
323 nextToken += ")";
324 }
325
326 nextToken += ",";
327 }*/
328 240
329 /* DEBUG */ 241 /* DEBUG */
330 for (kgram::iterator it = cur.begin(); it != cur.end(); it++) 242 printKgram(cur);
331 {
332 std::cout << *it << " ";
333 }
334 243
335 std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")"; 244 std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")";
336 245
337 if (mess) 246 if (mess)
338 { 247 {
339 std::cout << " mala " << *(next->token); 248 std::cout << " mala " << next.word.canon;
340 } 249 }
341 250
342 std::cout << std::endl; 251 std::cout << std::endl;
343 252
344 /*if ((cur == newKgram) || (cur == commaKgram)) 253 cur.push_back(next.word);
345 {
346 cur.pop_front();
347 }
348
349 if (period < next->period)// && ((rand() % 3) != 0))
350 {
351 cur = newKgram;
352 } else if ((comma < next->comma) && ((rand() % 3) == 0))
353 {
354 cur = commaKgram;
355 } else {*/
356 //if (mess && (rand() % 2 == 0))
357 if (false)
358 {
359 // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
360 cur.clear();
361 cur.push_back(nextToken);
362 } else {
363 cur.push_back(*(next->token));
364 }
365 //}
366 254
367 result.push_back(nextToken); 255 result.push_back(nextToken);
368 } 256 }
369 257
370 return result; 258 return result;
371} 259}
372 260
373bool removeIf(char c) 261bool removeIf(char c)
374{ 262{
375 return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n')); 263 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/);
376} 264}
377 265
378std::string canonize(std::string f) 266std::string canonize(std::string f)
379{ 267{
380 std::string canonical(f); 268 std::string canonical(f);
381 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); 269 std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
382 270
383 std::string result; 271 std::string result;
384 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); 272 std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
385 273
386 return canonical; 274 return result;
387} 275}
diff --git a/kgramstats.h b/kgramstats.h index b01dece..ca61df7 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -7,7 +7,71 @@
7#ifndef KGRAMSTATS_H 7#ifndef KGRAMSTATS_H
8#define KGRAMSTATS_H 8#define KGRAMSTATS_H
9 9
10typedef std::list<std::string> kgram; 10struct token {
11 std::string canon;
12 bool terminating;
13
14 token(std::string canon) : canon(canon), terminating(false) {}
15
16 bool operator<(const token& other) const
17 {
18 if (canon == other.canon)
19 {
20 return !terminating && other.terminating;
21 } else {
22 return canon < other.canon;
23 }
24 }
25};
26
27enum querytype {
28 querytype_literal,
29 querytype_sentence
30};
31
32struct query {
33 querytype type;
34 token word;
35
36 query(token word) : word(word), type(querytype_literal) {}
37
38 query(querytype type) : word(""), type(type) {}
39
40 bool operator<(const query& other) const
41 {
42 if (type == other.type)
43 {
44 return word < other.word;
45 } else {
46 return type < other.type;
47 }
48 }
49};
50
51typedef std::list<query> kgram;
52
53struct termstats {
54 char terminator;
55 int occurrences;
56
57 termstats() : terminator('.'), occurrences(1) {}
58
59 termstats(char terminator, int occurrences)
60 {
61 this->terminator = terminator;
62 this->occurrences = occurrences;
63 }
64
65 bool operator<(const termstats& other) const
66 {
67 if (terminator == other.terminator)
68 {
69 return occurrences < other.occurrences;
70 } else {
71 return terminator < other.terminator;
72 }
73 }
74};
11 75
12class kgramstats 76class kgramstats
13{ 77{
@@ -16,22 +80,20 @@ public:
16 std::vector<std::string> randomSentence(int n); 80 std::vector<std::string> randomSentence(int n);
17 81
18private: 82private:
19 typedef struct 83 struct token_data
20 { 84 {
21 int all; 85 int all;
22 int titlecase; 86 int titlecase;
23 int uppercase; 87 int uppercase;
24 int period; 88 token word;
25 int startquote; 89
26 int endquote; 90 token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
27 int startparen; 91 };
28 int endparen; 92
29 int comma;
30 std::string* token;
31 } token_data;
32 int maxK; 93 int maxK;
33 std::map<kgram, std::map<int, token_data*>* >* stats; 94 std::map<kgram, std::map<int, token_data> > stats;
34 malaprop mstats; 95 malaprop mstats;
96 std::map<token, std::map<int, termstats> > endings;
35}; 97};
36 98
37void printKgram(kgram k); 99void printKgram(kgram k);
diff --git a/malaprop.cpp b/malaprop.cpp index c308d34..ff0cb5d 100644 --- a/malaprop.cpp +++ b/malaprop.cpp
@@ -119,7 +119,6 @@ std::string malaprop::alternate(std::string word)
119 std::set<std::string>& opts = dict[ex]; 119 std::set<std::string>& opts = dict[ex];
120 if (opts.size() == 0) 120 if (opts.size() == 0)
121 { 121 {
122 // Not sure why this can even happen but it is happenining????
123 return word; 122 return word;
124 } 123 }
125 124