about summary refs log tree commit diff stats
path: root/ebooks.cpp
diff options
context:
space:
mode:
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-04 23:16:17 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-04 23:16:17 -0500
commit: 9e89002477d1358de9be9cabdc1edba26bd32836 (patch)
tree: 9afb52740fe4f618105d014a816df26b36ed83f6 /ebooks.cpp
parent: 0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff)
download: rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz
rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2
rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip
Rewrote quite a bit of kgramstats
The algorithm still treats most tokens literally, but now groups together tokens that terminate a clause in some way (i.e., tokens that contain any of the characters . ? ! ,), without distinguishing between the different terminating characters. For each word that can terminate a sentence, the algorithm creates a histogram of the terminating characters and the number of occurrences of those characters for that word (the number of occurrences is tracked so that things like um???? and um,,,,, can still be folded down into um.). Then, when the terminating version of that token is invoked, a random terminating string is appended to that token based on the histogram for that word (again, to allow things like the desu-ly use of multiple commas to end clauses).

The algorithm now also has a slightly more advanced kgram structure: a special "sentence wildcard" kgram value, distinct from normal strings of tokens, is set aside that can match any terminating token. This kgram value is never printed (it is only ever present in the query kgrams and, being of a different datatype, cannot actually be present in the histograms). It is used at the beginning of sentence generation to make sure that the first couple of words generated actually form the beginning of a sentence, instead of picking up somewhere in the middle of one. It is also used to reset sentence generation on the rare occasion that the end of the corpus is reached.
Diffstat (limited to 'ebooks.cpp')
-rw-r--r-- ebooks.cpp 316
1 files changed, 155 insertions, 161 deletions
diff --git a/ebooks.cpp b/ebooks.cpp
index 27591f4..a24bd8d 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp
@@ -14,174 +14,168 @@
14 14
15int main(int argc, char** args) 15int main(int argc, char** args)
16{ 16{
17 srand(time(NULL)); 17 srand(time(NULL));
18 18
19 YAML::Node config = YAML::LoadFile("config.yml"); 19 YAML::Node config = YAML::LoadFile("config.yml");
20 int delay = config["delay"].as<int>(); 20 int delay = config["delay"].as<int>();
21 21
22 std::ifstream infile(config["corpus"].as<std::string>().c_str()); 22 std::ifstream infile(config["corpus"].as<std::string>().c_str());
23 std::string corpus; 23 std::string corpus;
24 std::string line; 24 std::string line;
25 while (getline(infile, line)) 25 while (getline(infile, line))
26 { 26 {
27 corpus += line + "\n "; 27 corpus += line + "\n ";
28 } 28 }
29 29
30 std::cout << "Preprocessing corpus..." << std::endl; 30 std::cout << "Preprocessing corpus..." << std::endl;
31 kgramstats* stats = new kgramstats(corpus, 4); 31 kgramstats* stats = new kgramstats(corpus, 4);
32 32
33 std::cout << "Preprocessing freevars..." << std::endl; 33 std::cout << "Preprocessing freevars..." << std::endl;
34 freevars* vars = new freevars(); 34 freevars* vars = new freevars();
35 vars->addVar("name", "names.txt"); 35 vars->addVar("name", "names.txt");
36 vars->addVar("noun", "nouns.txt"); 36 vars->addVar("noun", "nouns.txt");
37 37
38 std::cout << "Generating..." << std::endl; 38 std::cout << "Generating..." << std::endl;
39 for (;;) 39 for (;;)
40 { 40 {
41 std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); 41 std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5);
42 std::string hi; 42 std::string hi;
43 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) 43 for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
44 {
45 hi += vars->parse(*it) + " ";
46 }
47
48 size_t firstperiod = hi.find_first_of(".!?");
49 if (firstperiod != std::string::npos)
50 { 44 {
51 hi = hi.substr(firstperiod+2); 45 hi += vars->parse(*it) + " ";
52 } 46 }
53 47
54 hi.resize(140); 48 hi.resize(140);
55 49
56 size_t lastperiod = hi.find_last_of(".!?"); 50 size_t lastperiod = hi.find_last_of(".!?,");
57 if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) 51 if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
58 { 52 {
59 hi = hi.substr(0, lastperiod+1); 53 hi = hi.substr(0, lastperiod+1);
60 } 54 }
61 55
62 twitCurl twitterObj; 56 twitCurl twitterObj;
63 std::string tmpStr, tmpStr2; 57 std::string tmpStr, tmpStr2;
64 std::string replyMsg; 58 std::string replyMsg;
65 char tmpBuf[1024]; 59 char tmpBuf[1024];
66 std::string username(config["username"].as<std::string>()); 60 std::string username(config["username"].as<std::string>());
67 std::string password(config["password"].as<std::string>()); 61 std::string password(config["password"].as<std::string>());
68 62
69 /* Set twitter username and password */ 63 /* Set twitter username and password */
70 twitterObj.setTwitterUsername(username); 64 twitterObj.setTwitterUsername(username);
71 twitterObj.setTwitterPassword(password); 65 twitterObj.setTwitterPassword(password);
72 66
73 /* OAuth flow begins */ 67 /* OAuth flow begins */
74 /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */ 68 /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */
75 twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() ); 69 twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() );
76 twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() ); 70 twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() );
77 71
78 /* Step 1: Check if we alredy have OAuth access token from a previous run */ 72 /* Step 1: Check if we alredy have OAuth access token from a previous run */
79 std::string myOAuthAccessTokenKey(""); 73 std::string myOAuthAccessTokenKey("");
80 std::string myOAuthAccessTokenSecret(""); 74 std::string myOAuthAccessTokenSecret("");
81 std::ifstream oAuthTokenKeyIn; 75 std::ifstream oAuthTokenKeyIn;
82 std::ifstream oAuthTokenSecretIn; 76 std::ifstream oAuthTokenSecretIn;
83 77
84 oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); 78 oAuthTokenKeyIn.open( "twitterClient_token_key.txt" );
85 oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); 79 oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" );
86 80
87 memset( tmpBuf, 0, 1024 ); 81 memset( tmpBuf, 0, 1024 );
88 oAuthTokenKeyIn >> tmpBuf; 82 oAuthTokenKeyIn >> tmpBuf;
89 myOAuthAccessTokenKey = tmpBuf; 83 myOAuthAccessTokenKey = tmpBuf;
90 84
91 memset( tmpBuf, 0, 1024 ); 85 memset( tmpBuf, 0, 1024 );
92 oAuthTokenSecretIn >> tmpBuf; 86 oAuthTokenSecretIn >> tmpBuf;
93 myOAuthAccessTokenSecret = tmpBuf; 87 myOAuthAccessTokenSecret = tmpBuf;
94 88
95 oAuthTokenKeyIn.close(); 89 oAuthTokenKeyIn.close();
96 oAuthTokenSecretIn.close(); 90 oAuthTokenSecretIn.close();
97 91
98 if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) 92 if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() )
99 { 93 {
100 /* If we already have these keys, then no need to go through auth again */ 94 /* If we already have these keys, then no need to go through auth again */
101 printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); 95 printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() );
102 96
103 twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); 97 twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey );
104 twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); 98 twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret );
105 } 99 }
106 else 100 else
107 { 101 {
108 /* Step 2: Get request token key and secret */ 102 /* Step 2: Get request token key and secret */
109 std::string authUrl; 103 std::string authUrl;
110 twitterObj.oAuthRequestToken( authUrl ); 104 twitterObj.oAuthRequestToken( authUrl );
111 105
112 /* Step 3: Get PIN */ 106 /* Step 3: Get PIN */
113 memset( tmpBuf, 0, 1024 ); 107 memset( tmpBuf, 0, 1024 );
114 printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); 108 printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " );
115 gets( tmpBuf ); 109 gets( tmpBuf );
116 tmpStr = tmpBuf; 110 tmpStr = tmpBuf;
117 if( std::string::npos != tmpStr.find( "1" ) ) 111 if( std::string::npos != tmpStr.find( "1" ) )
118 { 112 {
119 /* Ask user to visit twitter.com auth page and get PIN */ 113 /* Ask user to visit twitter.com auth page and get PIN */
120 memset( tmpBuf, 0, 1024 ); 114 memset( tmpBuf, 0, 1024 );
121 printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); 115 printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() );
122 printf( "\nEnter the PIN provided by twitter: " ); 116 printf( "\nEnter the PIN provided by twitter: " );
123 gets( tmpBuf ); 117 gets( tmpBuf );
124 tmpStr = tmpBuf; 118 tmpStr = tmpBuf;
125 twitterObj.getOAuth().setOAuthPin( tmpStr ); 119 twitterObj.getOAuth().setOAuthPin( tmpStr );
126 } 120 }
127 else 121 else
128 { 122 {
129 /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */ 123 /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */
130 twitterObj.oAuthHandlePIN( authUrl ); 124 twitterObj.oAuthHandlePIN( authUrl );
131 } 125 }
132 126
133 /* Step 4: Exchange request token with access token */ 127 /* Step 4: Exchange request token with access token */
134 twitterObj.oAuthAccessToken(); 128 twitterObj.oAuthAccessToken();
135 129
136 /* Step 5: Now, save this access token key and secret for future use without PIN */ 130 /* Step 5: Now, save this access token key and secret for future use without PIN */
137 twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); 131 twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey );
138 twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); 132 twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret );
139 133
140 /* Step 6: Save these keys in a file or wherever */ 134 /* Step 6: Save these keys in a file or wherever */
141 std::ofstream oAuthTokenKeyOut; 135 std::ofstream oAuthTokenKeyOut;
142 std::ofstream oAuthTokenSecretOut; 136 std::ofstream oAuthTokenSecretOut;
143 137
144 oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); 138 oAuthTokenKeyOut.open( "twitterClient_token_key.txt" );
145 oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); 139 oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" );
146 140
147 oAuthTokenKeyOut.clear(); 141 oAuthTokenKeyOut.clear();
148 oAuthTokenSecretOut.clear(); 142 oAuthTokenSecretOut.clear();
149 143
150 oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); 144 oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str();
151 oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); 145 oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str();
152 146
153 oAuthTokenKeyOut.close(); 147 oAuthTokenKeyOut.close();
154 oAuthTokenSecretOut.close(); 148 oAuthTokenSecretOut.close();
155 } 149 }
156 /* OAuth flow ends */ 150 /* OAuth flow ends */
157 151
158 /* Account credentials verification */ 152 /* Account credentials verification */
159 if( twitterObj.accountVerifyCredGet() ) 153 if( twitterObj.accountVerifyCredGet() )
160 { 154 {
161 twitterObj.getLastWebResponse( replyMsg ); 155 twitterObj.getLastWebResponse( replyMsg );
162 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); 156 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() );
163 } 157 }
164 else 158 else
165 { 159 {
166 twitterObj.getLastCurlError( replyMsg ); 160 twitterObj.getLastCurlError( replyMsg );
167 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); 161 printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() );
168 } 162 }
169 163
170 /* Post a new status message */ 164 /* Post a new status message */
171 replyMsg = ""; 165 replyMsg = "";
172 if( twitterObj.statusUpdate( hi ) ) 166 if( twitterObj.statusUpdate( hi ) )
173 { 167 {
174 twitterObj.getLastWebResponse( replyMsg ); 168 twitterObj.getLastWebResponse( replyMsg );
175 printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); 169 printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() );
176 } 170 }
177 else 171 else
178 { 172 {
179 twitterObj.getLastCurlError( replyMsg ); 173 twitterObj.getLastCurlError( replyMsg );
180 printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); 174 printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() );
181 } 175 }
182 176
183 sleep(rand() % delay); 177 sleep(rand() % delay);
184 } 178 }
185 179
186 return 0; 180 return 0;
187} 181}