From 9e89002477d1358de9be9cabdc1edba26bd32836 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Mon, 4 Jan 2016 23:16:17 -0500 Subject: Rewrote quite a bit of kgramstats The algorithm still treats most tokens literally, but now groups together tokens that terminate a clause somehow (so, contain .?!,), without distinguishing between the different terminating characters. For each word that can terminate a sentence, the algorithm creates a histogram of the terminating characters and number of occurrences of those characters for that word (number of occurrences is to allow things like um???? and um,,,,, to still be folded down into um.). Then, when the terminating version of that token is invoked, a random terminating string is added to that token based on the histogram for that word (again, to allow things like the desu-ly use of multiple commas to end clauses). The algorithm now also has a slightly advanced kgram structure; a special "sentence wildcard" kgram value is set aside from normal strings of tokens that can match any terminating token. This kgram value is never printed (it is only ever present in the query kgrams and cannot actually be present in the histograms (it is of a different datatype)) and is used at the beginning of sentence generation to make sure that the first couple of words generated actually form the beginning of a sentence instead of picking up somewhere in the middle of a sentence. It is also used to reset sentence generation on the rare occasion that the end of the corpus is reached. 
--- ebooks.cpp | 316 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 155 insertions(+), 161 deletions(-) (limited to 'ebooks.cpp') diff --git a/ebooks.cpp b/ebooks.cpp index 27591f4..a24bd8d 100644 --- a/ebooks.cpp +++ b/ebooks.cpp @@ -14,174 +14,168 @@ int main(int argc, char** args) { - srand(time(NULL)); + srand(time(NULL)); - YAML::Node config = YAML::LoadFile("config.yml"); - int delay = config["delay"].as(); - - std::ifstream infile(config["corpus"].as().c_str()); - std::string corpus; - std::string line; - while (getline(infile, line)) - { - corpus += line + "\n "; - } - - std::cout << "Preprocessing corpus..." << std::endl; - kgramstats* stats = new kgramstats(corpus, 4); + YAML::Node config = YAML::LoadFile("config.yml"); + int delay = config["delay"].as(); + + std::ifstream infile(config["corpus"].as().c_str()); + std::string corpus; + std::string line; + while (getline(infile, line)) + { + corpus += line + "\n "; + } + + std::cout << "Preprocessing corpus..." << std::endl; + kgramstats* stats = new kgramstats(corpus, 4); - std::cout << "Preprocessing freevars..." << std::endl; - freevars* vars = new freevars(); - vars->addVar("name", "names.txt"); - vars->addVar("noun", "nouns.txt"); - - std::cout << "Generating..." << std::endl; - for (;;) - { - std::vector doc = stats->randomSentence(rand() % 45 + 5); - std::string hi; - for (std::vector::iterator it = doc.begin(); it != doc.end(); ++it) - { - hi += vars->parse(*it) + " "; - } - - size_t firstperiod = hi.find_first_of(".!?"); - if (firstperiod != std::string::npos) + std::cout << "Preprocessing freevars..." << std::endl; + freevars* vars = new freevars(); + vars->addVar("name", "names.txt"); + vars->addVar("noun", "nouns.txt"); + + std::cout << "Generating..." 
<< std::endl; + for (;;) + { + std::vector doc = stats->randomSentence(rand() % 45 + 5); + std::string hi; + for (std::vector::iterator it = doc.begin(); it != doc.end(); ++it) { - hi = hi.substr(firstperiod+2); + hi += vars->parse(*it) + " "; } - + hi.resize(140); - size_t lastperiod = hi.find_last_of(".!?"); - if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) - { - hi = hi.substr(0, lastperiod+1); - } - - twitCurl twitterObj; - std::string tmpStr, tmpStr2; - std::string replyMsg; - char tmpBuf[1024]; - std::string username(config["username"].as()); - std::string password(config["password"].as()); - - /* Set twitter username and password */ - twitterObj.setTwitterUsername(username); - twitterObj.setTwitterPassword(password); + size_t lastperiod = hi.find_last_of(".!?,"); + if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) + { + hi = hi.substr(0, lastperiod+1); + } + + twitCurl twitterObj; + std::string tmpStr, tmpStr2; + std::string replyMsg; + char tmpBuf[1024]; + std::string username(config["username"].as()); + std::string password(config["password"].as()); + + /* Set twitter username and password */ + twitterObj.setTwitterUsername(username); + twitterObj.setTwitterPassword(password); - /* OAuth flow begins */ - /* Step 0: Set OAuth related params. 
These are got by registering your app at twitter.com */ - twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as() ); - twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as() ); - - /* Step 1: Check if we alredy have OAuth access token from a previous run */ - std::string myOAuthAccessTokenKey(""); - std::string myOAuthAccessTokenSecret(""); - std::ifstream oAuthTokenKeyIn; - std::ifstream oAuthTokenSecretIn; - - oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); - oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); - - memset( tmpBuf, 0, 1024 ); - oAuthTokenKeyIn >> tmpBuf; - myOAuthAccessTokenKey = tmpBuf; - - memset( tmpBuf, 0, 1024 ); - oAuthTokenSecretIn >> tmpBuf; - myOAuthAccessTokenSecret = tmpBuf; - - oAuthTokenKeyIn.close(); - oAuthTokenSecretIn.close(); - - if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) - { - /* If we already have these keys, then no need to go through auth again */ - printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); - - twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); - twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); - } - else - { - /* Step 2: Get request token key and secret */ - std::string authUrl; - twitterObj.oAuthRequestToken( authUrl ); - - /* Step 3: Get PIN */ - memset( tmpBuf, 0, 1024 ); - printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); - gets( tmpBuf ); - tmpStr = tmpBuf; - if( std::string::npos != tmpStr.find( "1" ) ) - { - /* Ask user to visit twitter.com auth page and get PIN */ - memset( tmpBuf, 0, 1024 ); - printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); - printf( "\nEnter the PIN provided by twitter: " ); - gets( tmpBuf ); - tmpStr = tmpBuf; - twitterObj.getOAuth().setOAuthPin( tmpStr ); - } - else - { - /* Else, pass auth url to twitCurl and get it via twitCurl PIN 
handling */ - twitterObj.oAuthHandlePIN( authUrl ); - } - - /* Step 4: Exchange request token with access token */ - twitterObj.oAuthAccessToken(); - - /* Step 5: Now, save this access token key and secret for future use without PIN */ - twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); - twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); - - /* Step 6: Save these keys in a file or wherever */ - std::ofstream oAuthTokenKeyOut; - std::ofstream oAuthTokenSecretOut; - - oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); - oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); - - oAuthTokenKeyOut.clear(); - oAuthTokenSecretOut.clear(); - - oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); - oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); - - oAuthTokenKeyOut.close(); - oAuthTokenSecretOut.close(); - } - /* OAuth flow ends */ - - /* Account credentials verification */ - if( twitterObj.accountVerifyCredGet() ) - { - twitterObj.getLastWebResponse( replyMsg ); - printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); - } - else - { - twitterObj.getLastCurlError( replyMsg ); - printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); - } + /* OAuth flow begins */ + /* Step 0: Set OAuth related params. 
These are got by registering your app at twitter.com */ + twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as() ); + twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as() ); + + /* Step 1: Check if we alredy have OAuth access token from a previous run */ + std::string myOAuthAccessTokenKey(""); + std::string myOAuthAccessTokenSecret(""); + std::ifstream oAuthTokenKeyIn; + std::ifstream oAuthTokenSecretIn; + + oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); + oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); + + memset( tmpBuf, 0, 1024 ); + oAuthTokenKeyIn >> tmpBuf; + myOAuthAccessTokenKey = tmpBuf; + + memset( tmpBuf, 0, 1024 ); + oAuthTokenSecretIn >> tmpBuf; + myOAuthAccessTokenSecret = tmpBuf; + + oAuthTokenKeyIn.close(); + oAuthTokenSecretIn.close(); + + if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) + { + /* If we already have these keys, then no need to go through auth again */ + printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); + + twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); + twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); + } + else + { + /* Step 2: Get request token key and secret */ + std::string authUrl; + twitterObj.oAuthRequestToken( authUrl ); + + /* Step 3: Get PIN */ + memset( tmpBuf, 0, 1024 ); + printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); + gets( tmpBuf ); + tmpStr = tmpBuf; + if( std::string::npos != tmpStr.find( "1" ) ) + { + /* Ask user to visit twitter.com auth page and get PIN */ + memset( tmpBuf, 0, 1024 ); + printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); + printf( "\nEnter the PIN provided by twitter: " ); + gets( tmpBuf ); + tmpStr = tmpBuf; + twitterObj.getOAuth().setOAuthPin( tmpStr ); + } + else + { + /* Else, pass auth url to twitCurl and get it via twitCurl PIN 
handling */ + twitterObj.oAuthHandlePIN( authUrl ); + } + + /* Step 4: Exchange request token with access token */ + twitterObj.oAuthAccessToken(); + + /* Step 5: Now, save this access token key and secret for future use without PIN */ + twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); + twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); + + /* Step 6: Save these keys in a file or wherever */ + std::ofstream oAuthTokenKeyOut; + std::ofstream oAuthTokenSecretOut; + + oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); + oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); + + oAuthTokenKeyOut.clear(); + oAuthTokenSecretOut.clear(); + + oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); + oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); + + oAuthTokenKeyOut.close(); + oAuthTokenSecretOut.close(); + } + /* OAuth flow ends */ + + /* Account credentials verification */ + if( twitterObj.accountVerifyCredGet() ) + { + twitterObj.getLastWebResponse( replyMsg ); + printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); + } + else + { + twitterObj.getLastCurlError( replyMsg ); + printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); + } - /* Post a new status message */ - replyMsg = ""; - if( twitterObj.statusUpdate( hi ) ) - { - twitterObj.getLastWebResponse( replyMsg ); - printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); - } - else - { - twitterObj.getLastCurlError( replyMsg ); - printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); - } - - sleep(rand() % delay); - } + /* Post a new status message */ + replyMsg = ""; + if( twitterObj.statusUpdate( hi ) ) + { + twitterObj.getLastWebResponse( replyMsg ); + printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); + } + else + { + twitterObj.getLastCurlError( replyMsg ); + printf( 
"\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); + } + + sleep(rand() % delay); + } - return 0; + return 0; } -- cgit 1.4.1