From 8d28a8e13dbe602783a505adb1df375b0d65efe0 Mon Sep 17 00:00:00 2001 From: Feffernoose Date: Sun, 6 Oct 2013 19:51:45 -0400 Subject: Split rawr-ebooks and rawr-gen Also wrote README --- Makefile.am | 11 ++-- README.md | 40 ++++++++++++-- ebooks.cpp | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ gen.cpp | 48 +++++++++++++++++ main.cpp | 167 ----------------------------------------------------------- 5 files changed, 261 insertions(+), 174 deletions(-) create mode 100644 ebooks.cpp create mode 100644 gen.cpp delete mode 100644 main.cpp diff --git a/Makefile.am b/Makefile.am index c5b52ce..c9f61cf 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,7 +1,10 @@ AUTOMAKE_OPTIONS = subdir-objects ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -bin_PROGRAMS = rawr-ebooks -rawr_ebooks_SOURCES = main.cpp kgramstats.cpp -AM_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) -rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file +bin_PROGRAMS = rawr-ebooks rawr-gen +rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp +rawr_gen_SOURCES = gen.cpp kgramstats.cpp +rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) +AM_CPPFLAGS = $(YAML_CFLAGS) +rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) +rawr_gen_LDADD = $(YAML_LIBS) \ No newline at end of file diff --git a/README.md b/README.md index 1462a9c..e01eb45 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,38 @@ -rawr-ebooks -=========== +# rawr-ebooks -you know +*I suddenly found it very hilarious.* --[@Rawr_Ebooks](https://twitter.com/Rawr_Ebooks/status/385131476141879296) + +rawr-ebooks is a very good example of taking things too far. One of the assignments in the algorithms course I took was to implement an algorithm in SML that would generate nonsense statistically similar to an input corpus (basically, a plain text file with words and sentences in it). 
Of course, the actual point of the assignment was more focused on finding an algorithm that would do this in certain required cost bounds, but after the assignment ended, I decided that the project was too fun to let go and, combined with the recent revelation that [@Horse_Ebooks](https://twitter.com/Horse_Ebooks) was actually not a bot as widely believed, decided to augment my algorithm with the ability to post to Twitter. + +rawr-ebooks actually consists of two programs: `rawr-ebooks`, which generates nonsense and posts it to a Twitter account every hour, and `rawr-gen`, which generates nonsense on command. `rawr-gen` is probably more useful for the casual, well, anybody. + +Here is how one would go about compiling `rawr-gen`: + +1. Clone rawr-ebooks onto your computer. + +
git clone http://github.com/hatkirby/rawr-ebooks
+ +2. Use autoconf and automake to generate the configure script + +
autoreconf --install --force
+ +3. Run configure + +
./configure
+ +4. Make + +
make rawr-gen
+ +5. Rename `config-example.yml` to `config.yml` and within it, replace `corpus.txt` with the path to your input corpus +6. Run `rawr-gen` + +
./rawr-gen
+ +## Implementation details + +I ended up rewriting the algorithm in C++ as the SML implementation did not handle randomization very well and would have been very difficult to adjust to post to Twitter. The new version has many changes that improve the quality of the generated output, and the input corpus that I use for @Rawr_Ebooks is growing every day. As of October 6th, 2013, it is about 200,000 words long. + +rawr-ebooks uses [yaml-cpp](https://code.google.com/p/yaml-cpp/) to read configuration data from a file (mainly, where the input corpus is located, and the information used to connect to Twitter), and [twitcurl](https://code.google.com/p/twitcurl/) to post to Twitter. + +The program is roughly divided into two stages: a preprocessing stage and a generation stage. The preprocessing stage runs once at the beginning of the program's run and generates information to ease the generation of output. This stage runs in O(t^2) time, where t is the number of tokens in the input corpus. As you can probably tell, the preprocessing stage can take a fair bit of time to run sometimes. The generation stage actually generates the output and can occur multiple times per program run (in fact it should, otherwise you aren't making good use of the time spent during the preprocessing stage!). It runs in O(n log t) time, where t is the number of tokens in the input corpus, and n is the number of words to generate, which is usually between 5 and 50. As you can see, the generation stage runs far, far more quickly than the preprocessing stage. 
\ No newline at end of file diff --git a/ebooks.cpp b/ebooks.cpp new file mode 100644 index 0000000..ed660a9 --- /dev/null +++ b/ebooks.cpp @@ -0,0 +1,169 @@ +#include +#include +#include +#include "kgramstats.h" +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace::std; + +int main(int argc, char** args) +{ + srand(time(NULL)); + + YAML::Node config = YAML::LoadFile("config.yml"); + ifstream infile(config["corpus"].as().c_str()); + string corpus; + string line; + while (getline(infile, line)) + { + corpus += " " + line; + } + + cout << "Preprocessing corpus..." << endl; + kgramstats* stats = new kgramstats(corpus, 5); + + cout << "Generating..." << endl; + for (;;) + { + vector doc = stats->randomSentence(rand() % 25 + 5); + string hi; + for (vector::iterator it = doc.begin(); it != doc.end(); ++it) + { + hi += *it + " "; + } + + hi = hi.substr(0,140); + + twitCurl twitterObj; + std::string tmpStr, tmpStr2; + std::string replyMsg; + char tmpBuf[1024]; + std::string username(config["username"].as()); + std::string password(config["password"].as()); + + /* Set twitter username and password */ + twitterObj.setTwitterUsername(username); + twitterObj.setTwitterPassword(password); + + /* OAuth flow begins */ + /* Step 0: Set OAuth related params. 
These are got by registering your app at twitter.com */ + twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as() ); + twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as() ); + + /* Step 1: Check if we alredy have OAuth access token from a previous run */ + std::string myOAuthAccessTokenKey(""); + std::string myOAuthAccessTokenSecret(""); + std::ifstream oAuthTokenKeyIn; + std::ifstream oAuthTokenSecretIn; + + oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); + oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); + + memset( tmpBuf, 0, 1024 ); + oAuthTokenKeyIn >> tmpBuf; + myOAuthAccessTokenKey = tmpBuf; + + memset( tmpBuf, 0, 1024 ); + oAuthTokenSecretIn >> tmpBuf; + myOAuthAccessTokenSecret = tmpBuf; + + oAuthTokenKeyIn.close(); + oAuthTokenSecretIn.close(); + + if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) + { + /* If we already have these keys, then no need to go through auth again */ + printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); + + twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); + twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); + } + else + { + /* Step 2: Get request token key and secret */ + std::string authUrl; + twitterObj.oAuthRequestToken( authUrl ); + + /* Step 3: Get PIN */ + memset( tmpBuf, 0, 1024 ); + printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); + gets( tmpBuf ); + tmpStr = tmpBuf; + if( std::string::npos != tmpStr.find( "1" ) ) + { + /* Ask user to visit twitter.com auth page and get PIN */ + memset( tmpBuf, 0, 1024 ); + printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); + printf( "\nEnter the PIN provided by twitter: " ); + gets( tmpBuf ); + tmpStr = tmpBuf; + twitterObj.getOAuth().setOAuthPin( tmpStr ); + } + else + { + /* Else, pass auth url to twitCurl and get it via twitCurl PIN 
handling */ + twitterObj.oAuthHandlePIN( authUrl ); + } + + /* Step 4: Exchange request token with access token */ + twitterObj.oAuthAccessToken(); + + /* Step 5: Now, save this access token key and secret for future use without PIN */ + twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); + twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); + + /* Step 6: Save these keys in a file or wherever */ + std::ofstream oAuthTokenKeyOut; + std::ofstream oAuthTokenSecretOut; + + oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); + oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); + + oAuthTokenKeyOut.clear(); + oAuthTokenSecretOut.clear(); + + oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); + oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); + + oAuthTokenKeyOut.close(); + oAuthTokenSecretOut.close(); + } + /* OAuth flow ends */ + + /* Account credentials verification */ + if( twitterObj.accountVerifyCredGet() ) + { + twitterObj.getLastWebResponse( replyMsg ); + printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); + } + else + { + twitterObj.getLastCurlError( replyMsg ); + printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); + } + + /* Post a new status message */ + replyMsg = ""; + if( twitterObj.statusUpdate( hi ) ) + { + twitterObj.getLastWebResponse( replyMsg ); + printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); + } + else + { + twitterObj.getLastCurlError( replyMsg ); + printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); + } + + sleep(900); + } + + return 0; +} \ No newline at end of file diff --git a/gen.cpp b/gen.cpp new file mode 100644 index 0000000..dc73e0f --- /dev/null +++ b/gen.cpp @@ -0,0 +1,48 @@ +#include +#include +#include +#include "kgramstats.h" +#include +#include +#include +#include +#include +#include +#include + +using 
namespace::std; + +int main(int argc, char** args) +{ + srand(time(NULL)); + + YAML::Node config = YAML::LoadFile("config.yml"); + + ifstream infile(config["corpus"].as().c_str()); + string corpus; + string line; + while (getline(infile, line)) + { + corpus += " " + line; + } + + cout << "Preprocessing corpus..." << endl; + kgramstats* stats = new kgramstats(corpus, 5); + + cout << "Generating..." << endl; + for (;;) + { + vector doc = stats->randomSentence(rand() % 35 + 15); + string hi; + for (vector::iterator it = doc.begin(); it != doc.end(); ++it) + { + hi += *it + " "; + } + + cout << hi << endl; + + getc(stdin); + } + + return 0; +} \ No newline at end of file diff --git a/main.cpp b/main.cpp deleted file mode 100644 index 20f1a1f..0000000 --- a/main.cpp +++ /dev/null @@ -1,167 +0,0 @@ -#include -#include -#include -#include "kgramstats.h" -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace::std; - -int main(int argc, char** args) -{ - srand(time(NULL)); - - YAML::Node config = YAML::LoadFile("config.yml"); - ifstream infile(config["corpus"].as().c_str()); - string corpus; - string line; - while (getline(infile, line)) - { - corpus += " " + line; - } - - kgramstats* stats = new kgramstats(corpus, 5); - - for (;;) - { - vector doc = stats->randomSentence(rand() % 25 + 5); - string hi; - for (vector::iterator it = doc.begin(); it != doc.end(); ++it) - { - hi += *it + " "; - } - - hi = hi.substr(0,140); - - twitCurl twitterObj; - std::string tmpStr, tmpStr2; - std::string replyMsg; - char tmpBuf[1024]; - std::string username(config["username"].as()); - std::string password(config["password"].as()); - - /* Set twitter username and password */ - twitterObj.setTwitterUsername(username); - twitterObj.setTwitterPassword(password); - - /* OAuth flow begins */ - /* Step 0: Set OAuth related params. 
These are got by registering your app at twitter.com */ - twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as() ); - twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as() ); - - /* Step 1: Check if we alredy have OAuth access token from a previous run */ - std::string myOAuthAccessTokenKey(""); - std::string myOAuthAccessTokenSecret(""); - std::ifstream oAuthTokenKeyIn; - std::ifstream oAuthTokenSecretIn; - - oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); - oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); - - memset( tmpBuf, 0, 1024 ); - oAuthTokenKeyIn >> tmpBuf; - myOAuthAccessTokenKey = tmpBuf; - - memset( tmpBuf, 0, 1024 ); - oAuthTokenSecretIn >> tmpBuf; - myOAuthAccessTokenSecret = tmpBuf; - - oAuthTokenKeyIn.close(); - oAuthTokenSecretIn.close(); - - if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) - { - /* If we already have these keys, then no need to go through auth again */ - printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); - - twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); - twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); - } - else - { - /* Step 2: Get request token key and secret */ - std::string authUrl; - twitterObj.oAuthRequestToken( authUrl ); - - /* Step 3: Get PIN */ - memset( tmpBuf, 0, 1024 ); - printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); - gets( tmpBuf ); - tmpStr = tmpBuf; - if( std::string::npos != tmpStr.find( "1" ) ) - { - /* Ask user to visit twitter.com auth page and get PIN */ - memset( tmpBuf, 0, 1024 ); - printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); - printf( "\nEnter the PIN provided by twitter: " ); - gets( tmpBuf ); - tmpStr = tmpBuf; - twitterObj.getOAuth().setOAuthPin( tmpStr ); - } - else - { - /* Else, pass auth url to twitCurl and get it via twitCurl PIN 
handling */ - twitterObj.oAuthHandlePIN( authUrl ); - } - - /* Step 4: Exchange request token with access token */ - twitterObj.oAuthAccessToken(); - - /* Step 5: Now, save this access token key and secret for future use without PIN */ - twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); - twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); - - /* Step 6: Save these keys in a file or wherever */ - std::ofstream oAuthTokenKeyOut; - std::ofstream oAuthTokenSecretOut; - - oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); - oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); - - oAuthTokenKeyOut.clear(); - oAuthTokenSecretOut.clear(); - - oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); - oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); - - oAuthTokenKeyOut.close(); - oAuthTokenSecretOut.close(); - } - /* OAuth flow ends */ - - /* Account credentials verification */ - if( twitterObj.accountVerifyCredGet() ) - { - twitterObj.getLastWebResponse( replyMsg ); - printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); - } - else - { - twitterObj.getLastCurlError( replyMsg ); - printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); - } - - /* Post a new status message */ - replyMsg = ""; - if( twitterObj.statusUpdate( hi ) ) - { - twitterObj.getLastWebResponse( replyMsg ); - printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); - } - else - { - twitterObj.getLastCurlError( replyMsg ); - printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); - } - - sleep(900); - } - - return 0; -} \ No newline at end of file -- cgit 1.4.1