diff options
| -rw-r--r-- | ebooks.cpp | 316 | ||||
| -rw-r--r-- | gen.cpp | 98 | ||||
| -rw-r--r-- | kgramstats.cpp | 442 | ||||
| -rw-r--r-- | kgramstats.h | 84 | ||||
| -rw-r--r-- | malaprop.cpp | 5 |
5 files changed, 444 insertions, 501 deletions
| diff --git a/ebooks.cpp b/ebooks.cpp index 27591f4..a24bd8d 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
| @@ -14,174 +14,168 @@ | |||
| 14 | 14 | ||
| 15 | int main(int argc, char** args) | 15 | int main(int argc, char** args) |
| 16 | { | 16 | { |
| 17 | srand(time(NULL)); | 17 | srand(time(NULL)); |
| 18 | 18 | ||
| 19 | YAML::Node config = YAML::LoadFile("config.yml"); | 19 | YAML::Node config = YAML::LoadFile("config.yml"); |
| 20 | int delay = config["delay"].as<int>(); | 20 | int delay = config["delay"].as<int>(); |
| 21 | 21 | ||
| 22 | std::ifstream infile(config["corpus"].as<std::string>().c_str()); | 22 | std::ifstream infile(config["corpus"].as<std::string>().c_str()); |
| 23 | std::string corpus; | 23 | std::string corpus; |
| 24 | std::string line; | 24 | std::string line; |
| 25 | while (getline(infile, line)) | 25 | while (getline(infile, line)) |
| 26 | { | 26 | { |
| 27 | corpus += line + "\n "; | 27 | corpus += line + "\n "; |
| 28 | } | 28 | } |
| 29 | 29 | ||
| 30 | std::cout << "Preprocessing corpus..." << std::endl; | 30 | std::cout << "Preprocessing corpus..." << std::endl; |
| 31 | kgramstats* stats = new kgramstats(corpus, 4); | 31 | kgramstats* stats = new kgramstats(corpus, 4); |
| 32 | 32 | ||
| 33 | std::cout << "Preprocessing freevars..." << std::endl; | 33 | std::cout << "Preprocessing freevars..." << std::endl; |
| 34 | freevars* vars = new freevars(); | 34 | freevars* vars = new freevars(); |
| 35 | vars->addVar("name", "names.txt"); | 35 | vars->addVar("name", "names.txt"); |
| 36 | vars->addVar("noun", "nouns.txt"); | 36 | vars->addVar("noun", "nouns.txt"); |
| 37 | 37 | ||
| 38 | std::cout << "Generating..." << std::endl; | 38 | std::cout << "Generating..." << std::endl; |
| 39 | for (;;) | 39 | for (;;) |
| 40 | { | 40 | { |
| 41 | std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); | 41 | std::vector<std::string> doc = stats->randomSentence(rand() % 45 + 5); |
| 42 | std::string hi; | 42 | std::string hi; |
| 43 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | 43 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) |
| 44 | { | ||
| 45 | hi += vars->parse(*it) + " "; | ||
| 46 | } | ||
| 47 | |||
| 48 | size_t firstperiod = hi.find_first_of(".!?"); | ||
| 49 | if (firstperiod != std::string::npos) | ||
| 50 | { | 44 | { |
| 51 | hi = hi.substr(firstperiod+2); | 45 | hi += vars->parse(*it) + " "; |
| 52 | } | 46 | } |
| 53 | 47 | ||
| 54 | hi.resize(140); | 48 | hi.resize(140); |
| 55 | 49 | ||
| 56 | size_t lastperiod = hi.find_last_of(".!?"); | 50 | size_t lastperiod = hi.find_last_of(".!?,"); |
| 57 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | 51 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) |
| 58 | { | 52 | { |
| 59 | hi = hi.substr(0, lastperiod+1); | 53 | hi = hi.substr(0, lastperiod+1); |
| 60 | } | 54 | } |
| 61 | 55 | ||
| 62 | twitCurl twitterObj; | 56 | twitCurl twitterObj; |
| 63 | std::string tmpStr, tmpStr2; | 57 | std::string tmpStr, tmpStr2; |
| 64 | std::string replyMsg; | 58 | std::string replyMsg; |
| 65 | char tmpBuf[1024]; | 59 | char tmpBuf[1024]; |
| 66 | std::string username(config["username"].as<std::string>()); | 60 | std::string username(config["username"].as<std::string>()); |
| 67 | std::string password(config["password"].as<std::string>()); | 61 | std::string password(config["password"].as<std::string>()); |
| 68 | 62 | ||
| 69 | /* Set twitter username and password */ | 63 | /* Set twitter username and password */ |
| 70 | twitterObj.setTwitterUsername(username); | 64 | twitterObj.setTwitterUsername(username); |
| 71 | twitterObj.setTwitterPassword(password); | 65 | twitterObj.setTwitterPassword(password); |
| 72 | 66 | ||
| 73 | /* OAuth flow begins */ | 67 | /* OAuth flow begins */ |
| 74 | /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */ | 68 | /* Step 0: Set OAuth related params. These are got by registering your app at twitter.com */ |
| 75 | twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() ); | 69 | twitterObj.getOAuth().setConsumerKey( config["consumer_key"].as<std::string>() ); |
| 76 | twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() ); | 70 | twitterObj.getOAuth().setConsumerSecret( config["consumer_secret"].as<std::string>() ); |
| 77 | 71 | ||
| 78 | /* Step 1: Check if we alredy have OAuth access token from a previous run */ | 72 | /* Step 1: Check if we alredy have OAuth access token from a previous run */ |
| 79 | std::string myOAuthAccessTokenKey(""); | 73 | std::string myOAuthAccessTokenKey(""); |
| 80 | std::string myOAuthAccessTokenSecret(""); | 74 | std::string myOAuthAccessTokenSecret(""); |
| 81 | std::ifstream oAuthTokenKeyIn; | 75 | std::ifstream oAuthTokenKeyIn; |
| 82 | std::ifstream oAuthTokenSecretIn; | 76 | std::ifstream oAuthTokenSecretIn; |
| 83 | 77 | ||
| 84 | oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); | 78 | oAuthTokenKeyIn.open( "twitterClient_token_key.txt" ); |
| 85 | oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); | 79 | oAuthTokenSecretIn.open( "twitterClient_token_secret.txt" ); |
| 86 | 80 | ||
| 87 | memset( tmpBuf, 0, 1024 ); | 81 | memset( tmpBuf, 0, 1024 ); |
| 88 | oAuthTokenKeyIn >> tmpBuf; | 82 | oAuthTokenKeyIn >> tmpBuf; |
| 89 | myOAuthAccessTokenKey = tmpBuf; | 83 | myOAuthAccessTokenKey = tmpBuf; |
| 90 | 84 | ||
| 91 | memset( tmpBuf, 0, 1024 ); | 85 | memset( tmpBuf, 0, 1024 ); |
| 92 | oAuthTokenSecretIn >> tmpBuf; | 86 | oAuthTokenSecretIn >> tmpBuf; |
| 93 | myOAuthAccessTokenSecret = tmpBuf; | 87 | myOAuthAccessTokenSecret = tmpBuf; |
| 94 | 88 | ||
| 95 | oAuthTokenKeyIn.close(); | 89 | oAuthTokenKeyIn.close(); |
| 96 | oAuthTokenSecretIn.close(); | 90 | oAuthTokenSecretIn.close(); |
| 97 | 91 | ||
| 98 | if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) | 92 | if( myOAuthAccessTokenKey.size() && myOAuthAccessTokenSecret.size() ) |
| 99 | { | 93 | { |
| 100 | /* If we already have these keys, then no need to go through auth again */ | 94 | /* If we already have these keys, then no need to go through auth again */ |
| 101 | printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); | 95 | printf( "\nUsing:\nKey: %s\nSecret: %s\n\n", myOAuthAccessTokenKey.c_str(), myOAuthAccessTokenSecret.c_str() ); |
| 102 | 96 | ||
| 103 | twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); | 97 | twitterObj.getOAuth().setOAuthTokenKey( myOAuthAccessTokenKey ); |
| 104 | twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); | 98 | twitterObj.getOAuth().setOAuthTokenSecret( myOAuthAccessTokenSecret ); |
| 105 | } | 99 | } |
| 106 | else | 100 | else |
| 107 | { | 101 | { |
| 108 | /* Step 2: Get request token key and secret */ | 102 | /* Step 2: Get request token key and secret */ |
| 109 | std::string authUrl; | 103 | std::string authUrl; |
| 110 | twitterObj.oAuthRequestToken( authUrl ); | 104 | twitterObj.oAuthRequestToken( authUrl ); |
| 111 | 105 | ||
| 112 | /* Step 3: Get PIN */ | 106 | /* Step 3: Get PIN */ |
| 113 | memset( tmpBuf, 0, 1024 ); | 107 | memset( tmpBuf, 0, 1024 ); |
| 114 | printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); | 108 | printf( "\nDo you want to visit twitter.com for PIN (0 for no; 1 for yes): " ); |
| 115 | gets( tmpBuf ); | 109 | gets( tmpBuf ); |
| 116 | tmpStr = tmpBuf; | 110 | tmpStr = tmpBuf; |
| 117 | if( std::string::npos != tmpStr.find( "1" ) ) | 111 | if( std::string::npos != tmpStr.find( "1" ) ) |
| 118 | { | 112 | { |
| 119 | /* Ask user to visit twitter.com auth page and get PIN */ | 113 | /* Ask user to visit twitter.com auth page and get PIN */ |
| 120 | memset( tmpBuf, 0, 1024 ); | 114 | memset( tmpBuf, 0, 1024 ); |
| 121 | printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); | 115 | printf( "\nPlease visit this link in web browser and authorize this application:\n%s", authUrl.c_str() ); |
| 122 | printf( "\nEnter the PIN provided by twitter: " ); | 116 | printf( "\nEnter the PIN provided by twitter: " ); |
| 123 | gets( tmpBuf ); | 117 | gets( tmpBuf ); |
| 124 | tmpStr = tmpBuf; | 118 | tmpStr = tmpBuf; |
| 125 | twitterObj.getOAuth().setOAuthPin( tmpStr ); | 119 | twitterObj.getOAuth().setOAuthPin( tmpStr ); |
| 126 | } | 120 | } |
| 127 | else | 121 | else |
| 128 | { | 122 | { |
| 129 | /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */ | 123 | /* Else, pass auth url to twitCurl and get it via twitCurl PIN handling */ |
| 130 | twitterObj.oAuthHandlePIN( authUrl ); | 124 | twitterObj.oAuthHandlePIN( authUrl ); |
| 131 | } | 125 | } |
| 132 | 126 | ||
| 133 | /* Step 4: Exchange request token with access token */ | 127 | /* Step 4: Exchange request token with access token */ |
| 134 | twitterObj.oAuthAccessToken(); | 128 | twitterObj.oAuthAccessToken(); |
| 135 | 129 | ||
| 136 | /* Step 5: Now, save this access token key and secret for future use without PIN */ | 130 | /* Step 5: Now, save this access token key and secret for future use without PIN */ |
| 137 | twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); | 131 | twitterObj.getOAuth().getOAuthTokenKey( myOAuthAccessTokenKey ); |
| 138 | twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); | 132 | twitterObj.getOAuth().getOAuthTokenSecret( myOAuthAccessTokenSecret ); |
| 139 | 133 | ||
| 140 | /* Step 6: Save these keys in a file or wherever */ | 134 | /* Step 6: Save these keys in a file or wherever */ |
| 141 | std::ofstream oAuthTokenKeyOut; | 135 | std::ofstream oAuthTokenKeyOut; |
| 142 | std::ofstream oAuthTokenSecretOut; | 136 | std::ofstream oAuthTokenSecretOut; |
| 143 | 137 | ||
| 144 | oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); | 138 | oAuthTokenKeyOut.open( "twitterClient_token_key.txt" ); |
| 145 | oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); | 139 | oAuthTokenSecretOut.open( "twitterClient_token_secret.txt" ); |
| 146 | 140 | ||
| 147 | oAuthTokenKeyOut.clear(); | 141 | oAuthTokenKeyOut.clear(); |
| 148 | oAuthTokenSecretOut.clear(); | 142 | oAuthTokenSecretOut.clear(); |
| 149 | 143 | ||
| 150 | oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); | 144 | oAuthTokenKeyOut << myOAuthAccessTokenKey.c_str(); |
| 151 | oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); | 145 | oAuthTokenSecretOut << myOAuthAccessTokenSecret.c_str(); |
| 152 | 146 | ||
| 153 | oAuthTokenKeyOut.close(); | 147 | oAuthTokenKeyOut.close(); |
| 154 | oAuthTokenSecretOut.close(); | 148 | oAuthTokenSecretOut.close(); |
| 155 | } | 149 | } |
| 156 | /* OAuth flow ends */ | 150 | /* OAuth flow ends */ |
| 157 | 151 | ||
| 158 | /* Account credentials verification */ | 152 | /* Account credentials verification */ |
| 159 | if( twitterObj.accountVerifyCredGet() ) | 153 | if( twitterObj.accountVerifyCredGet() ) |
| 160 | { | 154 | { |
| 161 | twitterObj.getLastWebResponse( replyMsg ); | 155 | twitterObj.getLastWebResponse( replyMsg ); |
| 162 | printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); | 156 | printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet web response:\n%s\n", replyMsg.c_str() ); |
| 163 | } | 157 | } |
| 164 | else | 158 | else |
| 165 | { | 159 | { |
| 166 | twitterObj.getLastCurlError( replyMsg ); | 160 | twitterObj.getLastCurlError( replyMsg ); |
| 167 | printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); | 161 | printf( "\ntwitterClient:: twitCurl::accountVerifyCredGet error:\n%s\n", replyMsg.c_str() ); |
| 168 | } | 162 | } |
| 169 | 163 | ||
| 170 | /* Post a new status message */ | 164 | /* Post a new status message */ |
| 171 | replyMsg = ""; | 165 | replyMsg = ""; |
| 172 | if( twitterObj.statusUpdate( hi ) ) | 166 | if( twitterObj.statusUpdate( hi ) ) |
| 173 | { | 167 | { |
| 174 | twitterObj.getLastWebResponse( replyMsg ); | 168 | twitterObj.getLastWebResponse( replyMsg ); |
| 175 | printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); | 169 | printf( "\ntwitterClient:: twitCurl::statusUpdate web response:\n%s\n", replyMsg.c_str() ); |
| 176 | } | 170 | } |
| 177 | else | 171 | else |
| 178 | { | 172 | { |
| 179 | twitterObj.getLastCurlError( replyMsg ); | 173 | twitterObj.getLastCurlError( replyMsg ); |
| 180 | printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); | 174 | printf( "\ntwitterClient:: twitCurl::statusUpdate error:\n%s\n", replyMsg.c_str() ); |
| 181 | } | 175 | } |
| 182 | 176 | ||
| 183 | sleep(rand() % delay); | 177 | sleep(rand() % delay); |
| 184 | } | 178 | } |
| 185 | 179 | ||
| 186 | return 0; | 180 | return 0; |
| 187 | } | 181 | } |
| diff --git a/gen.cpp b/gen.cpp index 7e47d45..400c0a5 100644 --- a/gen.cpp +++ b/gen.cpp | |||
| @@ -11,72 +11,66 @@ | |||
| 11 | 11 | ||
| 12 | int main(int argc, char** args) | 12 | int main(int argc, char** args) |
| 13 | { | 13 | { |
| 14 | srand(time(NULL)); | 14 | srand(time(NULL)); |
| 15 | 15 | ||
| 16 | if (argc == 1) | 16 | if (argc == 1) |
| 17 | { | 17 | { |
| 18 | std::cout << "rawr-gen, version 1.0" << std::endl; | 18 | std::cout << "rawr-gen, version 1.0" << std::endl; |
| 19 | std::cout << "Usage: rawr-gen corpus-file" << std::endl; | 19 | std::cout << "Usage: rawr-gen corpus-file" << std::endl; |
| 20 | std::cout << " where 'corpus-file' is the path to your input" << std::endl; | 20 | std::cout << " where 'corpus-file' is the path to your input" << std::endl; |
| 21 | 21 | ||
| 22 | return 0; | 22 | return 0; |
| 23 | } | 23 | } |
| 24 | 24 | ||
| 25 | std::ifstream infile(args[1]); | 25 | std::ifstream infile(args[1]); |
| 26 | if (!infile) | 26 | if (!infile) |
| 27 | { | 27 | { |
| 28 | std::cout << "rawr-gen, version 1.0" << std::endl; | 28 | std::cout << "rawr-gen, version 1.0" << std::endl; |
| 29 | std::cout << "Usage: rawr-gen corpus-file" << std::endl; | 29 | std::cout << "Usage: rawr-gen corpus-file" << std::endl; |
| 30 | std::cout << " where 'corpus-file' is the path to your input" << std::endl; | 30 | std::cout << " where 'corpus-file' is the path to your input" << std::endl; |
| 31 | std::cout << std::endl; | 31 | std::cout << std::endl; |
| 32 | std::cout << "The file you specified does not exist." << std::endl; | 32 | std::cout << "The file you specified does not exist." << std::endl; |
| 33 | 33 | ||
| 34 | return 0; | 34 | return 0; |
| 35 | } | 35 | } |
| 36 | 36 | ||
| 37 | std::string corpus; | 37 | std::string corpus; |
| 38 | std::string line; | 38 | std::string line; |
| 39 | while (getline(infile, line)) | 39 | while (getline(infile, line)) |
| 40 | { | 40 | { |
| 41 | corpus += line + "\n "; | 41 | corpus += line + "\n "; |
| 42 | } | 42 | } |
| 43 | 43 | ||
| 44 | std::cout << "Preprocessing corpus..." << std::endl; | 44 | std::cout << "Preprocessing corpus..." << std::endl; |
| 45 | kgramstats* stats = new kgramstats(corpus, 4); | 45 | kgramstats* stats = new kgramstats(corpus, 4); |
| 46 | 46 | ||
| 47 | std::cout << "Preprocessing freevars..." << std::endl; | 47 | std::cout << "Preprocessing freevars..." << std::endl; |
| 48 | freevars* vars = new freevars(); | 48 | freevars* vars = new freevars(); |
| 49 | vars->addVar("name", "names.txt"); | 49 | vars->addVar("name", "names.txt"); |
| 50 | vars->addVar("noun", "nouns.txt"); | 50 | vars->addVar("noun", "nouns.txt"); |
| 51 | 51 | ||
| 52 | std::cout << "Generating..." << std::endl; | 52 | std::cout << "Generating..." << std::endl; |
| 53 | for (;;) | 53 | for (;;) |
| 54 | { | 54 | { |
| 55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45); | 55 | std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15); |
| 56 | std::string hi; | 56 | std::string hi; |
| 57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) | 57 | for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it) |
| 58 | { | ||
| 59 | hi += vars->parse(*it) + " "; | ||
| 60 | } | ||
| 61 | |||
| 62 | size_t firstperiod = hi.find_first_of(".!?"); | ||
| 63 | if (firstperiod != std::string::npos) | ||
| 64 | { | 58 | { |
| 65 | hi = hi.substr(firstperiod+2); | 59 | hi += vars->parse(*it) + " "; |
| 66 | } | 60 | } |
| 67 | 61 | ||
| 68 | hi.resize(140); | 62 | hi.resize(140); |
| 69 | 63 | ||
| 70 | size_t lastperiod = hi.find_last_of(".!?"); | 64 | size_t lastperiod = hi.find_last_of(".!?,"); |
| 71 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) | 65 | if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) |
| 72 | { | 66 | { |
| 73 | hi = hi.substr(0, lastperiod+1); | 67 | hi = hi.substr(0, lastperiod+1); |
| 74 | } | 68 | } |
| 75 | 69 | ||
| 76 | std::cout << hi << std::endl; | 70 | std::cout << hi << std::endl; |
| 77 | 71 | ||
| 78 | getc(stdin); | 72 | getc(stdin); |
| 79 | } | 73 | } |
| 80 | 74 | ||
| 81 | return 0; | 75 | return 0; |
| 82 | } | 76 | } |
| diff --git a/kgramstats.cpp b/kgramstats.cpp index b0ec68a..c88d83c 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -5,237 +5,176 @@ | |||
| 5 | #include <algorithm> | 5 | #include <algorithm> |
| 6 | #include "malaprop.h" | 6 | #include "malaprop.h" |
| 7 | 7 | ||
| 8 | query wildcardQuery(querytype_sentence); | ||
| 9 | |||
| 8 | std::string canonize(std::string f); | 10 | std::string canonize(std::string f); |
| 9 | 11 | ||
| 10 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 12 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
| 11 | // We consider maxK to be fairly constant | 13 | // We consider maxK to be fairly constant |
| 12 | kgramstats::kgramstats(std::string corpus, int maxK) | 14 | kgramstats::kgramstats(std::string corpus, int maxK) |
| 13 | { | 15 | { |
| 14 | this->maxK = maxK; | 16 | this->maxK = maxK; |
| 15 | 17 | ||
| 16 | std::vector<std::string> tokens; | 18 | std::vector<std::string> tokens; |
| 17 | size_t start = 0; | 19 | size_t start = 0; |
| 18 | int end = 0; | 20 | int end = 0; |
| 19 | 21 | ||
| 20 | while (end != std::string::npos) | 22 | while (end != std::string::npos) |
| 21 | { | 23 | { |
| 22 | end = corpus.find(" ", start); | 24 | end = corpus.find(" ", start); |
| 23 | 25 | ||
| 24 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 26 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
| 25 | if (token[token.length()-1] == '\n') | 27 | if (token[token.length()-1] == '\n') |
| 26 | { | 28 | { |
| 27 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?')) | 29 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) |
| 28 | { | 30 | { |
| 29 | token.insert(token.length()-1, "."); | 31 | token.insert(token.length()-1, "."); |
| 30 | } | 32 | } |
| 31 | 33 | ||
| 32 | token.resize(token.length()-1); | 34 | token.resize(token.length()-1); |
| 33 | } | 35 | } |
| 34 | 36 | ||
| 35 | if (token.compare("") && token.compare(".")) | 37 | if (token.compare("") && token.compare(".")) |
| 36 | { | 38 | { |
| 37 | mstats.addWord(token); | 39 | mstats.addWord(token); |
| 38 | tokens.push_back(token); | 40 | tokens.push_back(token); |
| 39 | } | 41 | } |
| 40 | 42 | ||
| 41 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 43 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
| 42 | } | 44 | } |
| 43 | 45 | ||
| 44 | std::map<kgram, std::map<std::string, token_data*>* > tstats; | 46 | std::map<kgram, std::map<token, token_data> > tstats; |
| 45 | bool newSentence = true; | 47 | std::map<token, std::map<termstats, int> > tendings; |
| 46 | bool newClause = false; | 48 | for (int k=1; k<maxK; k++) |
| 47 | for (int k=0; k<maxK; k++) | 49 | { |
| 48 | { | 50 | for (int i=0; i<(tokens.size() - k); i++) |
| 49 | for (int i=0; i<(tokens.size() - k); i++) | 51 | { |
| 50 | { | 52 | std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); |
| 51 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 53 | kgram prefix; |
| 52 | std::transform(seq.begin(), seq.end(), seq.begin(), canonize); | ||
| 53 | std::string f = tokens[i+k]; | ||
| 54 | |||
| 55 | |||
| 56 | |||
| 57 | std::string canonical = canonize(f); | ||
| 58 | |||
| 59 | if (tstats[seq] == NULL) | ||
| 60 | { | ||
| 61 | tstats[seq] = new std::map<std::string, token_data*>(); | ||
| 62 | } | ||
| 63 | |||
| 64 | if ((*tstats[seq])[canonical] == NULL) | ||
| 65 | { | ||
| 66 | (*tstats[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data)); | ||
| 67 | } | ||
| 68 | |||
| 69 | token_data* td = tstats[seq]->at(canonical); | ||
| 70 | td->token = new std::string(canonical); | ||
| 71 | td->all++; | ||
| 72 | 54 | ||
| 73 | /*if (newSentence) | 55 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) |
| 74 | { | 56 | { |
| 75 | kgram newKgram(1, "."); | 57 | token word(canonize(*it)); |
| 76 | if (tstats[newKgram] == NULL) | 58 | |
| 59 | if (it->find_first_of(".?!,") != std::string::npos) | ||
| 77 | { | 60 | { |
| 78 | tstats[newKgram] = new std::map<std::string, token_data*>(); | 61 | word.terminating = true; |
| 79 | } | 62 | } |
| 80 | 63 | ||
| 81 | (*tstats[newKgram])[canonical] = td; | 64 | prefix.push_back(word); |
| 82 | |||
| 83 | newSentence = false; | ||
| 84 | } | 65 | } |
| 85 | 66 | ||
| 86 | if (newClause) | 67 | std::string f = tokens[i+k]; |
| 68 | std::string canonical = canonize(f); | ||
| 69 | |||
| 70 | token word(canonical); | ||
| 71 | if (f.find_first_of(".?!,") != std::string::npos) | ||
| 87 | { | 72 | { |
| 88 | kgram commaKgram(1, ","); | 73 | word.terminating = true; |
| 89 | if (tstats[commaKgram] == NULL) | ||
| 90 | { | ||
| 91 | tstats[commaKgram] = new std::map<std::string, token_data*>(); | ||
| 92 | } | ||
| 93 | 74 | ||
| 94 | (*tstats[commaKgram])[canonical] = td; | 75 | char terminator = f[f.find_last_of(".?!,")]; |
| 76 | int occurrences = std::count(f.begin(), f.end(), terminator); | ||
| 95 | 77 | ||
| 96 | newClause = false; | 78 | tendings[word][termstats(terminator, occurrences)]++; |
| 97 | } | ||
| 98 | |||
| 99 | if ((f.length() > 0) && (f[f.length()-1] == '\n')) | ||
| 100 | { | ||
| 101 | td->period++; | ||
| 102 | newSentence = true; | ||
| 103 | f.resize(f.length()-1); | ||
| 104 | } | 79 | } |
| 105 | 80 | ||
| 106 | if (f.length() > 0) | 81 | token_data& td = tstats[prefix][word]; |
| 82 | td.word = word; | ||
| 83 | td.all++; | ||
| 84 | |||
| 85 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
| 107 | { | 86 | { |
| 108 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) | 87 | td.uppercase++; |
| 109 | { | 88 | } else if (isupper(f[0])) |
| 110 | if (!newSentence) | 89 | { |
| 111 | { | 90 | td.titlecase++; |
| 112 | td->period++; | ||
| 113 | newSentence = true; | ||
| 114 | } | ||
| 115 | |||
| 116 | f.resize(f.length()-1); | ||
| 117 | } else if (f[f.length()-1] == ',') | ||
| 118 | { | ||
| 119 | if (!newSentence) | ||
| 120 | { | ||
| 121 | td->comma++; | ||
| 122 | newClause = true; | ||
| 123 | } | ||
| 124 | |||
| 125 | f.resize(f.length()-1); | ||
| 126 | } | ||
| 127 | } | 91 | } |
| 128 | 92 | ||
| 129 | if (f.length() > 0) | 93 | if (prefix.front().word.terminating) |
| 130 | { | 94 | { |
| 131 | if (f[0] == '"') | 95 | prefix.front() = wildcardQuery; |
| 132 | { | ||
| 133 | td->startquote++; | ||
| 134 | } | ||
| 135 | 96 | ||
| 136 | if (f[0] == '(') | 97 | token_data& td2 = tstats[prefix][word]; |
| 98 | td2.word = word; | ||
| 99 | td2.all++; | ||
| 100 | |||
| 101 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
| 137 | { | 102 | { |
| 138 | td->startparen++; | 103 | td2.uppercase++; |
| 139 | } | 104 | } else if (isupper(f[0])) |
| 140 | |||
| 141 | if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')')) | ||
| 142 | { | 105 | { |
| 143 | if (f[f.length()-1] == '"') | 106 | td2.titlecase++; |
| 144 | { | ||
| 145 | td->endquote++; | ||
| 146 | } else if (f[f.length()-1] == ')') | ||
| 147 | { | ||
| 148 | td->endparen++; | ||
| 149 | } | ||
| 150 | |||
| 151 | f.resize(f.length()-1); | ||
| 152 | |||
| 153 | if (f.length() > 0) | ||
| 154 | { | ||
| 155 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) | ||
| 156 | { | ||
| 157 | if (!newSentence) | ||
| 158 | { | ||
| 159 | td->period++; | ||
| 160 | newSentence = true; | ||
| 161 | } | ||
| 162 | } else if (f[f.length()-1] == ',') | ||
| 163 | { | ||
| 164 | if (!newSentence && !newClause) | ||
| 165 | { | ||
| 166 | td->comma++; | ||
| 167 | newClause = true; | ||
| 168 | } | ||
| 169 | } | ||
| 170 | } | ||
| 171 | } | ||
| 172 | }*/ | ||
| 173 | |||
| 174 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
| 175 | { | ||
| 176 | td->uppercase++; | ||
| 177 | } else if (isupper(f[0])) | ||
| 178 | { | ||
| 179 | td->titlecase++; | ||
| 180 | } | ||
| 181 | |||
| 182 | /*if (k != 0) | ||
| 183 | { | ||
| 184 | if (newSentence) | ||
| 185 | { | ||
| 186 | i += k; | ||
| 187 | } | 107 | } |
| 188 | 108 | } | |
| 189 | newSentence = false; | 109 | } |
| 190 | newClause = false; | 110 | } |
| 191 | }*/ | ||
| 192 | } | ||
| 193 | } | ||
| 194 | 111 | ||
| 195 | stats = new std::map<kgram, std::map<int, token_data*>* >(); | 112 | for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) |
| 196 | for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) | 113 | { |
| 197 | { | 114 | kgram klist = it->first; |
| 198 | kgram klist = it->first; | 115 | std::map<token, token_data>& probtable = it->second; |
| 199 | std::map<std::string, token_data*>* probtable = it->second; | 116 | std::map<int, token_data>& distribution = stats[klist]; |
| 200 | std::map<int, token_data*>* distribution = new std::map<int, token_data*>(); | 117 | int max = 0; |
| 201 | int max = 0; | ||
| 202 | 118 | ||
| 203 | for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) | 119 | for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) |
| 204 | { | 120 | { |
| 205 | max += kt->second->all; | 121 | max += kt->second.all; |
| 206 | 122 | ||
| 207 | (*distribution)[max] = kt->second; | 123 | distribution[max] = kt->second; |
| 208 | } | 124 | } |
| 209 | 125 | } | |
| 210 | (*stats)[klist] = distribution; | 126 | |
| 211 | } | 127 | for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++) |
| 128 | { | ||
| 129 | token word = it->first; | ||
| 130 | std::map<termstats, int>& probtable = it->second; | ||
| 131 | std::map<int, termstats>& distribution = endings[word]; | ||
| 132 | int max = 0; | ||
| 133 | |||
| 134 | for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | ||
| 135 | { | ||
| 136 | max += kt->second; | ||
| 137 | |||
| 138 | distribution[max] = kt->first; | ||
| 139 | } | ||
| 140 | } | ||
| 212 | } | 141 | } |
| 213 | 142 | ||
| 214 | void printKgram(kgram k) | 143 | void printKgram(kgram k) |
| 215 | { | 144 | { |
| 216 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 145 | for (kgram::iterator it = k.begin(); it != k.end(); it++) |
| 217 | { | 146 | { |
| 218 | std::cout << *it << " "; | 147 | query& q = *it; |
| 219 | } | 148 | if (q.type == querytype_sentence) |
| 149 | { | ||
| 150 | std::cout << "#.# "; | ||
| 151 | } else if (q.type == querytype_literal) | ||
| 152 | { | ||
| 153 | if (q.word.terminating) | ||
| 154 | { | ||
| 155 | std::cout << q.word.canon << ". "; | ||
| 156 | } else { | ||
| 157 | std::cout << q.word.canon << " "; | ||
| 158 | } | ||
| 159 | } | ||
| 160 | } | ||
| 220 | } | 161 | } |
| 221 | 162 | ||
| 222 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 163 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
| 223 | std::vector<std::string> kgramstats::randomSentence(int n) | 164 | std::vector<std::string> kgramstats::randomSentence(int n) |
| 224 | { | 165 | { |
| 225 | std::vector<std::string> result; | 166 | std::vector<std::string> result; |
| 226 | kgram newKgram(1, "."); | 167 | kgram cur(1, wildcardQuery); |
| 227 | kgram commaKgram(1, ","); | ||
| 228 | std::list<std::string> cur; | ||
| 229 | int cuts = 0; | 168 | int cuts = 0; |
| 230 | 169 | ||
| 231 | for (int i=0; i<n; i++) | 170 | for (int i=0; i<n; i++) |
| 232 | { | 171 | { |
| 233 | if (cur.size() == maxK) | 172 | if (cur.size() == maxK) |
| 234 | { | 173 | { |
| 235 | cur.pop_front(); | 174 | cur.pop_front(); |
| 236 | } | 175 | } |
| 237 | 176 | ||
| 238 | if ((cur.size() > 0) && (cur != newKgram)) | 177 | if (cur.size() > 0) |
| 239 | { | 178 | { |
| 240 | if (rand() % (maxK - cur.size() + 1) == 0) | 179 | if (rand() % (maxK - cur.size() + 1) == 0) |
| 241 | { | 180 | { |
| @@ -253,20 +192,19 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 253 | 192 | ||
| 254 | cuts++; | 193 | cuts++; |
| 255 | } | 194 | } |
| 195 | |||
| 196 | // Gotta circumvent the last line of the input corpus | ||
| 197 | // https://twitter.com/starla4444/status/684222271339237376 | ||
| 198 | if (stats.count(cur) == 0) | ||
| 199 | { | ||
| 200 | cur = kgram(1, wildcardQuery); | ||
| 201 | } | ||
| 256 | 202 | ||
| 257 | std::map<int, token_data*> distribution = *(*stats)[cur]; | 203 | std::map<int, token_data>& distribution = stats[cur]; |
| 258 | int max = distribution.rbegin()->first; | 204 | int max = distribution.rbegin()->first; |
| 259 | int r = rand() % max; | 205 | int r = rand() % max; |
| 260 | token_data* next = distribution.upper_bound(r)->second; | 206 | token_data& next = distribution.upper_bound(r)->second; |
| 261 | 207 | std::string nextToken(next.word.canon); | |
| 262 | std::string nextToken(*(next->token)); | ||
| 263 | int casing = rand() % next->all; | ||
| 264 | /*int period = rand() % next->all; | ||
| 265 | int startparen = rand() % next->all; | ||
| 266 | int endparen = rand() % next->all; | ||
| 267 | int startquote = rand() % next->all; | ||
| 268 | int endquote = rand() % next->all; | ||
| 269 | int comma = rand() % next->all;*/ | ||
| 270 | 208 | ||
| 271 | bool mess = (rand() % 100) == 0; | 209 | bool mess = (rand() % 100) == 0; |
| 272 | if (mess) | 210 | if (mess) |
| @@ -274,114 +212,64 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
| 274 | nextToken = mstats.alternate(nextToken); | 212 | nextToken = mstats.alternate(nextToken); |
| 275 | } | 213 | } |
| 276 | 214 | ||
| 277 | if (casing < next->uppercase) | 215 | // Determine the casing of the next token. We randomly make the token all |
| 278 | { | 216 | // caps based on the markov chain. Otherwise, we check if the previous |
| 279 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 217 | // token is the end of a sentence (terminating token or a wildcard query). |
| 280 | } | 218 | int casing = rand() % next.all; |
| 281 | 219 | if (casing < next.uppercase) | |
| 282 | if ((cur == newKgram) && (rand() % 15 > 0)) | 220 | { |
| 221 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
| 222 | } else if ((((cur.rbegin()->type == querytype_sentence) | ||
| 223 | || ((cur.rbegin()->type == querytype_literal) | ||
| 224 | && (cur.rbegin()->word.terminating))) | ||
| 225 | && (rand() % 2 > 0)) | ||
| 226 | || (casing - next.uppercase < next.titlecase)) | ||
| 283 | { | 227 | { |
| 284 | nextToken[0] = toupper(nextToken[0]); | 228 | nextToken[0] = toupper(nextToken[0]); |
| 285 | } | 229 | } |
| 286 | 230 | ||
| 287 | /*if (startquote < next->startquote) | 231 | if (next.word.terminating) |
| 288 | { | ||
| 289 | nextToken = "\"" + nextToken; | ||
| 290 | } else if (startparen < next->startparen) | ||
| 291 | { | 232 | { |
| 292 | nextToken = "(" + nextToken; | 233 | std::map<int, termstats>& ending = endings[next.word]; |
| 234 | int emax = ending.rbegin()->first; | ||
| 235 | int er = rand() % emax; | ||
| 236 | termstats& nextend = ending.upper_bound(er)->second; | ||
| 237 | |||
| 238 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); | ||
| 293 | } | 239 | } |
| 294 | |||
| 295 | if (period < next->period) | ||
| 296 | { | ||
| 297 | if (endquote < next->endquote) | ||
| 298 | { | ||
| 299 | nextToken += "\""; | ||
| 300 | } else if (endparen < next->endparen) | ||
| 301 | { | ||
| 302 | nextToken += ")"; | ||
| 303 | } | ||
| 304 | |||
| 305 | int type = rand() % 6; | ||
| 306 | |||
| 307 | if (type < 3) | ||
| 308 | { | ||
| 309 | nextToken += "."; | ||
| 310 | } else if (type < 5) | ||
| 311 | { | ||
| 312 | nextToken += "!"; | ||
| 313 | } else { | ||
| 314 | nextToken += "?"; | ||
| 315 | } | ||
| 316 | } else if (comma < next->comma) | ||
| 317 | { | ||
| 318 | if (endquote < next->endquote) | ||
| 319 | { | ||
| 320 | nextToken += "\""; | ||
| 321 | } else if (endparen < next->endparen) | ||
| 322 | { | ||
| 323 | nextToken += ")"; | ||
| 324 | } | ||
| 325 | |||
| 326 | nextToken += ","; | ||
| 327 | }*/ | ||
| 328 | 240 | ||
| 329 | /* DEBUG */ | 241 | /* DEBUG */ |
| 330 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) | 242 | printKgram(cur); |
| 331 | { | ||
| 332 | std::cout << *it << " "; | ||
| 333 | } | ||
| 334 | 243 | ||
| 335 | std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")"; | 244 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; |
| 336 | 245 | ||
| 337 | if (mess) | 246 | if (mess) |
| 338 | { | 247 | { |
| 339 | std::cout << " mala " << *(next->token); | 248 | std::cout << " mala " << next.word.canon; |
| 340 | } | 249 | } |
| 341 | 250 | ||
| 342 | std::cout << std::endl; | 251 | std::cout << std::endl; |
| 343 | 252 | ||
| 344 | /*if ((cur == newKgram) || (cur == commaKgram)) | 253 | cur.push_back(next.word); |
| 345 | { | ||
| 346 | cur.pop_front(); | ||
| 347 | } | ||
| 348 | |||
| 349 | if (period < next->period)// && ((rand() % 3) != 0)) | ||
| 350 | { | ||
| 351 | cur = newKgram; | ||
| 352 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) | ||
| 353 | { | ||
| 354 | cur = commaKgram; | ||
| 355 | } else {*/ | ||
| 356 | //if (mess && (rand() % 2 == 0)) | ||
| 357 | if (false) | ||
| 358 | { | ||
| 359 | // This doesn't work because sometimes the alternate token isn't actually present in the original corpus | ||
| 360 | cur.clear(); | ||
| 361 | cur.push_back(nextToken); | ||
| 362 | } else { | ||
| 363 | cur.push_back(*(next->token)); | ||
| 364 | } | ||
| 365 | //} | ||
| 366 | 254 | ||
| 367 | result.push_back(nextToken); | 255 | result.push_back(nextToken); |
| 368 | } | 256 | } |
| 369 | 257 | ||
| 370 | return result; | 258 | return result; |
| 371 | } | 259 | } |
| 372 | 260 | ||
| 373 | bool removeIf(char c) | 261 | bool removeIf(char c) |
| 374 | { | 262 | { |
| 375 | return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n')); | 263 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/); |
| 376 | } | 264 | } |
| 377 | 265 | ||
| 378 | std::string canonize(std::string f) | 266 | std::string canonize(std::string f) |
| 379 | { | 267 | { |
| 380 | std::string canonical(f); | 268 | std::string canonical(f); |
| 381 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | 269 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); |
| 382 | 270 | ||
| 383 | std::string result; | 271 | std::string result; |
| 384 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 272 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); |
| 385 | 273 | ||
| 386 | return canonical; | 274 | return result; |
| 387 | } | 275 | } |
| diff --git a/kgramstats.h b/kgramstats.h index b01dece..ca61df7 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -7,7 +7,71 @@ | |||
| 7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
| 8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
| 9 | 9 | ||
| 10 | typedef std::list<std::string> kgram; | 10 | struct token { |
| 11 | std::string canon; | ||
| 12 | bool terminating; | ||
| 13 | |||
| 14 | token(std::string canon) : canon(canon), terminating(false) {} | ||
| 15 | |||
| 16 | bool operator<(const token& other) const | ||
| 17 | { | ||
| 18 | if (canon == other.canon) | ||
| 19 | { | ||
| 20 | return !terminating && other.terminating; | ||
| 21 | } else { | ||
| 22 | return canon < other.canon; | ||
| 23 | } | ||
| 24 | } | ||
| 25 | }; | ||
| 26 | |||
| 27 | enum querytype { | ||
| 28 | querytype_literal, | ||
| 29 | querytype_sentence | ||
| 30 | }; | ||
| 31 | |||
| 32 | struct query { | ||
| 33 | querytype type; | ||
| 34 | token word; | ||
| 35 | |||
| 36 | query(token word) : word(word), type(querytype_literal) {} | ||
| 37 | |||
| 38 | query(querytype type) : word(""), type(type) {} | ||
| 39 | |||
| 40 | bool operator<(const query& other) const | ||
| 41 | { | ||
| 42 | if (type == other.type) | ||
| 43 | { | ||
| 44 | return word < other.word; | ||
| 45 | } else { | ||
| 46 | return type < other.type; | ||
| 47 | } | ||
| 48 | } | ||
| 49 | }; | ||
| 50 | |||
| 51 | typedef std::list<query> kgram; | ||
| 52 | |||
| 53 | struct termstats { | ||
| 54 | char terminator; | ||
| 55 | int occurrences; | ||
| 56 | |||
| 57 | termstats() : terminator('.'), occurrences(1) {} | ||
| 58 | |||
| 59 | termstats(char terminator, int occurrences) | ||
| 60 | { | ||
| 61 | this->terminator = terminator; | ||
| 62 | this->occurrences = occurrences; | ||
| 63 | } | ||
| 64 | |||
| 65 | bool operator<(const termstats& other) const | ||
| 66 | { | ||
| 67 | if (terminator == other.terminator) | ||
| 68 | { | ||
| 69 | return occurrences < other.occurrences; | ||
| 70 | } else { | ||
| 71 | return terminator < other.terminator; | ||
| 72 | } | ||
| 73 | } | ||
| 74 | }; | ||
| 11 | 75 | ||
| 12 | class kgramstats | 76 | class kgramstats |
| 13 | { | 77 | { |
| @@ -16,22 +80,20 @@ public: | |||
| 16 | std::vector<std::string> randomSentence(int n); | 80 | std::vector<std::string> randomSentence(int n); |
| 17 | 81 | ||
| 18 | private: | 82 | private: |
| 19 | typedef struct | 83 | struct token_data |
| 20 | { | 84 | { |
| 21 | int all; | 85 | int all; |
| 22 | int titlecase; | 86 | int titlecase; |
| 23 | int uppercase; | 87 | int uppercase; |
| 24 | int period; | 88 | token word; |
| 25 | int startquote; | 89 | |
| 26 | int endquote; | 90 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} |
| 27 | int startparen; | 91 | }; |
| 28 | int endparen; | 92 | |
| 29 | int comma; | ||
| 30 | std::string* token; | ||
| 31 | } token_data; | ||
| 32 | int maxK; | 93 | int maxK; |
| 33 | std::map<kgram, std::map<int, token_data*>* >* stats; | 94 | std::map<kgram, std::map<int, token_data> > stats; |
| 34 | malaprop mstats; | 95 | malaprop mstats; |
| 96 | std::map<token, std::map<int, termstats> > endings; | ||
| 35 | }; | 97 | }; |
| 36 | 98 | ||
| 37 | void printKgram(kgram k); | 99 | void printKgram(kgram k); |
| diff --git a/malaprop.cpp b/malaprop.cpp index 7fbdb6c..ccdd4c4 100644 --- a/malaprop.cpp +++ b/malaprop.cpp | |||
| @@ -117,6 +117,11 @@ std::string malaprop::alternate(std::string word) | |||
| 117 | { | 117 | { |
| 118 | soundex ex = soundify(word); | 118 | soundex ex = soundify(word); |
| 119 | std::set<std::string>& opts = dict[ex]; | 119 | std::set<std::string>& opts = dict[ex]; |
| 120 | if (opts.size() == 0) | ||
| 121 | { | ||
| 122 | return word; | ||
| 123 | } | ||
| 124 | |||
| 120 | int opt = rand() % opts.size(); | 125 | int opt = rand() % opts.size(); |
| 121 | for (std::set<std::string>::iterator it = opts.begin(); it != opts.end(); it++) | 126 | for (std::set<std::string>::iterator it = opts.begin(); it != opts.end(); it++) |
| 122 | { | 127 | { |
