| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-25 09:13:14 -0500 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-25 09:13:14 -0500 |
| commit | 1d15f748200f093d869c6fcc38d6053903ff5062 | |
| tree | 3b980a35fa2c0ef2e206786ab40d3431fe8c2ec2 | |
| parent | 5151a32192f3efdf3efb6c54f703a8f9ba83335a | |
hashtags are now randomized
| kgramstats.cpp | 115 |
|---|---|
| kgramstats.h | 24 |

2 files changed, 102 insertions, 37 deletions
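In short: rather than learning specific hashtags as ordinary words, the model now collapses every token beginning with `#` into a generic hashtag token during tokenization, while pooling the distinct (canonized) hashtags seen in the corpus. When sentence generation later emits a hashtag token, a uniformly random hashtag from that pool is substituted in. A minimal standalone sketch of that collect-then-substitute idea (illustrative names only, not code from this commit):

```cpp
#include <cstdlib>
#include <iostream>
#include <set>
#include <string>
#include <vector>

int main()
{
  // Corpus pass: pool each distinct hashtag, keep a generic placeholder.
  const char* corpus[] = {"i", "love", "#cats", "and", "#dogs", "#cats"};
  std::set<std::string> pool;
  std::vector<std::string> tokens;

  for (int i = 0; i < 6; i++)
  {
    std::string t(corpus[i]);
    if (t[0] == '#')
    {
      pool.insert(t);   // deduplicates, like thashtags in the commit
      t = "#hashtag";   // generic stand-in, like tokentype_hashtag
    }
    tokens.push_back(t);
  }

  std::vector<std::string> hashtags(pool.begin(), pool.end());

  // Generation pass: swap each placeholder for a random pooled hashtag.
  for (size_t i = 0; i < tokens.size(); i++)
  {
    std::string out = tokens[i];
    if (out == "#hashtag")
    {
      out = hashtags[rand() % hashtags.size()];
    }
    std::cout << out << " ";
  }
  std::cout << std::endl;
}
```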
```diff
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 258e92a..4bb7f15 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -43,6 +43,30 @@ query wildcardQuery(querytype_sentence);
 
 std::string canonize(std::string f);
 
+token token_from_string(std::string in)
+{
+  if (in[0] == '#')
+  {
+    token word(tokentype_hashtag);
+
+    if (in.find_first_of(".?!,") != std::string::npos)
+    {
+      word.terminating = true;
+    }
+
+    return word;
+  } else {
+    token word(canonize(in));
+
+    if (in.find_first_of(".?!,") != std::string::npos)
+    {
+      word.terminating = true;
+    }
+
+    return word;
+  }
+}
+
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
 kgramstats::kgramstats(std::string corpus, int maxK)
@@ -52,6 +76,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   std::vector<std::string> tokens;
   size_t start = 0;
   int end = 0;
+  std::set<std::string> thashtags;
 
   while (end != std::string::npos)
   {
@@ -72,10 +97,20 @@ kgramstats::kgramstats(std::string corpus, int maxK)
     {
       mstats.addWord(token);
       tokens.push_back(token);
+
+      if (token[0] == '#')
+      {
+        thashtags.insert(canonize(token));
+      }
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
+
+  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  {
+    hashtags.push_back(*it);
+  }
 
   std::map<kgram, std::map<token, token_data> > tstats;
   std::map<token, std::map<termstats, int> > tendings;
@@ -88,20 +123,13 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 
       for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
       {
-        token word(canonize(*it));
-
-        if (it->find_first_of(".?!,") != std::string::npos)
-        {
-          word.terminating = true;
-        }
-
-        prefix.push_back(word);
+        prefix.push_back(token_from_string(*it));
       }
 
       std::string f = tokens[i+k];
       std::string canonical = canonize(f);
 
-      token word(canonical);
+      token word(token_from_string(canonical));
       if (f.find_first_of(".?!,") != std::string::npos)
       {
         word.terminating = true;
@@ -184,11 +212,22 @@ void printKgram(kgram k)
       std::cout << "#.# ";
     } else if (q.type == querytype_literal)
     {
-      if (q.word.terminating)
+      if (q.word.type == tokentype_hashtag)
+      {
+        if (q.word.terminating)
+        {
+          std::cout << "#hashtag. ";
+        } else {
+          std::cout << "#hashtag ";
+        }
+      } else if (q.word.type == tokentype_literal)
       {
-        std::cout << q.word.canon << ". ";
-      } else {
-        std::cout << q.word.canon << " ";
+        if (q.word.terminating)
+        {
+          std::cout << q.word.canon << ". ";
+        } else {
+          std::cout << q.word.canon << " ";
+        }
       }
     }
   }
@@ -238,37 +277,47 @@ std::vector<std::string> kgramstats::randomSentence(int n)
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken(next.word.canon);
+    std::string nextToken;
+    bool mess = false;
 
-    bool mess = (rand() % 100) == 0;
-    if (mess)
+    if (next.word.type == tokentype_literal)
     {
-      nextToken = mstats.alternate(nextToken);
-    }
+      nextToken = next.word.canon;
 
-    // Determine the casing of the next token. We randomly make the token all
-    // caps based on the markov chain. Otherwise, we check if the previous
-    // token is the end of a sentence (terminating token or a wildcard query).
-    int casing = rand() % next.all;
-    if (casing < next.uppercase)
-    {
-      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-    } else if ((((cur.rbegin()->type == querytype_sentence)
-      || ((cur.rbegin()->type == querytype_literal)
-      && (cur.rbegin()->word.terminating)))
-      && (rand() % 2 > 0))
-      || (casing - next.uppercase < next.titlecase))
+      mess = (rand() % 100) == 0;
+      if (mess)
+      {
+        nextToken = mstats.alternate(nextToken);
+      }
+
+      // Determine the casing of the next token. We randomly make the token all
+      // caps based on the markov chain. Otherwise, we check if the previous
+      // token is the end of a sentence (terminating token or a wildcard query).
+      int casing = rand() % next.all;
+      if (casing < next.uppercase)
+      {
+        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+      } else if ((((cur.rbegin()->type == querytype_sentence)
+        || ((cur.rbegin()->type == querytype_literal)
+        && (cur.rbegin()->word.terminating)))
+        && (rand() % 2 > 0))
+        || (casing - next.uppercase < next.titlecase))
+      {
+        nextToken[0] = toupper(nextToken[0]);
+      }
+    } else if (next.word.type == tokentype_hashtag)
     {
-      nextToken[0] = toupper(nextToken[0]);
+      int rhash = rand() % hashtags.size();
+      nextToken = hashtags[rhash];
     }
 
     if (next.word.terminating)
     {
       std::map<int, termstats>& ending = endings[next.word];
       int emax = ending.rbegin()->first;
       int er = rand() % emax;
       termstats& nextend = ending.upper_bound(er)->second;
 
       nextToken.append(std::string(nextend.occurrences, nextend.terminator));
     }
 
```
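One caveat in the generation path above: if the corpus contained no hashtags, the `hashtags` vector stays empty and `rand() % hashtags.size()` is a modulo by zero, which is undefined behavior in C++. A defensive variant might look like the following sketch (a hypothetical helper, not part of this commit):

```cpp
#include <cstdlib>
#include <string>
#include <vector>

// Hypothetical guard: pick a random pooled hashtag, falling back to a
// plain placeholder when the corpus contained none.
std::string pickHashtag(const std::vector<std::string>& hashtags)
{
  if (hashtags.empty())
  {
    return "#hashtag"; // avoids rand() % 0, which is undefined behavior
  }

  return hashtags[rand() % hashtags.size()];
}
```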
```diff
diff --git a/kgramstats.h b/kgramstats.h
index ca61df7..ff2fc66 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -7,19 +7,34 @@
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
 
+enum tokentype {
+  tokentype_literal,
+  tokentype_hashtag
+};
+
 struct token {
+  tokentype type;
   std::string canon;
   bool terminating;
 
-  token(std::string canon) : canon(canon), terminating(false) {}
+  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
+  token(tokentype type) : type(type), canon(""), terminating(false) {}
 
   bool operator<(const token& other) const
   {
-    if (canon == other.canon)
+    if (type != other.type)
     {
-      return !terminating && other.terminating;
+      return type < other.type;
+    } else if (type == tokentype_literal)
+    {
+      if (canon == other.canon)
+      {
+        return !terminating && other.terminating;
+      } else {
+        return canon < other.canon;
+      }
     } else {
-      return canon < other.canon;
+      return !terminating && other.terminating;
     }
   }
 };
@@ -94,6 +109,7 @@ private:
   std::map<kgram, std::map<int, token_data> > stats;
   malaprop mstats;
   std::map<token, std::map<int, termstats> > endings;
+  std::vector<std::string> hashtags;
 };
 
 void printKgram(kgram k);
```
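The restructured `operator<` is what lets the generic hashtag token act as a single map key: tokens order first by `type`; literals then fall back to the old canon-then-terminating comparison, and hashtags (whose `canon` is always empty) compare on `terminating` alone. As a result, every hashtag with the same terminating flag collapses into one entry in maps like `stats` and `endings`. A standalone check of that behavior, with the struct copied from this patch:

```cpp
#include <iostream>
#include <map>
#include <string>

enum tokentype { tokentype_literal, tokentype_hashtag };

struct token {
  tokentype type;
  std::string canon;
  bool terminating;

  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
  token(tokentype type) : type(type), canon(""), terminating(false) {}

  bool operator<(const token& other) const
  {
    if (type != other.type)
    {
      return type < other.type;
    } else if (type == tokentype_literal)
    {
      if (canon == other.canon)
      {
        return !terminating && other.terminating;
      } else {
        return canon < other.canon;
      }
    } else {
      return !terminating && other.terminating;
    }
  }
};

int main()
{
  std::map<token, int> counts;
  counts[token(tokentype_hashtag)]++; // stood in for "#cats"
  counts[token(tokentype_hashtag)]++; // stood in for "#dogs"
  counts[token("cats")]++;            // a literal token keyed by canon

  // Prints 2: both hashtag tokens collapsed into the same key.
  std::cout << counts[token(tokentype_hashtag)] << std::endl;
  return 0;
}
```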
