diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-25 09:13:14 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-25 09:13:14 -0500 |
commit | 1d15f748200f093d869c6fcc38d6053903ff5062 (patch) | |
tree | 3b980a35fa2c0ef2e206786ab40d3431fe8c2ec2 /kgramstats.cpp | |
parent | 5151a32192f3efdf3efb6c54f703a8f9ba83335a (diff) | |
download | rawr-ebooks-1d15f748200f093d869c6fcc38d6053903ff5062.tar.gz rawr-ebooks-1d15f748200f093d869c6fcc38d6053903ff5062.tar.bz2 rawr-ebooks-1d15f748200f093d869c6fcc38d6053903ff5062.zip |
hashtags are now randomized
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- | kgramstats.cpp | 115 |
1 file changed, 82 insertions, 33 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 258e92a..4bb7f15 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -43,6 +43,30 @@ query wildcardQuery(querytype_sentence); | |||
43 | 43 | ||
44 | std::string canonize(std::string f); | 44 | std::string canonize(std::string f); |
45 | 45 | ||
46 | token token_from_string(std::string in) | ||
47 | { | ||
48 | if (in[0] == '#') | ||
49 | { | ||
50 | token word(tokentype_hashtag); | ||
51 | |||
52 | if (in.find_first_of(".?!,") != std::string::npos) | ||
53 | { | ||
54 | word.terminating = true; | ||
55 | } | ||
56 | |||
57 | return word; | ||
58 | } else { | ||
59 | token word(canonize(in)); | ||
60 | |||
61 | if (in.find_first_of(".?!,") != std::string::npos) | ||
62 | { | ||
63 | word.terminating = true; | ||
64 | } | ||
65 | |||
66 | return word; | ||
67 | } | ||
68 | } | ||
69 | |||
46 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 70 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
47 | // We consider maxK to be fairly constant | 71 | // We consider maxK to be fairly constant |
48 | kgramstats::kgramstats(std::string corpus, int maxK) | 72 | kgramstats::kgramstats(std::string corpus, int maxK) |
@@ -52,6 +76,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
52 | std::vector<std::string> tokens; | 76 | std::vector<std::string> tokens; |
53 | size_t start = 0; | 77 | size_t start = 0; |
54 | int end = 0; | 78 | int end = 0; |
79 | std::set<std::string> thashtags; | ||
55 | 80 | ||
56 | while (end != std::string::npos) | 81 | while (end != std::string::npos) |
57 | { | 82 | { |
@@ -72,10 +97,20 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
72 | { | 97 | { |
73 | mstats.addWord(token); | 98 | mstats.addWord(token); |
74 | tokens.push_back(token); | 99 | tokens.push_back(token); |
100 | |||
101 | if (token[0] == '#') | ||
102 | { | ||
103 | thashtags.insert(canonize(token)); | ||
104 | } | ||
75 | } | 105 | } |
76 | 106 | ||
77 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 107 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
78 | } | 108 | } |
109 | |||
110 | for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++) | ||
111 | { | ||
112 | hashtags.push_back(*it); | ||
113 | } | ||
79 | 114 | ||
80 | std::map<kgram, std::map<token, token_data> > tstats; | 115 | std::map<kgram, std::map<token, token_data> > tstats; |
81 | std::map<token, std::map<termstats, int> > tendings; | 116 | std::map<token, std::map<termstats, int> > tendings; |
@@ -88,20 +123,13 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
88 | 123 | ||
89 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) | 124 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) |
90 | { | 125 | { |
91 | token word(canonize(*it)); | 126 | prefix.push_back(token_from_string(*it)); |
92 | |||
93 | if (it->find_first_of(".?!,") != std::string::npos) | ||
94 | { | ||
95 | word.terminating = true; | ||
96 | } | ||
97 | |||
98 | prefix.push_back(word); | ||
99 | } | 127 | } |
100 | 128 | ||
101 | std::string f = tokens[i+k]; | 129 | std::string f = tokens[i+k]; |
102 | std::string canonical = canonize(f); | 130 | std::string canonical = canonize(f); |
103 | 131 | ||
104 | token word(canonical); | 132 | token word(token_from_string(canonical)); |
105 | if (f.find_first_of(".?!,") != std::string::npos) | 133 | if (f.find_first_of(".?!,") != std::string::npos) |
106 | { | 134 | { |
107 | word.terminating = true; | 135 | word.terminating = true; |
@@ -184,11 +212,22 @@ void printKgram(kgram k) | |||
184 | std::cout << "#.# "; | 212 | std::cout << "#.# "; |
185 | } else if (q.type == querytype_literal) | 213 | } else if (q.type == querytype_literal) |
186 | { | 214 | { |
187 | if (q.word.terminating) | 215 | if (q.word.type == tokentype_hashtag) |
216 | { | ||
217 | if (q.word.terminating) | ||
218 | { | ||
219 | std::cout << "#hashtag. "; | ||
220 | } else { | ||
221 | std::cout << "#hashtag "; | ||
222 | } | ||
223 | } else if (q.word.type == tokentype_literal) | ||
188 | { | 224 | { |
189 | std::cout << q.word.canon << ". "; | 225 | if (q.word.terminating) |
190 | } else { | 226 | { |
191 | std::cout << q.word.canon << " "; | 227 | std::cout << q.word.canon << ". "; |
228 | } else { | ||
229 | std::cout << q.word.canon << " "; | ||
230 | } | ||
192 | } | 231 | } |
193 | } | 232 | } |
194 | } | 233 | } |
@@ -238,37 +277,47 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
238 | int max = distribution.rbegin()->first; | 277 | int max = distribution.rbegin()->first; |
239 | int r = rand() % max; | 278 | int r = rand() % max; |
240 | token_data& next = distribution.upper_bound(r)->second; | 279 | token_data& next = distribution.upper_bound(r)->second; |
241 | std::string nextToken(next.word.canon); | 280 | std::string nextToken; |
281 | bool mess = false; | ||
242 | 282 | ||
243 | bool mess = (rand() % 100) == 0; | 283 | if (next.word.type == tokentype_literal) |
244 | if (mess) | ||
245 | { | 284 | { |
246 | nextToken = mstats.alternate(nextToken); | 285 | nextToken = next.word.canon; |
247 | } | ||
248 | 286 | ||
249 | // Determine the casing of the next token. We randomly make the token all | 287 | mess = (rand() % 100) == 0; |
250 | // caps based on the markov chain. Otherwise, we check if the previous | 288 | if (mess) |
251 | // token is the end of a sentence (terminating token or a wildcard query). | 289 | { |
252 | int casing = rand() % next.all; | 290 | nextToken = mstats.alternate(nextToken); |
253 | if (casing < next.uppercase) | 291 | } |
254 | { | 292 | |
255 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 293 | // Determine the casing of the next token. We randomly make the token all |
256 | } else if ((((cur.rbegin()->type == querytype_sentence) | 294 | // caps based on the markov chain. Otherwise, we check if the previous |
257 | || ((cur.rbegin()->type == querytype_literal) | 295 | // token is the end of a sentence (terminating token or a wildcard query). |
258 | && (cur.rbegin()->word.terminating))) | 296 | int casing = rand() % next.all; |
259 | && (rand() % 2 > 0)) | 297 | if (casing < next.uppercase) |
260 | || (casing - next.uppercase < next.titlecase)) | 298 | { |
299 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
300 | } else if ((((cur.rbegin()->type == querytype_sentence) | ||
301 | || ((cur.rbegin()->type == querytype_literal) | ||
302 | && (cur.rbegin()->word.terminating))) | ||
303 | && (rand() % 2 > 0)) | ||
304 | || (casing - next.uppercase < next.titlecase)) | ||
305 | { | ||
306 | nextToken[0] = toupper(nextToken[0]); | ||
307 | } | ||
308 | } else if (next.word.type == tokentype_hashtag) | ||
261 | { | 309 | { |
262 | nextToken[0] = toupper(nextToken[0]); | 310 | int rhash = rand() % hashtags.size(); |
311 | nextToken = hashtags[rhash]; | ||
263 | } | 312 | } |
264 | 313 | ||
265 | if (next.word.terminating) | 314 | if (next.word.terminating) |
266 | { | 315 | { |
267 | std::map<int, termstats>& ending = endings[next.word]; | 316 | std::map<int, termstats>& ending = endings[next.word]; |
268 | int emax = ending.rbegin()->first; | 317 | int emax = ending.rbegin()->first; |
269 | int er = rand() % emax; | 318 | int er = rand() % emax; |
270 | termstats& nextend = ending.upper_bound(er)->second; | 319 | termstats& nextend = ending.upper_bound(er)->second; |
271 | 320 | ||
272 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); | 321 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); |
273 | } | 322 | } |
274 | 323 | ||