 kgramstats.cpp | 115 ++++++++++++++++++++++++++++++++++++++++++-------------
 kgramstats.h   |  24 ++++++++++++++++++------
 2 files changed, 102 insertions(+), 37 deletions(-)
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 258e92a..4bb7f15 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -43,6 +43,30 @@ query wildcardQuery(querytype_sentence);
 
 std::string canonize(std::string f);
 
+token token_from_string(std::string in)
+{
+  if (in[0] == '#')
+  {
+    token word(tokentype_hashtag);
+
+    if (in.find_first_of(".?!,") != std::string::npos)
+    {
+      word.terminating = true;
+    }
+
+    return word;
+  } else {
+    token word(canonize(in));
+
+    if (in.find_first_of(".?!,") != std::string::npos)
+    {
+      word.terminating = true;
+    }
+
+    return word;
+  }
+}
+
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
 kgramstats::kgramstats(std::string corpus, int maxK)
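The new token_from_string() helper centralizes tokenization that the later hunks previously duplicated at each call site. A minimal standalone sketch of its behavior follows; the enum and struct mirror this patch, but the simplified canonize() here is an assumption (lowercase, strip non-alphanumerics), not the file's actual implementation:

#include <cassert>
#include <cctype>
#include <string>

enum tokentype { tokentype_literal, tokentype_hashtag };

struct token {
  tokentype type;
  std::string canon;
  bool terminating;
  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
  token(tokentype type) : type(type), canon(""), terminating(false) {}
};

// Assumed behavior only: lowercase and drop anything that isn't alphanumeric.
std::string canonize(std::string f)
{
  std::string result;
  for (std::string::size_type i = 0; i < f.length(); i++)
  {
    if (isalnum((unsigned char)f[i]))
    {
      result += tolower((unsigned char)f[i]);
    }
  }
  return result;
}

// Same logic as the patch, condensed: hashtags keep no canonical text, and
// either kind of token is marked terminating if it carries .?!, punctuation.
token token_from_string(std::string in)
{
  token word = (in[0] == '#') ? token(tokentype_hashtag) : token(canonize(in));

  if (in.find_first_of(".?!,") != std::string::npos)
  {
    word.terminating = true;
  }

  return word;
}

int main()
{
  token tag = token_from_string("#winning!");
  assert(tag.type == tokentype_hashtag && tag.terminating && tag.canon.empty());

  token lit = token_from_string("Hello,");
  assert(lit.type == tokentype_literal && lit.terminating && lit.canon == "hello");

  return 0;
}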
@@ -52,6 +76,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   std::vector<std::string> tokens;
   size_t start = 0;
   int end = 0;
+  std::set<std::string> thashtags;
 
   while (end != std::string::npos)
   {
@@ -72,10 +97,20 @@ kgramstats::kgramstats(std::string corpus, int maxK)
     {
       mstats.addWord(token);
       tokens.push_back(token);
+
+      if (token[0] == '#')
+      {
+        thashtags.insert(canonize(token));
+      }
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
+
+  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  {
+    hashtags.push_back(*it);
+  }
 
   std::map<kgram, std::map<token, token_data> > tstats;
   std::map<token, std::map<termstats, int> > tendings;
@@ -88,20 +123,13 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 
     for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
     {
-      token word(canonize(*it));
-
-      if (it->find_first_of(".?!,") != std::string::npos)
-      {
-        word.terminating = true;
-      }
-
-      prefix.push_back(word);
+      prefix.push_back(token_from_string(*it));
     }
 
     std::string f = tokens[i+k];
     std::string canonical = canonize(f);
 
-    token word(canonical);
+    token word(token_from_string(canonical));
     if (f.find_first_of(".?!,") != std::string::npos)
     {
       word.terminating = true;
@@ -184,11 +212,22 @@ void printKgram(kgram k)
       std::cout << "#.# ";
     } else if (q.type == querytype_literal)
     {
-      if (q.word.terminating)
+      if (q.word.type == tokentype_hashtag)
+      {
+        if (q.word.terminating)
+        {
+          std::cout << "#hashtag. ";
+        } else {
+          std::cout << "#hashtag ";
+        }
+      } else if (q.word.type == tokentype_literal)
       {
-        std::cout << q.word.canon << ". ";
-      } else {
-        std::cout << q.word.canon << " ";
+        if (q.word.terminating)
+        {
+          std::cout << q.word.canon << ". ";
+        } else {
+          std::cout << q.word.canon << " ";
+        }
       }
     }
   }
@@ -238,37 +277,47 @@ std::vector<std::string> kgramstats::randomSentence(int n)
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken(next.word.canon);
+    std::string nextToken;
+    bool mess = false;
 
-    bool mess = (rand() % 100) == 0;
-    if (mess)
+    if (next.word.type == tokentype_literal)
     {
-      nextToken = mstats.alternate(nextToken);
-    }
+      nextToken = next.word.canon;
 
-    // Determine the casing of the next token. We randomly make the token all
-    // caps based on the markov chain. Otherwise, we check if the previous
-    // token is the end of a sentence (terminating token or a wildcard query).
-    int casing = rand() % next.all;
-    if (casing < next.uppercase)
-    {
-      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-    } else if ((((cur.rbegin()->type == querytype_sentence)
-        || ((cur.rbegin()->type == querytype_literal)
-          && (cur.rbegin()->word.terminating)))
-        && (rand() % 2 > 0))
-      || (casing - next.uppercase < next.titlecase))
+      mess = (rand() % 100) == 0;
+      if (mess)
+      {
+        nextToken = mstats.alternate(nextToken);
+      }
+
+      // Determine the casing of the next token. We randomly make the token all
+      // caps based on the markov chain. Otherwise, we check if the previous
+      // token is the end of a sentence (terminating token or a wildcard query).
+      int casing = rand() % next.all;
+      if (casing < next.uppercase)
+      {
+        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+      } else if ((((cur.rbegin()->type == querytype_sentence)
+          || ((cur.rbegin()->type == querytype_literal)
+            && (cur.rbegin()->word.terminating)))
+          && (rand() % 2 > 0))
+        || (casing - next.uppercase < next.titlecase))
+      {
+        nextToken[0] = toupper(nextToken[0]);
+      }
+    } else if (next.word.type == tokentype_hashtag)
     {
-      nextToken[0] = toupper(nextToken[0]);
+      int rhash = rand() % hashtags.size();
+      nextToken = hashtags[rhash];
     }
 
     if (next.word.terminating)
     {
       std::map<int, termstats>& ending = endings[next.word];
       int emax = ending.rbegin()->first;
       int er = rand() % emax;
       termstats& nextend = ending.upper_bound(er)->second;
 
       nextToken.append(std::string(nextend.occurrences, nextend.terminator));
     }
 
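The generation loop above samples from std::map distributions keyed by cumulative weight, and the new tokentype_hashtag branch swaps in a uniformly random tag from the hashtags vector (collected in a std::set during parsing, then copied into a vector so it can be indexed in O(1)). A small sketch of that cumulative-weight sampling pattern, with illustrative weights and words rather than anything from this repository:

#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <string>

int main()
{
  srand(time(NULL));

  // Keys are running totals of each word's weight:
  // "cat" has weight 3, "dog" has weight 1, the hashtag marker has weight 2.
  std::map<int, std::string> distribution;
  distribution[3] = "cat";
  distribution[4] = "dog";
  distribution[6] = "#hashtag";

  int max = distribution.rbegin()->first; // total weight, 6
  int r = rand() % max;                   // uniform in [0, 5]

  // upper_bound(r) returns the first key strictly greater than r, so
  // r in [0,2] picks "cat", r == 3 picks "dog", r in [4,5] picks "#hashtag".
  std::cout << distribution.upper_bound(r)->second << std::endl;

  return 0;
}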
diff --git a/kgramstats.h b/kgramstats.h
index ca61df7..ff2fc66 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -7,19 +7,34 @@
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
 
+enum tokentype {
+  tokentype_literal,
+  tokentype_hashtag
+};
+
 struct token {
+  tokentype type;
   std::string canon;
   bool terminating;
 
-  token(std::string canon) : canon(canon), terminating(false) {}
+  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
+  token(tokentype type) : type(type), canon(""), terminating(false) {}
 
   bool operator<(const token& other) const
   {
-    if (canon == other.canon)
+    if (type != other.type)
     {
-      return !terminating && other.terminating;
+      return type < other.type;
+    } else if (type == tokentype_literal)
+    {
+      if (canon == other.canon)
+      {
+        return !terminating && other.terminating;
+      } else {
+        return canon < other.canon;
+      }
     } else {
-      return canon < other.canon;
+      return !terminating && other.terminating;
     }
   }
 };
@@ -94,6 +109,7 @@ private:
   std::map<kgram, std::map<int, token_data> > stats;
   malaprop mstats;
   std::map<token, std::map<int, termstats> > endings;
+  std::vector<std::string> hashtags;
 };
 
 void printKgram(kgram k);
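One consequence of the reworked operator< worth spelling out: because hashtag tokens compare on the terminating flag alone, every hashtag in the corpus collapses to the same map key, which is what lets randomSentence() treat tags as interchangeable and substitute any entry from the new hashtags member. A standalone sketch of that pooling behavior; the token struct is copied from this patch, while the main() driver and tag names are illustrative:

#include <cassert>
#include <map>
#include <string>

enum tokentype { tokentype_literal, tokentype_hashtag };

struct token {
  tokentype type;
  std::string canon;
  bool terminating;
  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
  token(tokentype type) : type(type), canon(""), terminating(false) {}

  bool operator<(const token& other) const
  {
    if (type != other.type)
    {
      return type < other.type;
    } else if (type == tokentype_literal)
    {
      if (canon == other.canon)
      {
        return !terminating && other.terminating;
      } else {
        return canon < other.canon;
      }
    } else {
      return !terminating && other.terminating;
    }
  }
};

int main()
{
  std::map<token, int> counts;
  counts[token(tokentype_hashtag)]++; // stands in for "#cats"
  counts[token(tokentype_hashtag)]++; // stands in for "#dogs": same key
  counts[token("bird")]++;

  // The two hashtags pooled into one entry; the literal got its own.
  assert(counts.size() == 2);
  assert(counts.find(token(tokentype_hashtag))->second == 2);

  return 0;
}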