about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- kgramstats.cpp 115
-rw-r--r-- kgramstats.h 24
2 files changed, 102 insertions, 37 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 258e92a..4bb7f15 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -43,6 +43,30 @@ query wildcardQuery(querytype_sentence);
43 43
44std::string canonize(std::string f); 44std::string canonize(std::string f);
45 45
46token token_from_string(std::string in)
47{
48 if (in[0] == '#')
49 {
50 token word(tokentype_hashtag);
51
52 if (in.find_first_of(".?!,") != std::string::npos)
53 {
54 word.terminating = true;
55 }
56
57 return word;
58 } else {
59 token word(canonize(in));
60
61 if (in.find_first_of(".?!,") != std::string::npos)
62 {
63 word.terminating = true;
64 }
65
66 return word;
67 }
68}
69
46// runs in O(t^2) time where t is the number of tokens in the input corpus 70// runs in O(t^2) time where t is the number of tokens in the input corpus
47// We consider maxK to be fairly constant 71// We consider maxK to be fairly constant
48kgramstats::kgramstats(std::string corpus, int maxK) 72kgramstats::kgramstats(std::string corpus, int maxK)
@@ -52,6 +76,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
52 std::vector<std::string> tokens; 76 std::vector<std::string> tokens;
53 size_t start = 0; 77 size_t start = 0;
54 int end = 0; 78 int end = 0;
79 std::set<std::string> thashtags;
55 80
56 while (end != std::string::npos) 81 while (end != std::string::npos)
57 { 82 {
@@ -72,10 +97,20 @@ kgramstats::kgramstats(std::string corpus, int maxK)
72 { 97 {
73 mstats.addWord(token); 98 mstats.addWord(token);
74 tokens.push_back(token); 99 tokens.push_back(token);
100
101 if (token[0] == '#')
102 {
103 thashtags.insert(canonize(token));
104 }
75 } 105 }
76 106
77 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 107 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
78 } 108 }
109
110 for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
111 {
112 hashtags.push_back(*it);
113 }
79 114
80 std::map<kgram, std::map<token, token_data> > tstats; 115 std::map<kgram, std::map<token, token_data> > tstats;
81 std::map<token, std::map<termstats, int> > tendings; 116 std::map<token, std::map<termstats, int> > tendings;
@@ -88,20 +123,13 @@ kgramstats::kgramstats(std::string corpus, int maxK)
88 123
89 for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) 124 for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
90 { 125 {
91 token word(canonize(*it)); 126 prefix.push_back(token_from_string(*it));
92
93 if (it->find_first_of(".?!,") != std::string::npos)
94 {
95 word.terminating = true;
96 }
97
98 prefix.push_back(word);
99 } 127 }
100 128
101 std::string f = tokens[i+k]; 129 std::string f = tokens[i+k];
102 std::string canonical = canonize(f); 130 std::string canonical = canonize(f);
103 131
104 token word(canonical); 132 token word(token_from_string(canonical));
105 if (f.find_first_of(".?!,") != std::string::npos) 133 if (f.find_first_of(".?!,") != std::string::npos)
106 { 134 {
107 word.terminating = true; 135 word.terminating = true;
@@ -184,11 +212,22 @@ void printKgram(kgram k)
184 std::cout << "#.# "; 212 std::cout << "#.# ";
185 } else if (q.type == querytype_literal) 213 } else if (q.type == querytype_literal)
186 { 214 {
187 if (q.word.terminating) 215 if (q.word.type == tokentype_hashtag)
216 {
217 if (q.word.terminating)
218 {
219 std::cout << "#hashtag. ";
220 } else {
221 std::cout << "#hashtag ";
222 }
223 } else if (q.word.type == tokentype_literal)
188 { 224 {
189 std::cout << q.word.canon << ". "; 225 if (q.word.terminating)
190 } else { 226 {
191 std::cout << q.word.canon << " "; 227 std::cout << q.word.canon << ". ";
228 } else {
229 std::cout << q.word.canon << " ";
230 }
192 } 231 }
193 } 232 }
194 } 233 }
@@ -238,37 +277,47 @@ std::vector<std::string> kgramstats::randomSentence(int n)
238 int max = distribution.rbegin()->first; 277 int max = distribution.rbegin()->first;
239 int r = rand() % max; 278 int r = rand() % max;
240 token_data& next = distribution.upper_bound(r)->second; 279 token_data& next = distribution.upper_bound(r)->second;
241 std::string nextToken(next.word.canon); 280 std::string nextToken;
281 bool mess = false;
242 282
243 bool mess = (rand() % 100) == 0; 283 if (next.word.type == tokentype_literal)
244 if (mess)
245 { 284 {
246 nextToken = mstats.alternate(nextToken); 285 nextToken = next.word.canon;
247 }
248 286
249 // Determine the casing of the next token. We randomly make the token all 287 mess = (rand() % 100) == 0;
250 // caps based on the markov chain. Otherwise, we check if the previous 288 if (mess)
251 // token is the end of a sentence (terminating token or a wildcard query). 289 {
252 int casing = rand() % next.all; 290 nextToken = mstats.alternate(nextToken);
253 if (casing < next.uppercase) 291 }
254 { 292
255 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 293 // Determine the casing of the next token. We randomly make the token all
256 } else if ((((cur.rbegin()->type == querytype_sentence) 294 // caps based on the markov chain. Otherwise, we check if the previous
257 || ((cur.rbegin()->type == querytype_literal) 295 // token is the end of a sentence (terminating token or a wildcard query).
258 && (cur.rbegin()->word.terminating))) 296 int casing = rand() % next.all;
259 && (rand() % 2 > 0)) 297 if (casing < next.uppercase)
260 || (casing - next.uppercase < next.titlecase)) 298 {
299 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
300 } else if ((((cur.rbegin()->type == querytype_sentence)
301 || ((cur.rbegin()->type == querytype_literal)
302 && (cur.rbegin()->word.terminating)))
303 && (rand() % 2 > 0))
304 || (casing - next.uppercase < next.titlecase))
305 {
306 nextToken[0] = toupper(nextToken[0]);
307 }
308 } else if (next.word.type == tokentype_hashtag)
261 { 309 {
262 nextToken[0] = toupper(nextToken[0]); 310 int rhash = rand() % hashtags.size();
311 nextToken = hashtags[rhash];
263 } 312 }
264 313
265 if (next.word.terminating) 314 if (next.word.terminating)
266 { 315 {
267 std::map<int, termstats>& ending = endings[next.word]; 316 std::map<int, termstats>& ending = endings[next.word];
268 int emax = ending.rbegin()->first; 317 int emax = ending.rbegin()->first;
269 int er = rand() % emax; 318 int er = rand() % emax;
270 termstats& nextend = ending.upper_bound(er)->second; 319 termstats& nextend = ending.upper_bound(er)->second;
271 320
272 nextToken.append(std::string(nextend.occurrences, nextend.terminator)); 321 nextToken.append(std::string(nextend.occurrences, nextend.terminator));
273 } 322 }
274 323
diff --git a/kgramstats.h b/kgramstats.h index ca61df7..ff2fc66 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -7,19 +7,34 @@
7#ifndef KGRAMSTATS_H 7#ifndef KGRAMSTATS_H
8#define KGRAMSTATS_H 8#define KGRAMSTATS_H
9 9
// The kinds of token stored in the model. The enumerator order is significant:
// token::operator< compares the type values directly.
enum tokentype {
  tokentype_literal,
  tokentype_hashtag
};

// A single token: its kind, its canonical text (left empty by the
// tokentype constructor), and whether it ends a clause.
struct token {
  tokentype type;
  std::string canon;
  bool terminating;

  // Literal token with the given canonical text; not terminating.
  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}

  // Token of the given kind with an empty canonical text; not terminating.
  token(tokentype type) : type(type), canon(""), terminating(false) {}

  // Ordering used when tokens are map keys:
  //   1. by type;
  //   2. for literals only, by canonical text;
  //   3. finally, a non-terminating token sorts before a terminating one.
  // The canonical text is ignored for non-literal tokens.
  bool operator<(const token& other) const
  {
    if (type != other.type)
    {
      return type < other.type;
    }

    if ((type == tokentype_literal) && (canon != other.canon))
    {
      return canon < other.canon;
    }

    return !terminating && other.terminating;
  }
};
@@ -94,6 +109,7 @@ private:
94 std::map<kgram, std::map<int, token_data> > stats; 109 std::map<kgram, std::map<int, token_data> > stats;
95 malaprop mstats; 110 malaprop mstats;
96 std::map<token, std::map<int, termstats> > endings; 111 std::map<token, std::map<int, termstats> > endings;
112 std::vector<std::string> hashtags;
97}; 113};
98 114
99void printKgram(kgram k); 115void printKgram(kgram k);