From 1d15f748200f093d869c6fcc38d6053903ff5062 Mon Sep 17 00:00:00 2001
From: Kelly Rauchenberger
Date: Mon, 25 Jan 2016 09:13:14 -0500
Subject: hashtags are now randomized

---
 kgramstats.cpp | 115 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 82 insertions(+), 33 deletions(-)

diff --git a/kgramstats.cpp b/kgramstats.cpp
index 258e92a..4bb7f15 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -43,6 +43,30 @@ query wildcardQuery(querytype_sentence);
 
 std::string canonize(std::string f);
 
+token token_from_string(std::string in)
+{
+  if (in[0] == '#')
+  {
+    token word(tokentype_hashtag);
+
+    if (in.find_first_of(".?!,") != std::string::npos)
+    {
+      word.terminating = true;
+    }
+
+    return word;
+  } else {
+    token word(canonize(in));
+
+    if (in.find_first_of(".?!,") != std::string::npos)
+    {
+      word.terminating = true;
+    }
+
+    return word;
+  }
+}
+
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
 kgramstats::kgramstats(std::string corpus, int maxK)
@@ -52,6 +76,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   std::vector<std::string> tokens;
   size_t start = 0;
   int end = 0;
+  std::set<std::string> thashtags;
 
   while (end != std::string::npos)
   {
@@ -72,10 +97,20 @@
     {
       mstats.addWord(token);
       tokens.push_back(token);
+
+      if (token[0] == '#')
+      {
+        thashtags.insert(canonize(token));
+      }
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
+
+  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  {
+    hashtags.push_back(*it);
+  }
 
   std::map<kgram, std::map<token, token_data> > tstats;
   std::map<token, std::map<termstats, int> > tendings;
@@ -88,20 +123,13 @@
 
     for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
     {
-      token word(canonize(*it));
-
-      if (it->find_first_of(".?!,") != std::string::npos)
-      {
-        word.terminating = true;
-      }
-
-      prefix.push_back(word);
+      prefix.push_back(token_from_string(*it));
     }
 
     std::string f = tokens[i+k];
     std::string canonical = canonize(f);
 
-    token word(canonical);
+    token word(token_from_string(canonical));
     if (f.find_first_of(".?!,") != std::string::npos)
     {
       word.terminating = true;
@@ -184,11 +212,22 @@ void printKgram(kgram k)
       std::cout << "#.# ";
     } else if (q.type == querytype_literal)
    {
-      if (q.word.terminating)
+      if (q.word.type == tokentype_hashtag)
+      {
+        if (q.word.terminating)
+        {
+          std::cout << "#hashtag. ";
+        } else {
+          std::cout << "#hashtag ";
+        }
+      } else if (q.word.type == tokentype_literal)
       {
-        std::cout << q.word.canon << ". ";
-      } else {
-        std::cout << q.word.canon << " ";
+        if (q.word.terminating)
+        {
+          std::cout << q.word.canon << ". ";
+        } else {
+          std::cout << q.word.canon << " ";
+        }
       }
     }
   }
@@ -238,37 +277,47 @@ std::vector<std::string> kgramstats::randomSentence(int n)
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken(next.word.canon);
+    std::string nextToken;
+    bool mess = false;
 
-    bool mess = (rand() % 100) == 0;
-    if (mess)
+    if (next.word.type == tokentype_literal)
     {
-      nextToken = mstats.alternate(nextToken);
-    }
+      nextToken = next.word.canon;
 
-    // Determine the casing of the next token. We randomly make the token all
-    // caps based on the markov chain. Otherwise, we check if the previous
-    // token is the end of a sentence (terminating token or a wildcard query).
-    int casing = rand() % next.all;
-    if (casing < next.uppercase)
-    {
-      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-    } else if ((((cur.rbegin()->type == querytype_sentence)
-        || ((cur.rbegin()->type == querytype_literal)
-          && (cur.rbegin()->word.terminating)))
-        && (rand() % 2 > 0))
-      || (casing - next.uppercase < next.titlecase))
+      mess = (rand() % 100) == 0;
+      if (mess)
+      {
+        nextToken = mstats.alternate(nextToken);
+      }
+
+      // Determine the casing of the next token. We randomly make the token all
+      // caps based on the markov chain. Otherwise, we check if the previous
+      // token is the end of a sentence (terminating token or a wildcard query).
+      int casing = rand() % next.all;
+      if (casing < next.uppercase)
+      {
+        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+      } else if ((((cur.rbegin()->type == querytype_sentence)
+          || ((cur.rbegin()->type == querytype_literal)
+            && (cur.rbegin()->word.terminating)))
+          && (rand() % 2 > 0))
+        || (casing - next.uppercase < next.titlecase))
+      {
+        nextToken[0] = toupper(nextToken[0]);
+      }
+    } else if (next.word.type == tokentype_hashtag)
     {
-      nextToken[0] = toupper(nextToken[0]);
+      int rhash = rand() % hashtags.size();
+      nextToken = hashtags[rhash];
     }
-    
+
     if (next.word.terminating)
     {
       std::map<int, termstats>& ending = endings[next.word];
       int emax = ending.rbegin()->first;
       int er = rand() % emax;
       termstats& nextend = ending.upper_bound(er)->second;
-    
+
       nextToken.append(std::string(nextend.occurrences, nextend.terminator));
     }
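
For illustration, a minimal self-contained sketch of the scheme this patch
introduces: hashtags seen during tokenization are deduplicated into a
std::set, flattened into a hashtags vector, and any hashtag slot in generated
output is filled with a uniformly random member of that pool. The corpus
tokens and the main() scaffolding below are hypothetical stand-ins, not part
of the patched code; only the set-to-vector pooling and the
rand() % hashtags.size() draw mirror the patch.

#include <cstdlib>
#include <ctime>
#include <iostream>
#include <set>
#include <string>
#include <vector>

int main()
{
  srand(time(NULL));

  // Hypothetical corpus tokens; '#'-prefixed tokens are hashtags.
  std::vector<std::string> corpus;
  corpus.push_back("#birds");
  corpus.push_back("hello");
  corpus.push_back("#squawk");
  corpus.push_back("#birds");

  // Deduplicate hashtags with a std::set, as the patch does with thashtags.
  std::set<std::string> thashtags;
  for (std::vector<std::string>::iterator it = corpus.begin(); it != corpus.end(); it++)
  {
    if ((*it)[0] == '#')
    {
      thashtags.insert(*it);
    }
  }

  // Flatten the set into a vector so a random element can be picked by
  // index, as the patch does when copying thashtags into hashtags.
  std::vector<std::string> hashtags(thashtags.begin(), thashtags.end());

  // At generation time, a hashtag token is replaced by a uniformly random
  // member of the pool.
  if (!hashtags.empty())
  {
    std::cout << hashtags[rand() % hashtags.size()] << std::endl;
  }

  return 0;
}

Note that, as in the patch, the draw is uniform over distinct hashtags, not
weighted by how often each hashtag appeared in the corpus.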