From 5ce05b81520d06a78165c5c5039007c9f29d4b23 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Tue, 31 May 2016 21:39:39 -0400 Subject: Added ability to require a minimum number of corpora in generated output Also fixed a bug with tokenizing multiple corpora. --- kgramstats.cpp | 134 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 47 deletions(-) (limited to 'kgramstats.cpp') diff --git a/kgramstats.cpp b/kgramstats.cpp index e0c2eac..cb63db6 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -56,8 +56,7 @@ void rawr::compile(int maxK) { _maxK = maxK; - std::vector tokens; - size_t start = 0; + std::vector> tokens; std::set thashtags; std::set fv_emoticons; @@ -119,7 +118,9 @@ void rawr::compile(int maxK) std::cout.fill(' '); for (int i = 0; i < _corpora.size(); i++) { + size_t start = 0; int end = 0; + std::vector tkcor; while (end != std::string::npos) { @@ -331,12 +332,14 @@ void rawr::compile(int maxK) } } - tokens.push_back(tk); + tkcor.push_back(tk); } start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); } + tokens.push_back(tkcor); + startper += _corpora[i].length(); } @@ -372,65 +375,82 @@ void rawr::compile(int maxK) // kgram distribution std::cout << "Creating markov chain... 0%" << std::flush; std::map > tstats; - len = (maxK-1) * tokens.size(); + + len = 0; + for (auto c : tokens) + { + len += (maxK-1) * c.size(); + } + + startper = 0; per = 0; perprime = 0; - for (int k=1; ktok.suffix == suffixtype::terminating) - { - kgram term_prefix(prefix); - term_prefix.pop_front(); - term_prefix.push_front(wildcardQuery); - - if (tstats[term_prefix].count(f) == 0) + if (tstats[prefix].count(f) == 0) { - tstats[term_prefix].emplace(f, f); + tstats[prefix].emplace(f, f); } - - token_data& td2 = tstats[term_prefix].at(f); - td2.all++; + + token_data& td = tstats[prefix].at(f); + td.all++; + td.corpora.insert(corpid); if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) { - td2.uppercase++; + td.uppercase++; } else if (isupper(f.raw[0])) { - td2.titlecase++; + td.titlecase++; + } + + if (std::begin(prefix)->tok.suffix == suffixtype::terminating) + { + kgram term_prefix(prefix); + term_prefix.pop_front(); + term_prefix.push_front(wildcardQuery); + + if (tstats[term_prefix].count(f) == 0) + { + tstats[term_prefix].emplace(f, f); + } + + token_data& td2 = tstats[term_prefix].at(f); + td2.all++; + td2.corpora.insert(corpid); + + if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) + { + td2.uppercase++; + } else if (isupper(f.raw[0])) + { + td2.titlecase++; + } } } + + startper += corpus.size(); } + + corpid++; } std::cout << "\b\b\b\b100%" << std::endl; @@ -527,6 +547,11 @@ void rawr::setTransformCallback(transform_callback _arg) _transform = _arg; } +void rawr::setMinCorpora(int _arg) +{ + _min_corpora = _arg; +} + // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus std::string rawr::randomSentence(int maxL) { @@ -539,6 +564,7 @@ std::string rawr::randomSentence(int maxL) kgram cur(1, wildcardQuery); int cuts = 0; std::stack open_delimiters; + std::set used_corpora; for (;;) { @@ -690,9 +716,19 @@ std::string rawr::randomSentence(int maxL) { cuts++; } + + if (next.corpora.size() == 1) + { + used_corpora.insert(*next.corpora.begin()); + } /* DEBUG */ - std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; + std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << " in corp"; + for (auto cor : next.corpora) + { + std::cout << " " << cor; + } + std::cout << std::endl; cur.push_back(next.tok); result.append(nextToken); @@ -703,6 +739,12 @@ std::string rawr::randomSentence(int maxL) } } + // Ensure that enough corpora are used + if (used_corpora.size() < _min_corpora) + { + return randomSentence(maxL); + } + // Remove the trailing space if (result.back() == ' ') { @@ -722,8 +764,6 @@ std::string rawr::randomSentence(int maxL) open_delimiters.pop(); } - - result.resize(maxL); return result; } -- cgit 1.4.1