From 01fcbeb60da0bff33d5d9f5b870d444cc418a01d Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Thu, 28 Feb 2019 20:12:45 -0500 Subject: Converted to C++ style randomization The logic in rawr::randomSentence with the cuts might be slightly different now but who even knows what's going on there. --- CMakeLists.txt | 2 +- ebooks.cpp | 16 ++++++++-------- gen.cpp | 11 ++++++----- histogram.cpp | 46 ---------------------------------------------- histogram.h | 53 ++++++++++++++++++++++++++++++++++++++++++++--------- kgramstats.cpp | 43 +++++++++++++++++++++++++------------------ kgramstats.h | 3 ++- 7 files changed, 86 insertions(+), 88 deletions(-) delete mode 100644 histogram.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b73fb0..b35f630 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ include_directories( ${ASPELL_INCLUDE_DIR} ${yaml-cpp_INCLUDE_DIRS}) -add_library(rawr kgramstats.cpp histogram.cpp prefix_search.cpp) +add_library(rawr kgramstats.cpp prefix_search.cpp) set_property(TARGET rawr PROPERTY CXX_STANDARD 11) set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON) target_link_libraries(rawr ${ASPELL_LIBRARIES}) diff --git a/ebooks.cpp b/ebooks.cpp index 0644132..3918b78 100644 --- a/ebooks.cpp +++ b/ebooks.cpp @@ -2,8 +2,6 @@ #include #include #include "kgramstats.h" -#include -#include #include #include #include @@ -11,14 +9,15 @@ #include #include #include +#include const auto QUEUE_TIMEOUT = std::chrono::minutes(1); const auto POLL_TIMEOUT = std::chrono::minutes(5); int main(int argc, char** args) { - srand(time(NULL)); - rand(); rand(); rand(); rand(); + std::random_device randomDevice; + std::mt19937 rng(randomDevice()); YAML::Node config = YAML::LoadFile("config.yml"); int delay = config["delay"].as(); @@ -72,7 +71,8 @@ int main(int argc, char** args) size_t pos = form.find("$name$"); if (pos != std::string::npos) { - form.replace(pos, 6, fv_names[rand() % fv_names.size()]); + int fvInd = std::uniform_int_distribution(0, fv_names.size()-1)(rng); + form.replace(pos, 6, fv_names[fvInd]); } return form; @@ -92,12 +92,12 @@ int main(int argc, char** args) if (currentTime >= genTimer) { - std::string doc = kgramstats.randomSentence(140); + std::string doc = kgramstats.randomSentence(140, rng); doc.resize(140); postQueue.emplace_back(std::move(doc), false, 0); - int genwait = rand() % delay + 1; + int genwait = std::uniform_int_distribution(1, delay)(rng); genTimer = currentTime + std::chrono::seconds(genwait); } @@ -125,7 +125,7 @@ int main(int argc, char** args) && tweet.getAuthor() != client.getUser()) { std::string doc = tweet.generateReplyPrefill(client.getUser()); - doc += kgramstats.randomSentence(140 - doc.length()); + doc += kgramstats.randomSentence(140 - doc.length(), rng); doc.resize(140); postQueue.emplace_back(std::move(doc), true, tweet.getID()); diff --git a/gen.cpp b/gen.cpp index 4e19f84..952e3b5 100644 --- a/gen.cpp +++ b/gen.cpp @@ -2,15 +2,15 @@ #include #include #include "kgramstats.h" -#include #include -#include #include #include +#include int main(int argc, char** args) { - srand(time(NULL)); + std::random_device randomDevice; + std::mt19937 rng(randomDevice()); if (argc == 1) { @@ -73,7 +73,8 @@ int main(int argc, char** args) size_t pos = form.find("$name$"); if (pos != std::string::npos) { - form.replace(pos, 6, fv_names[rand() % fv_names.size()]); + int fvInd = std::uniform_int_distribution(0, fv_names.size()-1)(rng); + form.replace(pos, 6, fv_names[fvInd]); } return form; @@ -82,7 +83,7 @@ int main(int argc, char** args) std::cout << "Generating..." << std::endl; for (;;) { - std::string doc = kgramstats.randomSentence(140); + std::string doc = kgramstats.randomSentence(140, rng); doc.resize(140); std::cout << doc << std::endl; diff --git a/histogram.cpp b/histogram.cpp deleted file mode 100644 index 77c5c3e..0000000 --- a/histogram.cpp +++ /dev/null @@ -1,46 +0,0 @@ -#include "histogram.h" -#include "kgramstats.h" -#include -#include - -template -void histogram::add(const T& inst) -{ - freqtable[inst]++; -} - -template -void histogram::compile() -{ - distribution.clear(); - - int max = 0; - for (auto& it : freqtable) - { - max += it.second; - distribution.emplace(max, it.first); - } - - freqtable.clear(); -} - -template -const T& histogram::next() const -{ - int max = distribution.rbegin()->first; - int r = rand() % max; - - return distribution.upper_bound(r)->second; -} - -template -void histogram::print() const -{ - for (auto& freqpair : freqtable) - { - std::cout << freqpair.first << ": " << freqpair.second << std::endl; - } -} - -template class histogram ; -template class histogram ; diff --git a/histogram.h b/histogram.h index 76d8f1b..c7e051b 100644 --- a/histogram.h +++ b/histogram.h @@ -3,18 +3,53 @@ #include #include +#include +#include template class histogram { - public: - void add(const T& inst); - void compile(); - const T& next() const; - void print() const; - - private: - std::map freqtable; - std::map distribution; +public: + + void add(const T& inst) + { + freqtable_[inst]++; + } + + void compile() + { + distribution_.clear(); + + int max = 0; + for (auto& it : freqtable_) + { + max += it.second; + distribution_.emplace(max, it.first); + } + + freqtable_.clear(); + } + + const T& next(std::mt19937& rng) const + { + int max = distribution_.rbegin()->first; + std::uniform_int_distribution randDist(0, max - 1); + int r = randDist(rng); + + return distribution_.upper_bound(r)->second; + } + + void print() const + { + for (auto& freqpair : freqtable_) + { + std::cout << freqpair.first << ": " << freqpair.second << std::endl; + } + } + +private: + + std::map freqtable_; + std::map distribution_; }; #endif /* end of include guard: HISTOGRAM_H_24094D97 */ diff --git a/kgramstats.cpp b/kgramstats.cpp index b0a83dc..6148dd3 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -590,7 +590,7 @@ void rawr::setMinCorpora(int _arg) } // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus -std::string rawr::randomSentence(int maxL) const +std::string rawr::randomSentence(int maxL, std::mt19937& rng) const { if (!_compiled) { @@ -610,16 +610,13 @@ std::string rawr::randomSentence(int maxL) const cur.pop_front(); } - do + while (cur.size() > 2 && + cuts > 0 && + !std::bernoulli_distribution(1.0 / static_cast(cuts))(rng)) { - if ((cur.size() > 2) && (cuts > 0) && ((rand() % cuts) > 0)) - { cur.pop_front(); cuts--; - } else { - break; - } - } while ((cur.size() > 2) && (cuts > 0) && ((rand() % cuts) > 0)); + } // Gotta circumvent the last line of the input corpus // https://twitter.com/starla4444/status/684222271339237376 @@ -627,7 +624,8 @@ std::string rawr::randomSentence(int maxL) const { // The end of a corpus should probably be treated like a terminator, so // maybe we should just end here. - if ((result.length() > maxL) || (rand() % 4 == 0)) + if (result.length() > maxL || + std::bernoulli_distribution(1.0 / 4.0)(rng)) { break; } @@ -637,10 +635,11 @@ std::string rawr::randomSentence(int maxL) const auto& distribution = _stats.at(cur); int max = distribution.rbegin()->first; - int r = rand() % max; + std::uniform_int_distribution randDist(0, max - 1); + int r = randDist(rng); const token_data& next = distribution.upper_bound(r)->second; const token& interned = _tokenstore.get(next.tok); - std::string nextToken = interned.w.forms.next(); + std::string nextToken = interned.w.forms.next(rng); // Apply user-specified transforms if (_transform) @@ -651,10 +650,16 @@ std::string rawr::randomSentence(int maxL) const // Determine the casing of the next token. We randomly make the token all // caps based on the markov chain. Otherwise, we check if the previous // token is the end of a sentence (terminating token or a wildcard query). - int casing = rand() % next.all; + std::uniform_int_distribution caseDist(0, next.all - 1); + int casing = caseDist(rng); + if (casing < next.uppercase) { - std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); + std::transform( + std::begin(nextToken), + std::end(nextToken), + std::begin(nextToken), + ::toupper); } else { bool capitalize = false; @@ -663,7 +668,7 @@ std::string rawr::randomSentence(int maxL) const capitalize = true; } else if (cur.rbegin()->type == querytype::sentence) { - if (rand() % 2 > 0) + if (std::bernoulli_distribution(1.0 / 2.0)(rng)) { capitalize = true; } @@ -671,7 +676,7 @@ std::string rawr::randomSentence(int maxL) const const token& lastTok = _tokenstore.get(cur.rbegin()->tok); if (lastTok.suffix == suffixtype::terminating && - rand() % 2 > 0) + std::bernoulli_distribution(1.0 / 2.0)(rng)) { capitalize = true; } @@ -753,7 +758,7 @@ std::string rawr::randomSentence(int maxL) const // Terminators if (interned.suffix == suffixtype::terminating) { - auto term = interned.w.terms.next(); + auto term = interned.w.terms.next(rng); nextToken.append(term.form); if (term.newline) @@ -794,7 +799,9 @@ std::string rawr::randomSentence(int maxL) const cur.push_back(next.tok); result.append(nextToken); - if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) + if (interned.suffix == suffixtype::terminating && + (result.length() > maxL || + std::bernoulli_distribution(1.0 / 4.0)(rng))) { break; } @@ -803,7 +810,7 @@ std::string rawr::randomSentence(int maxL) const // Ensure that enough corpora are used if (used_corpora.size() < _min_corpora) { - return randomSentence(maxL); + return randomSentence(maxL, rng); } // Remove the trailing space diff --git a/kgramstats.h b/kgramstats.h index 49fe04e..848af24 100644 --- a/kgramstats.h +++ b/kgramstats.h @@ -9,6 +9,7 @@ #include "identifier.h" #include #include +#include class rawr { public: @@ -19,7 +20,7 @@ class rawr { void setTransformCallback(transform_callback _arg); void setMinCorpora(int _arg); - std::string randomSentence(int maxL) const; + std::string randomSentence(int maxL, std::mt19937& rng) const; private: struct terminator { -- cgit 1.4.1