diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2019-02-28 20:12:45 -0500 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2019-02-28 20:12:45 -0500 |
| commit | 01fcbeb60da0bff33d5d9f5b870d444cc418a01d (patch) | |
| tree | e506e532c02d08505d4328df37f4fac8c816e89f | |
| parent | 1890eb5d4a496aea5e9114550081ca63bd280f3b (diff) | |
| download | rawr-ebooks-01fcbeb60da0bff33d5d9f5b870d444cc418a01d.tar.gz rawr-ebooks-01fcbeb60da0bff33d5d9f5b870d444cc418a01d.tar.bz2 rawr-ebooks-01fcbeb60da0bff33d5d9f5b870d444cc418a01d.zip | |
Converted to C++-style randomization
The cut-handling logic in rawr::randomSentence might be slightly different now, but who even knows what's going on there.
| -rw-r--r-- | CMakeLists.txt | 2 | ||||
| -rw-r--r-- | ebooks.cpp | 16 | ||||
| -rw-r--r-- | gen.cpp | 11 | ||||
| -rw-r--r-- | histogram.cpp | 46 | ||||
| -rw-r--r-- | histogram.h | 53 | ||||
| -rw-r--r-- | kgramstats.cpp | 43 | ||||
| -rw-r--r-- | kgramstats.h | 3 |
7 files changed, 86 insertions, 88 deletions
| diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b73fb0..b35f630 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt | |||
| @@ -15,7 +15,7 @@ include_directories( | |||
| 15 | ${ASPELL_INCLUDE_DIR} | 15 | ${ASPELL_INCLUDE_DIR} |
| 16 | ${yaml-cpp_INCLUDE_DIRS}) | 16 | ${yaml-cpp_INCLUDE_DIRS}) |
| 17 | 17 | ||
| 18 | add_library(rawr kgramstats.cpp histogram.cpp prefix_search.cpp) | 18 | add_library(rawr kgramstats.cpp prefix_search.cpp) |
| 19 | set_property(TARGET rawr PROPERTY CXX_STANDARD 11) | 19 | set_property(TARGET rawr PROPERTY CXX_STANDARD 11) |
| 20 | set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON) | 20 | set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON) |
| 21 | target_link_libraries(rawr ${ASPELL_LIBRARIES}) | 21 | target_link_libraries(rawr ${ASPELL_LIBRARIES}) |
| diff --git a/ebooks.cpp b/ebooks.cpp index 0644132..3918b78 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
| @@ -2,8 +2,6 @@ | |||
| 2 | #include <list> | 2 | #include <list> |
| 3 | #include <map> | 3 | #include <map> |
| 4 | #include "kgramstats.h" | 4 | #include "kgramstats.h" |
| 5 | #include <ctime> | ||
| 6 | #include <cstdlib> | ||
| 7 | #include <fstream> | 5 | #include <fstream> |
| 8 | #include <iostream> | 6 | #include <iostream> |
| 9 | #include <twitter.h> | 7 | #include <twitter.h> |
| @@ -11,14 +9,15 @@ | |||
| 11 | #include <thread> | 9 | #include <thread> |
| 12 | #include <chrono> | 10 | #include <chrono> |
| 13 | #include <algorithm> | 11 | #include <algorithm> |
| 12 | #include <random> | ||
| 14 | 13 | ||
| 15 | const auto QUEUE_TIMEOUT = std::chrono::minutes(1); | 14 | const auto QUEUE_TIMEOUT = std::chrono::minutes(1); |
| 16 | const auto POLL_TIMEOUT = std::chrono::minutes(5); | 15 | const auto POLL_TIMEOUT = std::chrono::minutes(5); |
| 17 | 16 | ||
| 18 | int main(int argc, char** args) | 17 | int main(int argc, char** args) |
| 19 | { | 18 | { |
| 20 | srand(time(NULL)); | 19 | std::random_device randomDevice; |
| 21 | rand(); rand(); rand(); rand(); | 20 | std::mt19937 rng(randomDevice()); |
| 22 | 21 | ||
| 23 | YAML::Node config = YAML::LoadFile("config.yml"); | 22 | YAML::Node config = YAML::LoadFile("config.yml"); |
| 24 | int delay = config["delay"].as<int>(); | 23 | int delay = config["delay"].as<int>(); |
| @@ -72,7 +71,8 @@ int main(int argc, char** args) | |||
| 72 | size_t pos = form.find("$name$"); | 71 | size_t pos = form.find("$name$"); |
| 73 | if (pos != std::string::npos) | 72 | if (pos != std::string::npos) |
| 74 | { | 73 | { |
| 75 | form.replace(pos, 6, fv_names[rand() % fv_names.size()]); | 74 | int fvInd = std::uniform_int_distribution<int>(0, fv_names.size()-1)(rng); |
| 75 | form.replace(pos, 6, fv_names[fvInd]); | ||
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | return form; | 78 | return form; |
| @@ -92,12 +92,12 @@ int main(int argc, char** args) | |||
| 92 | 92 | ||
| 93 | if (currentTime >= genTimer) | 93 | if (currentTime >= genTimer) |
| 94 | { | 94 | { |
| 95 | std::string doc = kgramstats.randomSentence(140); | 95 | std::string doc = kgramstats.randomSentence(140, rng); |
| 96 | doc.resize(140); | 96 | doc.resize(140); |
| 97 | 97 | ||
| 98 | postQueue.emplace_back(std::move(doc), false, 0); | 98 | postQueue.emplace_back(std::move(doc), false, 0); |
| 99 | 99 | ||
| 100 | int genwait = rand() % delay + 1; | 100 | int genwait = std::uniform_int_distribution<int>(1, delay)(rng); |
| 101 | 101 | ||
| 102 | genTimer = currentTime + std::chrono::seconds(genwait); | 102 | genTimer = currentTime + std::chrono::seconds(genwait); |
| 103 | } | 103 | } |
| @@ -125,7 +125,7 @@ int main(int argc, char** args) | |||
| 125 | && tweet.getAuthor() != client.getUser()) | 125 | && tweet.getAuthor() != client.getUser()) |
| 126 | { | 126 | { |
| 127 | std::string doc = tweet.generateReplyPrefill(client.getUser()); | 127 | std::string doc = tweet.generateReplyPrefill(client.getUser()); |
| 128 | doc += kgramstats.randomSentence(140 - doc.length()); | 128 | doc += kgramstats.randomSentence(140 - doc.length(), rng); |
| 129 | doc.resize(140); | 129 | doc.resize(140); |
| 130 | 130 | ||
| 131 | postQueue.emplace_back(std::move(doc), true, tweet.getID()); | 131 | postQueue.emplace_back(std::move(doc), true, tweet.getID()); |
| diff --git a/gen.cpp b/gen.cpp index 4e19f84..952e3b5 100644 --- a/gen.cpp +++ b/gen.cpp | |||
| @@ -2,15 +2,15 @@ | |||
| 2 | #include <list> | 2 | #include <list> |
| 3 | #include <map> | 3 | #include <map> |
| 4 | #include "kgramstats.h" | 4 | #include "kgramstats.h" |
| 5 | #include <ctime> | ||
| 6 | #include <vector> | 5 | #include <vector> |
| 7 | #include <cstdlib> | ||
| 8 | #include <fstream> | 6 | #include <fstream> |
| 9 | #include <iostream> | 7 | #include <iostream> |
| 8 | #include <random> | ||
| 10 | 9 | ||
| 11 | int main(int argc, char** args) | 10 | int main(int argc, char** args) |
| 12 | { | 11 | { |
| 13 | srand(time(NULL)); | 12 | std::random_device randomDevice; |
| 13 | std::mt19937 rng(randomDevice()); | ||
| 14 | 14 | ||
| 15 | if (argc == 1) | 15 | if (argc == 1) |
| 16 | { | 16 | { |
| @@ -73,7 +73,8 @@ int main(int argc, char** args) | |||
| 73 | size_t pos = form.find("$name$"); | 73 | size_t pos = form.find("$name$"); |
| 74 | if (pos != std::string::npos) | 74 | if (pos != std::string::npos) |
| 75 | { | 75 | { |
| 76 | form.replace(pos, 6, fv_names[rand() % fv_names.size()]); | 76 | int fvInd = std::uniform_int_distribution<int>(0, fv_names.size()-1)(rng); |
| 77 | form.replace(pos, 6, fv_names[fvInd]); | ||
| 77 | } | 78 | } |
| 78 | 79 | ||
| 79 | return form; | 80 | return form; |
| @@ -82,7 +83,7 @@ int main(int argc, char** args) | |||
| 82 | std::cout << "Generating..." << std::endl; | 83 | std::cout << "Generating..." << std::endl; |
| 83 | for (;;) | 84 | for (;;) |
| 84 | { | 85 | { |
| 85 | std::string doc = kgramstats.randomSentence(140); | 86 | std::string doc = kgramstats.randomSentence(140, rng); |
| 86 | doc.resize(140); | 87 | doc.resize(140); |
| 87 | 88 | ||
| 88 | std::cout << doc << std::endl; | 89 | std::cout << doc << std::endl; |
| diff --git a/histogram.cpp b/histogram.cpp deleted file mode 100644 index 77c5c3e..0000000 --- a/histogram.cpp +++ /dev/null | |||
| @@ -1,46 +0,0 @@ | |||
| 1 | #include "histogram.h" | ||
| 2 | #include "kgramstats.h" | ||
| 3 | #include <cstdlib> | ||
| 4 | #include <iostream> | ||
| 5 | |||
| 6 | template <class T> | ||
| 7 | void histogram<T>::add(const T& inst) | ||
| 8 | { | ||
| 9 | freqtable[inst]++; | ||
| 10 | } | ||
| 11 | |||
| 12 | template <class T> | ||
| 13 | void histogram<T>::compile() | ||
| 14 | { | ||
| 15 | distribution.clear(); | ||
| 16 | |||
| 17 | int max = 0; | ||
| 18 | for (auto& it : freqtable) | ||
| 19 | { | ||
| 20 | max += it.second; | ||
| 21 | distribution.emplace(max, it.first); | ||
| 22 | } | ||
| 23 | |||
| 24 | freqtable.clear(); | ||
| 25 | } | ||
| 26 | |||
| 27 | template <class T> | ||
| 28 | const T& histogram<T>::next() const | ||
| 29 | { | ||
| 30 | int max = distribution.rbegin()->first; | ||
| 31 | int r = rand() % max; | ||
| 32 | |||
| 33 | return distribution.upper_bound(r)->second; | ||
| 34 | } | ||
| 35 | |||
| 36 | template <class T> | ||
| 37 | void histogram<T>::print() const | ||
| 38 | { | ||
| 39 | for (auto& freqpair : freqtable) | ||
| 40 | { | ||
| 41 | std::cout << freqpair.first << ": " << freqpair.second << std::endl; | ||
| 42 | } | ||
| 43 | } | ||
| 44 | |||
| 45 | template class histogram <std::string>; | ||
| 46 | template class histogram <rawr::terminator>; | ||
| diff --git a/histogram.h b/histogram.h index 76d8f1b..c7e051b 100644 --- a/histogram.h +++ b/histogram.h | |||
| @@ -3,18 +3,53 @@ | |||
| 3 | 3 | ||
| 4 | #include <map> | 4 | #include <map> |
| 5 | #include <string> | 5 | #include <string> |
| 6 | #include <random> | ||
| 7 | #include <iostream> | ||
| 6 | 8 | ||
| 7 | template <class T> | 9 | template <class T> |
| 8 | class histogram { | 10 | class histogram { |
| 9 | public: | 11 | public: |
| 10 | void add(const T& inst); | 12 | |
| 11 | void compile(); | 13 | void add(const T& inst) |
| 12 | const T& next() const; | 14 | { |
| 13 | void print() const; | 15 | freqtable_[inst]++; |
| 14 | 16 | } | |
| 15 | private: | 17 | |
| 16 | std::map<T, int> freqtable; | 18 | void compile() |
| 17 | std::map<int, T> distribution; | 19 | { |
| 20 | distribution_.clear(); | ||
| 21 | |||
| 22 | int max = 0; | ||
| 23 | for (auto& it : freqtable_) | ||
| 24 | { | ||
| 25 | max += it.second; | ||
| 26 | distribution_.emplace(max, it.first); | ||
| 27 | } | ||
| 28 | |||
| 29 | freqtable_.clear(); | ||
| 30 | } | ||
| 31 | |||
| 32 | const T& next(std::mt19937& rng) const | ||
| 33 | { | ||
| 34 | int max = distribution_.rbegin()->first; | ||
| 35 | std::uniform_int_distribution<int> randDist(0, max - 1); | ||
| 36 | int r = randDist(rng); | ||
| 37 | |||
| 38 | return distribution_.upper_bound(r)->second; | ||
| 39 | } | ||
| 40 | |||
| 41 | void print() const | ||
| 42 | { | ||
| 43 | for (auto& freqpair : freqtable_) | ||
| 44 | { | ||
| 45 | std::cout << freqpair.first << ": " << freqpair.second << std::endl; | ||
| 46 | } | ||
| 47 | } | ||
| 48 | |||
| 49 | private: | ||
| 50 | |||
| 51 | std::map<T, int> freqtable_; | ||
| 52 | std::map<int, T> distribution_; | ||
| 18 | }; | 53 | }; |
| 19 | 54 | ||
| 20 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ | 55 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ |
| diff --git a/kgramstats.cpp b/kgramstats.cpp index b0a83dc..6148dd3 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -590,7 +590,7 @@ void rawr::setMinCorpora(int _arg) | |||
| 590 | } | 590 | } |
| 591 | 591 | ||
| 592 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 592 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
| 593 | std::string rawr::randomSentence(int maxL) const | 593 | std::string rawr::randomSentence(int maxL, std::mt19937& rng) const |
| 594 | { | 594 | { |
| 595 | if (!_compiled) | 595 | if (!_compiled) |
| 596 | { | 596 | { |
| @@ -610,16 +610,13 @@ std::string rawr::randomSentence(int maxL) const | |||
| 610 | cur.pop_front(); | 610 | cur.pop_front(); |
| 611 | } | 611 | } |
| 612 | 612 | ||
| 613 | do | 613 | while (cur.size() > 2 && |
| 614 | cuts > 0 && | ||
| 615 | !std::bernoulli_distribution(1.0 / static_cast<double>(cuts))(rng)) | ||
| 614 | { | 616 | { |
| 615 | if ((cur.size() > 2) && (cuts > 0) && ((rand() % cuts) > 0)) | ||
| 616 | { | ||
| 617 | cur.pop_front(); | 617 | cur.pop_front(); |
| 618 | cuts--; | 618 | cuts--; |
| 619 | } else { | 619 | } |
| 620 | break; | ||
| 621 | } | ||
| 622 | } while ((cur.size() > 2) && (cuts > 0) && ((rand() % cuts) > 0)); | ||
| 623 | 620 | ||
| 624 | // Gotta circumvent the last line of the input corpus | 621 | // Gotta circumvent the last line of the input corpus |
| 625 | // https://twitter.com/starla4444/status/684222271339237376 | 622 | // https://twitter.com/starla4444/status/684222271339237376 |
| @@ -627,7 +624,8 @@ std::string rawr::randomSentence(int maxL) const | |||
| 627 | { | 624 | { |
| 628 | // The end of a corpus should probably be treated like a terminator, so | 625 | // The end of a corpus should probably be treated like a terminator, so |
| 629 | // maybe we should just end here. | 626 | // maybe we should just end here. |
| 630 | if ((result.length() > maxL) || (rand() % 4 == 0)) | 627 | if (result.length() > maxL || |
| 628 | std::bernoulli_distribution(1.0 / 4.0)(rng)) | ||
| 631 | { | 629 | { |
| 632 | break; | 630 | break; |
| 633 | } | 631 | } |
| @@ -637,10 +635,11 @@ std::string rawr::randomSentence(int maxL) const | |||
| 637 | 635 | ||
| 638 | auto& distribution = _stats.at(cur); | 636 | auto& distribution = _stats.at(cur); |
| 639 | int max = distribution.rbegin()->first; | 637 | int max = distribution.rbegin()->first; |
| 640 | int r = rand() % max; | 638 | std::uniform_int_distribution<int> randDist(0, max - 1); |
| 639 | int r = randDist(rng); | ||
| 641 | const token_data& next = distribution.upper_bound(r)->second; | 640 | const token_data& next = distribution.upper_bound(r)->second; |
| 642 | const token& interned = _tokenstore.get(next.tok); | 641 | const token& interned = _tokenstore.get(next.tok); |
| 643 | std::string nextToken = interned.w.forms.next(); | 642 | std::string nextToken = interned.w.forms.next(rng); |
| 644 | 643 | ||
| 645 | // Apply user-specified transforms | 644 | // Apply user-specified transforms |
| 646 | if (_transform) | 645 | if (_transform) |
| @@ -651,10 +650,16 @@ std::string rawr::randomSentence(int maxL) const | |||
| 651 | // Determine the casing of the next token. We randomly make the token all | 650 | // Determine the casing of the next token. We randomly make the token all |
| 652 | // caps based on the markov chain. Otherwise, we check if the previous | 651 | // caps based on the markov chain. Otherwise, we check if the previous |
| 653 | // token is the end of a sentence (terminating token or a wildcard query). | 652 | // token is the end of a sentence (terminating token or a wildcard query). |
| 654 | int casing = rand() % next.all; | 653 | std::uniform_int_distribution<int> caseDist(0, next.all - 1); |
| 654 | int casing = caseDist(rng); | ||
| 655 | |||
| 655 | if (casing < next.uppercase) | 656 | if (casing < next.uppercase) |
| 656 | { | 657 | { |
| 657 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 658 | std::transform( |
| 659 | std::begin(nextToken), | ||
| 660 | std::end(nextToken), | ||
| 661 | std::begin(nextToken), | ||
| 662 | ::toupper); | ||
| 658 | } else { | 663 | } else { |
| 659 | bool capitalize = false; | 664 | bool capitalize = false; |
| 660 | 665 | ||
| @@ -663,7 +668,7 @@ std::string rawr::randomSentence(int maxL) const | |||
| 663 | capitalize = true; | 668 | capitalize = true; |
| 664 | } else if (cur.rbegin()->type == querytype::sentence) | 669 | } else if (cur.rbegin()->type == querytype::sentence) |
| 665 | { | 670 | { |
| 666 | if (rand() % 2 > 0) | 671 | if (std::bernoulli_distribution(1.0 / 2.0)(rng)) |
| 667 | { | 672 | { |
| 668 | capitalize = true; | 673 | capitalize = true; |
| 669 | } | 674 | } |
| @@ -671,7 +676,7 @@ std::string rawr::randomSentence(int maxL) const | |||
| 671 | const token& lastTok = _tokenstore.get(cur.rbegin()->tok); | 676 | const token& lastTok = _tokenstore.get(cur.rbegin()->tok); |
| 672 | 677 | ||
| 673 | if (lastTok.suffix == suffixtype::terminating && | 678 | if (lastTok.suffix == suffixtype::terminating && |
| 674 | rand() % 2 > 0) | 679 | std::bernoulli_distribution(1.0 / 2.0)(rng)) |
| 675 | { | 680 | { |
| 676 | capitalize = true; | 681 | capitalize = true; |
| 677 | } | 682 | } |
| @@ -753,7 +758,7 @@ std::string rawr::randomSentence(int maxL) const | |||
| 753 | // Terminators | 758 | // Terminators |
| 754 | if (interned.suffix == suffixtype::terminating) | 759 | if (interned.suffix == suffixtype::terminating) |
| 755 | { | 760 | { |
| 756 | auto term = interned.w.terms.next(); | 761 | auto term = interned.w.terms.next(rng); |
| 757 | nextToken.append(term.form); | 762 | nextToken.append(term.form); |
| 758 | 763 | ||
| 759 | if (term.newline) | 764 | if (term.newline) |
| @@ -794,7 +799,9 @@ std::string rawr::randomSentence(int maxL) const | |||
| 794 | cur.push_back(next.tok); | 799 | cur.push_back(next.tok); |
| 795 | result.append(nextToken); | 800 | result.append(nextToken); |
| 796 | 801 | ||
| 797 | if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 802 | if (interned.suffix == suffixtype::terminating && |
| 803 | (result.length() > maxL || | ||
| 804 | std::bernoulli_distribution(1.0 / 4.0)(rng))) | ||
| 798 | { | 805 | { |
| 799 | break; | 806 | break; |
| 800 | } | 807 | } |
| @@ -803,7 +810,7 @@ std::string rawr::randomSentence(int maxL) const | |||
| 803 | // Ensure that enough corpora are used | 810 | // Ensure that enough corpora are used |
| 804 | if (used_corpora.size() < _min_corpora) | 811 | if (used_corpora.size() < _min_corpora) |
| 805 | { | 812 | { |
| 806 | return randomSentence(maxL); | 813 | return randomSentence(maxL, rng); |
| 807 | } | 814 | } |
| 808 | 815 | ||
| 809 | // Remove the trailing space | 816 | // Remove the trailing space |
| diff --git a/kgramstats.h b/kgramstats.h index 49fe04e..848af24 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include "identifier.h" | 9 | #include "identifier.h" |
| 10 | #include <functional> | 10 | #include <functional> |
| 11 | #include <set> | 11 | #include <set> |
| 12 | #include <random> | ||
| 12 | 13 | ||
| 13 | class rawr { | 14 | class rawr { |
| 14 | public: | 15 | public: |
| @@ -19,7 +20,7 @@ class rawr { | |||
| 19 | 20 | ||
| 20 | void setTransformCallback(transform_callback _arg); | 21 | void setTransformCallback(transform_callback _arg); |
| 21 | void setMinCorpora(int _arg); | 22 | void setMinCorpora(int _arg); |
| 22 | std::string randomSentence(int maxL) const; | 23 | std::string randomSentence(int maxL, std::mt19937& rng) const; |
| 23 | 24 | ||
| 24 | private: | 25 | private: |
| 25 | struct terminator { | 26 | struct terminator { |
