diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2019-02-28 20:12:45 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2019-02-28 20:12:45 -0500 |
commit | 01fcbeb60da0bff33d5d9f5b870d444cc418a01d (patch) | |
tree | e506e532c02d08505d4328df37f4fac8c816e89f | |
parent | 1890eb5d4a496aea5e9114550081ca63bd280f3b (diff) | |
download | rawr-ebooks-01fcbeb60da0bff33d5d9f5b870d444cc418a01d.tar.gz rawr-ebooks-01fcbeb60da0bff33d5d9f5b870d444cc418a01d.tar.bz2 rawr-ebooks-01fcbeb60da0bff33d5d9f5b870d444cc418a01d.zip |
Converted to C++-style randomization
The cut-handling logic in rawr::randomSentence might behave slightly differently now, but who even knows what's going on there.
-rw-r--r-- | CMakeLists.txt | 2 | ||||
-rw-r--r-- | ebooks.cpp | 16 | ||||
-rw-r--r-- | gen.cpp | 11 | ||||
-rw-r--r-- | histogram.cpp | 46 | ||||
-rw-r--r-- | histogram.h | 53 | ||||
-rw-r--r-- | kgramstats.cpp | 43 | ||||
-rw-r--r-- | kgramstats.h | 3 |
7 files changed, 86 insertions, 88 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b73fb0..b35f630 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt | |||
@@ -15,7 +15,7 @@ include_directories( | |||
15 | ${ASPELL_INCLUDE_DIR} | 15 | ${ASPELL_INCLUDE_DIR} |
16 | ${yaml-cpp_INCLUDE_DIRS}) | 16 | ${yaml-cpp_INCLUDE_DIRS}) |
17 | 17 | ||
18 | add_library(rawr kgramstats.cpp histogram.cpp prefix_search.cpp) | 18 | add_library(rawr kgramstats.cpp prefix_search.cpp) |
19 | set_property(TARGET rawr PROPERTY CXX_STANDARD 11) | 19 | set_property(TARGET rawr PROPERTY CXX_STANDARD 11) |
20 | set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON) | 20 | set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON) |
21 | target_link_libraries(rawr ${ASPELL_LIBRARIES}) | 21 | target_link_libraries(rawr ${ASPELL_LIBRARIES}) |
diff --git a/ebooks.cpp b/ebooks.cpp index 0644132..3918b78 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
@@ -2,8 +2,6 @@ | |||
2 | #include <list> | 2 | #include <list> |
3 | #include <map> | 3 | #include <map> |
4 | #include "kgramstats.h" | 4 | #include "kgramstats.h" |
5 | #include <ctime> | ||
6 | #include <cstdlib> | ||
7 | #include <fstream> | 5 | #include <fstream> |
8 | #include <iostream> | 6 | #include <iostream> |
9 | #include <twitter.h> | 7 | #include <twitter.h> |
@@ -11,14 +9,15 @@ | |||
11 | #include <thread> | 9 | #include <thread> |
12 | #include <chrono> | 10 | #include <chrono> |
13 | #include <algorithm> | 11 | #include <algorithm> |
12 | #include <random> | ||
14 | 13 | ||
15 | const auto QUEUE_TIMEOUT = std::chrono::minutes(1); | 14 | const auto QUEUE_TIMEOUT = std::chrono::minutes(1); |
16 | const auto POLL_TIMEOUT = std::chrono::minutes(5); | 15 | const auto POLL_TIMEOUT = std::chrono::minutes(5); |
17 | 16 | ||
18 | int main(int argc, char** args) | 17 | int main(int argc, char** args) |
19 | { | 18 | { |
20 | srand(time(NULL)); | 19 | std::random_device randomDevice; |
21 | rand(); rand(); rand(); rand(); | 20 | std::mt19937 rng(randomDevice()); |
22 | 21 | ||
23 | YAML::Node config = YAML::LoadFile("config.yml"); | 22 | YAML::Node config = YAML::LoadFile("config.yml"); |
24 | int delay = config["delay"].as<int>(); | 23 | int delay = config["delay"].as<int>(); |
@@ -72,7 +71,8 @@ int main(int argc, char** args) | |||
72 | size_t pos = form.find("$name$"); | 71 | size_t pos = form.find("$name$"); |
73 | if (pos != std::string::npos) | 72 | if (pos != std::string::npos) |
74 | { | 73 | { |
75 | form.replace(pos, 6, fv_names[rand() % fv_names.size()]); | 74 | int fvInd = std::uniform_int_distribution<int>(0, fv_names.size()-1)(rng); |
75 | form.replace(pos, 6, fv_names[fvInd]); | ||
76 | } | 76 | } |
77 | 77 | ||
78 | return form; | 78 | return form; |
@@ -92,12 +92,12 @@ int main(int argc, char** args) | |||
92 | 92 | ||
93 | if (currentTime >= genTimer) | 93 | if (currentTime >= genTimer) |
94 | { | 94 | { |
95 | std::string doc = kgramstats.randomSentence(140); | 95 | std::string doc = kgramstats.randomSentence(140, rng); |
96 | doc.resize(140); | 96 | doc.resize(140); |
97 | 97 | ||
98 | postQueue.emplace_back(std::move(doc), false, 0); | 98 | postQueue.emplace_back(std::move(doc), false, 0); |
99 | 99 | ||
100 | int genwait = rand() % delay + 1; | 100 | int genwait = std::uniform_int_distribution<int>(1, delay)(rng); |
101 | 101 | ||
102 | genTimer = currentTime + std::chrono::seconds(genwait); | 102 | genTimer = currentTime + std::chrono::seconds(genwait); |
103 | } | 103 | } |
@@ -125,7 +125,7 @@ int main(int argc, char** args) | |||
125 | && tweet.getAuthor() != client.getUser()) | 125 | && tweet.getAuthor() != client.getUser()) |
126 | { | 126 | { |
127 | std::string doc = tweet.generateReplyPrefill(client.getUser()); | 127 | std::string doc = tweet.generateReplyPrefill(client.getUser()); |
128 | doc += kgramstats.randomSentence(140 - doc.length()); | 128 | doc += kgramstats.randomSentence(140 - doc.length(), rng); |
129 | doc.resize(140); | 129 | doc.resize(140); |
130 | 130 | ||
131 | postQueue.emplace_back(std::move(doc), true, tweet.getID()); | 131 | postQueue.emplace_back(std::move(doc), true, tweet.getID()); |
diff --git a/gen.cpp b/gen.cpp index 4e19f84..952e3b5 100644 --- a/gen.cpp +++ b/gen.cpp | |||
@@ -2,15 +2,15 @@ | |||
2 | #include <list> | 2 | #include <list> |
3 | #include <map> | 3 | #include <map> |
4 | #include "kgramstats.h" | 4 | #include "kgramstats.h" |
5 | #include <ctime> | ||
6 | #include <vector> | 5 | #include <vector> |
7 | #include <cstdlib> | ||
8 | #include <fstream> | 6 | #include <fstream> |
9 | #include <iostream> | 7 | #include <iostream> |
8 | #include <random> | ||
10 | 9 | ||
11 | int main(int argc, char** args) | 10 | int main(int argc, char** args) |
12 | { | 11 | { |
13 | srand(time(NULL)); | 12 | std::random_device randomDevice; |
13 | std::mt19937 rng(randomDevice()); | ||
14 | 14 | ||
15 | if (argc == 1) | 15 | if (argc == 1) |
16 | { | 16 | { |
@@ -73,7 +73,8 @@ int main(int argc, char** args) | |||
73 | size_t pos = form.find("$name$"); | 73 | size_t pos = form.find("$name$"); |
74 | if (pos != std::string::npos) | 74 | if (pos != std::string::npos) |
75 | { | 75 | { |
76 | form.replace(pos, 6, fv_names[rand() % fv_names.size()]); | 76 | int fvInd = std::uniform_int_distribution<int>(0, fv_names.size()-1)(rng); |
77 | form.replace(pos, 6, fv_names[fvInd]); | ||
77 | } | 78 | } |
78 | 79 | ||
79 | return form; | 80 | return form; |
@@ -82,7 +83,7 @@ int main(int argc, char** args) | |||
82 | std::cout << "Generating..." << std::endl; | 83 | std::cout << "Generating..." << std::endl; |
83 | for (;;) | 84 | for (;;) |
84 | { | 85 | { |
85 | std::string doc = kgramstats.randomSentence(140); | 86 | std::string doc = kgramstats.randomSentence(140, rng); |
86 | doc.resize(140); | 87 | doc.resize(140); |
87 | 88 | ||
88 | std::cout << doc << std::endl; | 89 | std::cout << doc << std::endl; |
diff --git a/histogram.cpp b/histogram.cpp deleted file mode 100644 index 77c5c3e..0000000 --- a/histogram.cpp +++ /dev/null | |||
@@ -1,46 +0,0 @@ | |||
1 | #include "histogram.h" | ||
2 | #include "kgramstats.h" | ||
3 | #include <cstdlib> | ||
4 | #include <iostream> | ||
5 | |||
6 | template <class T> | ||
7 | void histogram<T>::add(const T& inst) | ||
8 | { | ||
9 | freqtable[inst]++; | ||
10 | } | ||
11 | |||
12 | template <class T> | ||
13 | void histogram<T>::compile() | ||
14 | { | ||
15 | distribution.clear(); | ||
16 | |||
17 | int max = 0; | ||
18 | for (auto& it : freqtable) | ||
19 | { | ||
20 | max += it.second; | ||
21 | distribution.emplace(max, it.first); | ||
22 | } | ||
23 | |||
24 | freqtable.clear(); | ||
25 | } | ||
26 | |||
27 | template <class T> | ||
28 | const T& histogram<T>::next() const | ||
29 | { | ||
30 | int max = distribution.rbegin()->first; | ||
31 | int r = rand() % max; | ||
32 | |||
33 | return distribution.upper_bound(r)->second; | ||
34 | } | ||
35 | |||
36 | template <class T> | ||
37 | void histogram<T>::print() const | ||
38 | { | ||
39 | for (auto& freqpair : freqtable) | ||
40 | { | ||
41 | std::cout << freqpair.first << ": " << freqpair.second << std::endl; | ||
42 | } | ||
43 | } | ||
44 | |||
45 | template class histogram <std::string>; | ||
46 | template class histogram <rawr::terminator>; | ||
diff --git a/histogram.h b/histogram.h index 76d8f1b..c7e051b 100644 --- a/histogram.h +++ b/histogram.h | |||
@@ -3,18 +3,53 @@ | |||
3 | 3 | ||
4 | #include <map> | 4 | #include <map> |
5 | #include <string> | 5 | #include <string> |
6 | #include <random> | ||
7 | #include <iostream> | ||
6 | 8 | ||
7 | template <class T> | 9 | template <class T> |
8 | class histogram { | 10 | class histogram { |
9 | public: | 11 | public: |
10 | void add(const T& inst); | 12 | |
11 | void compile(); | 13 | void add(const T& inst) |
12 | const T& next() const; | 14 | { |
13 | void print() const; | 15 | freqtable_[inst]++; |
14 | 16 | } | |
15 | private: | 17 | |
16 | std::map<T, int> freqtable; | 18 | void compile() |
17 | std::map<int, T> distribution; | 19 | { |
20 | distribution_.clear(); | ||
21 | |||
22 | int max = 0; | ||
23 | for (auto& it : freqtable_) | ||
24 | { | ||
25 | max += it.second; | ||
26 | distribution_.emplace(max, it.first); | ||
27 | } | ||
28 | |||
29 | freqtable_.clear(); | ||
30 | } | ||
31 | |||
32 | const T& next(std::mt19937& rng) const | ||
33 | { | ||
34 | int max = distribution_.rbegin()->first; | ||
35 | std::uniform_int_distribution<int> randDist(0, max - 1); | ||
36 | int r = randDist(rng); | ||
37 | |||
38 | return distribution_.upper_bound(r)->second; | ||
39 | } | ||
40 | |||
41 | void print() const | ||
42 | { | ||
43 | for (auto& freqpair : freqtable_) | ||
44 | { | ||
45 | std::cout << freqpair.first << ": " << freqpair.second << std::endl; | ||
46 | } | ||
47 | } | ||
48 | |||
49 | private: | ||
50 | |||
51 | std::map<T, int> freqtable_; | ||
52 | std::map<int, T> distribution_; | ||
18 | }; | 53 | }; |
19 | 54 | ||
20 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ | 55 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ |
diff --git a/kgramstats.cpp b/kgramstats.cpp index b0a83dc..6148dd3 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -590,7 +590,7 @@ void rawr::setMinCorpora(int _arg) | |||
590 | } | 590 | } |
591 | 591 | ||
592 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 592 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
593 | std::string rawr::randomSentence(int maxL) const | 593 | std::string rawr::randomSentence(int maxL, std::mt19937& rng) const |
594 | { | 594 | { |
595 | if (!_compiled) | 595 | if (!_compiled) |
596 | { | 596 | { |
@@ -610,16 +610,13 @@ std::string rawr::randomSentence(int maxL) const | |||
610 | cur.pop_front(); | 610 | cur.pop_front(); |
611 | } | 611 | } |
612 | 612 | ||
613 | do | 613 | while (cur.size() > 2 && |
614 | cuts > 0 && | ||
615 | !std::bernoulli_distribution(1.0 / static_cast<double>(cuts))(rng)) | ||
614 | { | 616 | { |
615 | if ((cur.size() > 2) && (cuts > 0) && ((rand() % cuts) > 0)) | ||
616 | { | ||
617 | cur.pop_front(); | 617 | cur.pop_front(); |
618 | cuts--; | 618 | cuts--; |
619 | } else { | 619 | } |
620 | break; | ||
621 | } | ||
622 | } while ((cur.size() > 2) && (cuts > 0) && ((rand() % cuts) > 0)); | ||
623 | 620 | ||
624 | // Gotta circumvent the last line of the input corpus | 621 | // Gotta circumvent the last line of the input corpus |
625 | // https://twitter.com/starla4444/status/684222271339237376 | 622 | // https://twitter.com/starla4444/status/684222271339237376 |
@@ -627,7 +624,8 @@ std::string rawr::randomSentence(int maxL) const | |||
627 | { | 624 | { |
628 | // The end of a corpus should probably be treated like a terminator, so | 625 | // The end of a corpus should probably be treated like a terminator, so |
629 | // maybe we should just end here. | 626 | // maybe we should just end here. |
630 | if ((result.length() > maxL) || (rand() % 4 == 0)) | 627 | if (result.length() > maxL || |
628 | std::bernoulli_distribution(1.0 / 4.0)(rng)) | ||
631 | { | 629 | { |
632 | break; | 630 | break; |
633 | } | 631 | } |
@@ -637,10 +635,11 @@ std::string rawr::randomSentence(int maxL) const | |||
637 | 635 | ||
638 | auto& distribution = _stats.at(cur); | 636 | auto& distribution = _stats.at(cur); |
639 | int max = distribution.rbegin()->first; | 637 | int max = distribution.rbegin()->first; |
640 | int r = rand() % max; | 638 | std::uniform_int_distribution<int> randDist(0, max - 1); |
639 | int r = randDist(rng); | ||
641 | const token_data& next = distribution.upper_bound(r)->second; | 640 | const token_data& next = distribution.upper_bound(r)->second; |
642 | const token& interned = _tokenstore.get(next.tok); | 641 | const token& interned = _tokenstore.get(next.tok); |
643 | std::string nextToken = interned.w.forms.next(); | 642 | std::string nextToken = interned.w.forms.next(rng); |
644 | 643 | ||
645 | // Apply user-specified transforms | 644 | // Apply user-specified transforms |
646 | if (_transform) | 645 | if (_transform) |
@@ -651,10 +650,16 @@ std::string rawr::randomSentence(int maxL) const | |||
651 | // Determine the casing of the next token. We randomly make the token all | 650 | // Determine the casing of the next token. We randomly make the token all |
652 | // caps based on the markov chain. Otherwise, we check if the previous | 651 | // caps based on the markov chain. Otherwise, we check if the previous |
653 | // token is the end of a sentence (terminating token or a wildcard query). | 652 | // token is the end of a sentence (terminating token or a wildcard query). |
654 | int casing = rand() % next.all; | 653 | std::uniform_int_distribution<int> caseDist(0, next.all - 1); |
654 | int casing = caseDist(rng); | ||
655 | |||
655 | if (casing < next.uppercase) | 656 | if (casing < next.uppercase) |
656 | { | 657 | { |
657 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 658 | std::transform( |
659 | std::begin(nextToken), | ||
660 | std::end(nextToken), | ||
661 | std::begin(nextToken), | ||
662 | ::toupper); | ||
658 | } else { | 663 | } else { |
659 | bool capitalize = false; | 664 | bool capitalize = false; |
660 | 665 | ||
@@ -663,7 +668,7 @@ std::string rawr::randomSentence(int maxL) const | |||
663 | capitalize = true; | 668 | capitalize = true; |
664 | } else if (cur.rbegin()->type == querytype::sentence) | 669 | } else if (cur.rbegin()->type == querytype::sentence) |
665 | { | 670 | { |
666 | if (rand() % 2 > 0) | 671 | if (std::bernoulli_distribution(1.0 / 2.0)(rng)) |
667 | { | 672 | { |
668 | capitalize = true; | 673 | capitalize = true; |
669 | } | 674 | } |
@@ -671,7 +676,7 @@ std::string rawr::randomSentence(int maxL) const | |||
671 | const token& lastTok = _tokenstore.get(cur.rbegin()->tok); | 676 | const token& lastTok = _tokenstore.get(cur.rbegin()->tok); |
672 | 677 | ||
673 | if (lastTok.suffix == suffixtype::terminating && | 678 | if (lastTok.suffix == suffixtype::terminating && |
674 | rand() % 2 > 0) | 679 | std::bernoulli_distribution(1.0 / 2.0)(rng)) |
675 | { | 680 | { |
676 | capitalize = true; | 681 | capitalize = true; |
677 | } | 682 | } |
@@ -753,7 +758,7 @@ std::string rawr::randomSentence(int maxL) const | |||
753 | // Terminators | 758 | // Terminators |
754 | if (interned.suffix == suffixtype::terminating) | 759 | if (interned.suffix == suffixtype::terminating) |
755 | { | 760 | { |
756 | auto term = interned.w.terms.next(); | 761 | auto term = interned.w.terms.next(rng); |
757 | nextToken.append(term.form); | 762 | nextToken.append(term.form); |
758 | 763 | ||
759 | if (term.newline) | 764 | if (term.newline) |
@@ -794,7 +799,9 @@ std::string rawr::randomSentence(int maxL) const | |||
794 | cur.push_back(next.tok); | 799 | cur.push_back(next.tok); |
795 | result.append(nextToken); | 800 | result.append(nextToken); |
796 | 801 | ||
797 | if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) | 802 | if (interned.suffix == suffixtype::terminating && |
803 | (result.length() > maxL || | ||
804 | std::bernoulli_distribution(1.0 / 4.0)(rng))) | ||
798 | { | 805 | { |
799 | break; | 806 | break; |
800 | } | 807 | } |
@@ -803,7 +810,7 @@ std::string rawr::randomSentence(int maxL) const | |||
803 | // Ensure that enough corpora are used | 810 | // Ensure that enough corpora are used |
804 | if (used_corpora.size() < _min_corpora) | 811 | if (used_corpora.size() < _min_corpora) |
805 | { | 812 | { |
806 | return randomSentence(maxL); | 813 | return randomSentence(maxL, rng); |
807 | } | 814 | } |
808 | 815 | ||
809 | // Remove the trailing space | 816 | // Remove the trailing space |
diff --git a/kgramstats.h b/kgramstats.h index 49fe04e..848af24 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -9,6 +9,7 @@ | |||
9 | #include "identifier.h" | 9 | #include "identifier.h" |
10 | #include <functional> | 10 | #include <functional> |
11 | #include <set> | 11 | #include <set> |
12 | #include <random> | ||
12 | 13 | ||
13 | class rawr { | 14 | class rawr { |
14 | public: | 15 | public: |
@@ -19,7 +20,7 @@ class rawr { | |||
19 | 20 | ||
20 | void setTransformCallback(transform_callback _arg); | 21 | void setTransformCallback(transform_callback _arg); |
21 | void setMinCorpora(int _arg); | 22 | void setMinCorpora(int _arg); |
22 | std::string randomSentence(int maxL) const; | 23 | std::string randomSentence(int maxL, std::mt19937& rng) const; |
23 | 24 | ||
24 | private: | 25 | private: |
25 | struct terminator { | 26 | struct terminator { |