about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt2
-rw-r--r--ebooks.cpp16
-rw-r--r--gen.cpp11
-rw-r--r--histogram.cpp46
-rw-r--r--histogram.h53
-rw-r--r--kgramstats.cpp43
-rw-r--r--kgramstats.h3
7 files changed, 86 insertions, 88 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b73fb0..b35f630 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -15,7 +15,7 @@ include_directories(
15 ${ASPELL_INCLUDE_DIR} 15 ${ASPELL_INCLUDE_DIR}
16 ${yaml-cpp_INCLUDE_DIRS}) 16 ${yaml-cpp_INCLUDE_DIRS})
17 17
18add_library(rawr kgramstats.cpp histogram.cpp prefix_search.cpp) 18add_library(rawr kgramstats.cpp prefix_search.cpp)
19set_property(TARGET rawr PROPERTY CXX_STANDARD 11) 19set_property(TARGET rawr PROPERTY CXX_STANDARD 11)
20set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON) 20set_property(TARGET rawr PROPERTY CXX_STANDARD_REQUIRED ON)
21target_link_libraries(rawr ${ASPELL_LIBRARIES}) 21target_link_libraries(rawr ${ASPELL_LIBRARIES})
diff --git a/ebooks.cpp b/ebooks.cpp index 0644132..3918b78 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -2,8 +2,6 @@
2#include <list> 2#include <list>
3#include <map> 3#include <map>
4#include "kgramstats.h" 4#include "kgramstats.h"
5#include <ctime>
6#include <cstdlib>
7#include <fstream> 5#include <fstream>
8#include <iostream> 6#include <iostream>
9#include <twitter.h> 7#include <twitter.h>
@@ -11,14 +9,15 @@
11#include <thread> 9#include <thread>
12#include <chrono> 10#include <chrono>
13#include <algorithm> 11#include <algorithm>
12#include <random>
14 13
15const auto QUEUE_TIMEOUT = std::chrono::minutes(1); 14const auto QUEUE_TIMEOUT = std::chrono::minutes(1);
16const auto POLL_TIMEOUT = std::chrono::minutes(5); 15const auto POLL_TIMEOUT = std::chrono::minutes(5);
17 16
18int main(int argc, char** args) 17int main(int argc, char** args)
19{ 18{
20 srand(time(NULL)); 19 std::random_device randomDevice;
21 rand(); rand(); rand(); rand(); 20 std::mt19937 rng(randomDevice());
22 21
23 YAML::Node config = YAML::LoadFile("config.yml"); 22 YAML::Node config = YAML::LoadFile("config.yml");
24 int delay = config["delay"].as<int>(); 23 int delay = config["delay"].as<int>();
@@ -72,7 +71,8 @@ int main(int argc, char** args)
72 size_t pos = form.find("$name$"); 71 size_t pos = form.find("$name$");
73 if (pos != std::string::npos) 72 if (pos != std::string::npos)
74 { 73 {
75 form.replace(pos, 6, fv_names[rand() % fv_names.size()]); 74 int fvInd = std::uniform_int_distribution<int>(0, fv_names.size()-1)(rng);
75 form.replace(pos, 6, fv_names[fvInd]);
76 } 76 }
77 77
78 return form; 78 return form;
@@ -92,12 +92,12 @@ int main(int argc, char** args)
92 92
93 if (currentTime >= genTimer) 93 if (currentTime >= genTimer)
94 { 94 {
95 std::string doc = kgramstats.randomSentence(140); 95 std::string doc = kgramstats.randomSentence(140, rng);
96 doc.resize(140); 96 doc.resize(140);
97 97
98 postQueue.emplace_back(std::move(doc), false, 0); 98 postQueue.emplace_back(std::move(doc), false, 0);
99 99
100 int genwait = rand() % delay + 1; 100 int genwait = std::uniform_int_distribution<int>(1, delay)(rng);
101 101
102 genTimer = currentTime + std::chrono::seconds(genwait); 102 genTimer = currentTime + std::chrono::seconds(genwait);
103 } 103 }
@@ -125,7 +125,7 @@ int main(int argc, char** args)
125 && tweet.getAuthor() != client.getUser()) 125 && tweet.getAuthor() != client.getUser())
126 { 126 {
127 std::string doc = tweet.generateReplyPrefill(client.getUser()); 127 std::string doc = tweet.generateReplyPrefill(client.getUser());
128 doc += kgramstats.randomSentence(140 - doc.length()); 128 doc += kgramstats.randomSentence(140 - doc.length(), rng);
129 doc.resize(140); 129 doc.resize(140);
130 130
131 postQueue.emplace_back(std::move(doc), true, tweet.getID()); 131 postQueue.emplace_back(std::move(doc), true, tweet.getID());
diff --git a/gen.cpp b/gen.cpp index 4e19f84..952e3b5 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -2,15 +2,15 @@
2#include <list> 2#include <list>
3#include <map> 3#include <map>
4#include "kgramstats.h" 4#include "kgramstats.h"
5#include <ctime>
6#include <vector> 5#include <vector>
7#include <cstdlib>
8#include <fstream> 6#include <fstream>
9#include <iostream> 7#include <iostream>
8#include <random>
10 9
11int main(int argc, char** args) 10int main(int argc, char** args)
12{ 11{
13 srand(time(NULL)); 12 std::random_device randomDevice;
13 std::mt19937 rng(randomDevice());
14 14
15 if (argc == 1) 15 if (argc == 1)
16 { 16 {
@@ -73,7 +73,8 @@ int main(int argc, char** args)
73 size_t pos = form.find("$name$"); 73 size_t pos = form.find("$name$");
74 if (pos != std::string::npos) 74 if (pos != std::string::npos)
75 { 75 {
76 form.replace(pos, 6, fv_names[rand() % fv_names.size()]); 76 int fvInd = std::uniform_int_distribution<int>(0, fv_names.size()-1)(rng);
77 form.replace(pos, 6, fv_names[fvInd]);
77 } 78 }
78 79
79 return form; 80 return form;
@@ -82,7 +83,7 @@ int main(int argc, char** args)
82 std::cout << "Generating..." << std::endl; 83 std::cout << "Generating..." << std::endl;
83 for (;;) 84 for (;;)
84 { 85 {
85 std::string doc = kgramstats.randomSentence(140); 86 std::string doc = kgramstats.randomSentence(140, rng);
86 doc.resize(140); 87 doc.resize(140);
87 88
88 std::cout << doc << std::endl; 89 std::cout << doc << std::endl;
diff --git a/histogram.cpp b/histogram.cpp deleted file mode 100644 index 77c5c3e..0000000 --- a/histogram.cpp +++ /dev/null
@@ -1,46 +0,0 @@
1#include "histogram.h"
2#include "kgramstats.h"
3#include <cstdlib>
4#include <iostream>
5
6template <class T>
7void histogram<T>::add(const T& inst)
8{
9 freqtable[inst]++;
10}
11
12template <class T>
13void histogram<T>::compile()
14{
15 distribution.clear();
16
17 int max = 0;
18 for (auto& it : freqtable)
19 {
20 max += it.second;
21 distribution.emplace(max, it.first);
22 }
23
24 freqtable.clear();
25}
26
27template <class T>
28const T& histogram<T>::next() const
29{
30 int max = distribution.rbegin()->first;
31 int r = rand() % max;
32
33 return distribution.upper_bound(r)->second;
34}
35
36template <class T>
37void histogram<T>::print() const
38{
39 for (auto& freqpair : freqtable)
40 {
41 std::cout << freqpair.first << ": " << freqpair.second << std::endl;
42 }
43}
44
45template class histogram <std::string>;
46template class histogram <rawr::terminator>;
diff --git a/histogram.h b/histogram.h index 76d8f1b..c7e051b 100644 --- a/histogram.h +++ b/histogram.h
@@ -3,18 +3,53 @@
3 3
4#include <map> 4#include <map>
5#include <string> 5#include <string>
6#include <random>
7#include <iostream>
6 8
7template <class T> 9template <class T>
8class histogram { 10class histogram {
9 public: 11public:
10 void add(const T& inst); 12
11 void compile(); 13 void add(const T& inst)
12 const T& next() const; 14 {
13 void print() const; 15 freqtable_[inst]++;
14 16 }
15 private: 17
16 std::map<T, int> freqtable; 18 void compile()
17 std::map<int, T> distribution; 19 {
20 distribution_.clear();
21
22 int max = 0;
23 for (auto& it : freqtable_)
24 {
25 max += it.second;
26 distribution_.emplace(max, it.first);
27 }
28
29 freqtable_.clear();
30 }
31
32 const T& next(std::mt19937& rng) const
33 {
34 int max = distribution_.rbegin()->first;
35 std::uniform_int_distribution<int> randDist(0, max - 1);
36 int r = randDist(rng);
37
38 return distribution_.upper_bound(r)->second;
39 }
40
41 void print() const
42 {
43 for (auto& freqpair : freqtable_)
44 {
45 std::cout << freqpair.first << ": " << freqpair.second << std::endl;
46 }
47 }
48
49private:
50
51 std::map<T, int> freqtable_;
52 std::map<int, T> distribution_;
18}; 53};
19 54
20#endif /* end of include guard: HISTOGRAM_H_24094D97 */ 55#endif /* end of include guard: HISTOGRAM_H_24094D97 */
diff --git a/kgramstats.cpp b/kgramstats.cpp index b0a83dc..6148dd3 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -590,7 +590,7 @@ void rawr::setMinCorpora(int _arg)
590} 590}
591 591
592// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 592// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
593std::string rawr::randomSentence(int maxL) const 593std::string rawr::randomSentence(int maxL, std::mt19937& rng) const
594{ 594{
595 if (!_compiled) 595 if (!_compiled)
596 { 596 {
@@ -610,16 +610,13 @@ std::string rawr::randomSentence(int maxL) const
610 cur.pop_front(); 610 cur.pop_front();
611 } 611 }
612 612
613 do 613 while (cur.size() > 2 &&
614 cuts > 0 &&
615 !std::bernoulli_distribution(1.0 / static_cast<double>(cuts))(rng))
614 { 616 {
615 if ((cur.size() > 2) && (cuts > 0) && ((rand() % cuts) > 0))
616 {
617 cur.pop_front(); 617 cur.pop_front();
618 cuts--; 618 cuts--;
619 } else { 619 }
620 break;
621 }
622 } while ((cur.size() > 2) && (cuts > 0) && ((rand() % cuts) > 0));
623 620
624 // Gotta circumvent the last line of the input corpus 621 // Gotta circumvent the last line of the input corpus
625 // https://twitter.com/starla4444/status/684222271339237376 622 // https://twitter.com/starla4444/status/684222271339237376
@@ -627,7 +624,8 @@ std::string rawr::randomSentence(int maxL) const
627 { 624 {
628 // The end of a corpus should probably be treated like a terminator, so 625 // The end of a corpus should probably be treated like a terminator, so
629 // maybe we should just end here. 626 // maybe we should just end here.
630 if ((result.length() > maxL) || (rand() % 4 == 0)) 627 if (result.length() > maxL ||
628 std::bernoulli_distribution(1.0 / 4.0)(rng))
631 { 629 {
632 break; 630 break;
633 } 631 }
@@ -637,10 +635,11 @@ std::string rawr::randomSentence(int maxL) const
637 635
638 auto& distribution = _stats.at(cur); 636 auto& distribution = _stats.at(cur);
639 int max = distribution.rbegin()->first; 637 int max = distribution.rbegin()->first;
640 int r = rand() % max; 638 std::uniform_int_distribution<int> randDist(0, max - 1);
639 int r = randDist(rng);
641 const token_data& next = distribution.upper_bound(r)->second; 640 const token_data& next = distribution.upper_bound(r)->second;
642 const token& interned = _tokenstore.get(next.tok); 641 const token& interned = _tokenstore.get(next.tok);
643 std::string nextToken = interned.w.forms.next(); 642 std::string nextToken = interned.w.forms.next(rng);
644 643
645 // Apply user-specified transforms 644 // Apply user-specified transforms
646 if (_transform) 645 if (_transform)
@@ -651,10 +650,16 @@ std::string rawr::randomSentence(int maxL) const
651 // Determine the casing of the next token. We randomly make the token all 650 // Determine the casing of the next token. We randomly make the token all
652 // caps based on the markov chain. Otherwise, we check if the previous 651 // caps based on the markov chain. Otherwise, we check if the previous
653 // token is the end of a sentence (terminating token or a wildcard query). 652 // token is the end of a sentence (terminating token or a wildcard query).
654 int casing = rand() % next.all; 653 std::uniform_int_distribution<int> caseDist(0, next.all - 1);
654 int casing = caseDist(rng);
655
655 if (casing < next.uppercase) 656 if (casing < next.uppercase)
656 { 657 {
657 std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 658 std::transform(
659 std::begin(nextToken),
660 std::end(nextToken),
661 std::begin(nextToken),
662 ::toupper);
658 } else { 663 } else {
659 bool capitalize = false; 664 bool capitalize = false;
660 665
@@ -663,7 +668,7 @@ std::string rawr::randomSentence(int maxL) const
663 capitalize = true; 668 capitalize = true;
664 } else if (cur.rbegin()->type == querytype::sentence) 669 } else if (cur.rbegin()->type == querytype::sentence)
665 { 670 {
666 if (rand() % 2 > 0) 671 if (std::bernoulli_distribution(1.0 / 2.0)(rng))
667 { 672 {
668 capitalize = true; 673 capitalize = true;
669 } 674 }
@@ -671,7 +676,7 @@ std::string rawr::randomSentence(int maxL) const
671 const token& lastTok = _tokenstore.get(cur.rbegin()->tok); 676 const token& lastTok = _tokenstore.get(cur.rbegin()->tok);
672 677
673 if (lastTok.suffix == suffixtype::terminating && 678 if (lastTok.suffix == suffixtype::terminating &&
674 rand() % 2 > 0) 679 std::bernoulli_distribution(1.0 / 2.0)(rng))
675 { 680 {
676 capitalize = true; 681 capitalize = true;
677 } 682 }
@@ -753,7 +758,7 @@ std::string rawr::randomSentence(int maxL) const
753 // Terminators 758 // Terminators
754 if (interned.suffix == suffixtype::terminating) 759 if (interned.suffix == suffixtype::terminating)
755 { 760 {
756 auto term = interned.w.terms.next(); 761 auto term = interned.w.terms.next(rng);
757 nextToken.append(term.form); 762 nextToken.append(term.form);
758 763
759 if (term.newline) 764 if (term.newline)
@@ -794,7 +799,9 @@ std::string rawr::randomSentence(int maxL) const
794 cur.push_back(next.tok); 799 cur.push_back(next.tok);
795 result.append(nextToken); 800 result.append(nextToken);
796 801
797 if ((interned.suffix == suffixtype::terminating) && ((result.length() > maxL) || (rand() % 4 == 0))) 802 if (interned.suffix == suffixtype::terminating &&
803 (result.length() > maxL ||
804 std::bernoulli_distribution(1.0 / 4.0)(rng)))
798 { 805 {
799 break; 806 break;
800 } 807 }
@@ -803,7 +810,7 @@ std::string rawr::randomSentence(int maxL) const
803 // Ensure that enough corpora are used 810 // Ensure that enough corpora are used
804 if (used_corpora.size() < _min_corpora) 811 if (used_corpora.size() < _min_corpora)
805 { 812 {
806 return randomSentence(maxL); 813 return randomSentence(maxL, rng);
807 } 814 }
808 815
809 // Remove the trailing space 816 // Remove the trailing space
diff --git a/kgramstats.h b/kgramstats.h index 49fe04e..848af24 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -9,6 +9,7 @@
9#include "identifier.h" 9#include "identifier.h"
10#include <functional> 10#include <functional>
11#include <set> 11#include <set>
12#include <random>
12 13
13class rawr { 14class rawr {
14 public: 15 public:
@@ -19,7 +20,7 @@ class rawr {
19 20
20 void setTransformCallback(transform_callback _arg); 21 void setTransformCallback(transform_callback _arg);
21 void setMinCorpora(int _arg); 22 void setMinCorpora(int _arg);
22 std::string randomSentence(int maxL) const; 23 std::string randomSentence(int maxL, std::mt19937& rng) const;
23 24
24 private: 25 private:
25 struct terminator { 26 struct terminator {