about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-03-08 14:37:16 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-03-08 14:37:16 -0500
commit 73821856c9648d030f4d148d2bc50f07f43ad369 (patch)
tree 98f554e8cd55c859b71d5b2de5b9762baae6d563
parent a791091a4da2335ee45f3716cfe68466e5ebd679 (diff)
download rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.tar.gz
rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.tar.bz2
rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.zip
Full sentences mode!
-rw-r--r-- ebooks.cpp 2
-rw-r--r-- gen.cpp 2
-rw-r--r-- kgramstats.cpp 17
-rw-r--r-- kgramstats.h 2
4 files changed, 18 insertions, 5 deletions
diff --git a/ebooks.cpp b/ebooks.cpp index 7d2724c..b586d63 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -43,7 +43,7 @@ int main(int argc, char** args)
43 std::cout << "Generating..." << std::endl; 43 std::cout << "Generating..." << std::endl;
44 for (;;) 44 for (;;)
45 { 45 {
46 std::string doc = stats->randomSentence(rand() % 45 + 5); 46 std::string doc = stats->randomSentence(140);
47 std::string hi = doc; 47 std::string hi = doc;
48 hi.resize(140); 48 hi.resize(140);
49 49
diff --git a/gen.cpp b/gen.cpp index a963740..0319283 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -51,7 +51,7 @@ int main(int argc, char** args)
51 std::cout << "Generating..." << std::endl; 51 std::cout << "Generating..." << std::endl;
52 for (;;) 52 for (;;)
53 { 53 {
54 std::string doc = stats->randomSentence(rand() % 35 + 15); 54 std::string doc = stats->randomSentence(140);
55 std::string hi = doc; 55 std::string hi = doc;
56 hi.resize(140); 56 hi.resize(140);
57 57
diff --git a/kgramstats.cpp b/kgramstats.cpp index 933165a..899ad20 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -457,14 +457,14 @@ void printKgram(kgram k)
457} 457}
458 458
459// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus 459// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
460std::string kgramstats::randomSentence(int n) 460std::string kgramstats::randomSentence(int max)
461{ 461{
462 std::string result; 462 std::string result;
463 kgram cur(1, wildcardQuery); 463 kgram cur(1, wildcardQuery);
464 int cuts = 0; 464 int cuts = 0;
465 std::stack<parentype> open_delimiters; 465 std::stack<parentype> open_delimiters;
466 466
467 for (int i=0; i<n; i++) 467 for (;;)
468 { 468 {
469 if (cur.size() == maxK) 469 if (cur.size() == maxK)
470 { 470 {
@@ -611,6 +611,19 @@ std::string kgramstats::randomSentence(int n)
611 { 611 {
612 break; 612 break;
613 } 613 }
614
615 // Went over the limit, so reset
616 if (result.length() > max)
617 {
618 result = "";
619 cur = kgram(1, wildcardQuery);
620 cuts = 0;
621
622 while (!open_delimiters.empty())
623 {
624 open_delimiters.pop();
625 }
626 }
614 } 627 }
615 628
616 // Remove the trailing space 629 // Remove the trailing space
diff --git a/kgramstats.h b/kgramstats.h index 4acde65..a024184 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -97,7 +97,7 @@ class kgramstats
97{ 97{
98public: 98public:
99 kgramstats(std::string corpus, int maxK); 99 kgramstats(std::string corpus, int maxK);
100 std::string randomSentence(int n); 100 std::string randomSentence(int max);
101 101
102private: 102private:
103 struct token_data 103 struct token_data