diff options
-rw-r--r-- | ebooks.cpp | 2 | ||||
-rw-r--r-- | gen.cpp | 2 | ||||
-rw-r--r-- | kgramstats.cpp | 17 | ||||
-rw-r--r-- | kgramstats.h | 2 |
4 files changed, 18 insertions, 5 deletions
diff --git a/ebooks.cpp b/ebooks.cpp index 7d2724c..b586d63 100644 --- a/ebooks.cpp +++ b/ebooks.cpp | |||
@@ -43,7 +43,7 @@ int main(int argc, char** args) | |||
43 | std::cout << "Generating..." << std::endl; | 43 | std::cout << "Generating..." << std::endl; |
44 | for (;;) | 44 | for (;;) |
45 | { | 45 | { |
46 | std::string doc = stats->randomSentence(rand() % 45 + 5); | 46 | std::string doc = stats->randomSentence(140); |
47 | std::string hi = doc; | 47 | std::string hi = doc; |
48 | hi.resize(140); | 48 | hi.resize(140); |
49 | 49 | ||
diff --git a/gen.cpp b/gen.cpp index a963740..0319283 100644 --- a/gen.cpp +++ b/gen.cpp | |||
@@ -51,7 +51,7 @@ int main(int argc, char** args) | |||
51 | std::cout << "Generating..." << std::endl; | 51 | std::cout << "Generating..." << std::endl; |
52 | for (;;) | 52 | for (;;) |
53 | { | 53 | { |
54 | std::string doc = stats->randomSentence(rand() % 35 + 15); | 54 | std::string doc = stats->randomSentence(140); |
55 | std::string hi = doc; | 55 | std::string hi = doc; |
56 | hi.resize(140); | 56 | hi.resize(140); |
57 | 57 | ||
diff --git a/kgramstats.cpp b/kgramstats.cpp index 933165a..899ad20 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -457,14 +457,14 @@ void printKgram(kgram k) | |||
457 | } | 457 | } |
458 | 458 | ||
459 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 459 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
460 | std::string kgramstats::randomSentence(int n) | 460 | std::string kgramstats::randomSentence(int max) |
461 | { | 461 | { |
462 | std::string result; | 462 | std::string result; |
463 | kgram cur(1, wildcardQuery); | 463 | kgram cur(1, wildcardQuery); |
464 | int cuts = 0; | 464 | int cuts = 0; |
465 | std::stack<parentype> open_delimiters; | 465 | std::stack<parentype> open_delimiters; |
466 | 466 | ||
467 | for (int i=0; i<n; i++) | 467 | for (;;) |
468 | { | 468 | { |
469 | if (cur.size() == maxK) | 469 | if (cur.size() == maxK) |
470 | { | 470 | { |
@@ -611,6 +611,19 @@ std::string kgramstats::randomSentence(int n) | |||
611 | { | 611 | { |
612 | break; | 612 | break; |
613 | } | 613 | } |
614 | |||
615 | // Went over the limit, so reset | ||
616 | if (result.length() > max) | ||
617 | { | ||
618 | result = ""; | ||
619 | cur = kgram(1, wildcardQuery); | ||
620 | cuts = 0; | ||
621 | |||
622 | while (!open_delimiters.empty()) | ||
623 | { | ||
624 | open_delimiters.pop(); | ||
625 | } | ||
626 | } | ||
614 | } | 627 | } |
615 | 628 | ||
616 | // Remove the trailing space | 629 | // Remove the trailing space |
diff --git a/kgramstats.h b/kgramstats.h index 4acde65..a024184 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -97,7 +97,7 @@ class kgramstats | |||
97 | { | 97 | { |
98 | public: | 98 | public: |
99 | kgramstats(std::string corpus, int maxK); | 99 | kgramstats(std::string corpus, int maxK); |
100 | std::string randomSentence(int n); | 100 | std::string randomSentence(int max); |
101 | 101 | ||
102 | private: | 102 | private: |
103 | struct token_data | 103 | struct token_data |