Full sentences mode!

author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-03-08 14:37:16 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-03-08 14:37:16 -0500
commit: 73821856c9648d030f4d148d2bc50f07f43ad369 (patch)
tree: 98f554e8cd55c859b71d5b2de5b9762baae6d563
parent: a791091a4da2335ee45f3716cfe68466e5ebd679 (diff)
download: rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.tar.gz
rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.tar.bz2
rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.zip
4 files changed, 18 insertions, 5 deletions
diff --git a/ebooks.cpp b/ebooks.cpp
index 7d2724c..b586d63 100644
--- a/ebooks.cpp
+++ b/ebooks.cpp

@@ -43,7 +43,7 @@ int main(int argc, char** args)
  std::cout << "Generating..." << std::endl;
  for (;;)
  {
-    std::string doc = stats->randomSentence(rand() % 45 + 5);
+    std::string doc = stats->randomSentence(140);
    std::string hi = doc;
    hi.resize(140);
    
diff --git a/gen.cpp b/gen.cpp
index a963740..0319283 100644
--- a/gen.cpp
+++ b/gen.cpp

@@ -51,7 +51,7 @@ int main(int argc, char** args)
  std::cout << "Generating..." << std::endl;
  for (;;)
  {
-    std::string doc = stats->randomSentence(rand() % 35 + 15);
+    std::string doc = stats->randomSentence(140);
    std::string hi = doc;
    hi.resize(140);
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 933165a..899ad20 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -457,14 +457,14 @@ void printKgram(kgram k)
 }
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-std::string kgramstats::randomSentence(int n)
+std::string kgramstats::randomSentence(int max)
 {
  std::string result;
  kgram cur(1, wildcardQuery);
  int cuts = 0;
  std::stack<parentype> open_delimiters;
        
-  for (int i=0; i<n; i++)
+  for (;;)
  {
    if (cur.size() == maxK)
    {
@@ -611,6 +611,19 @@ std::string kgramstats::randomSentence(int n)
    {
      break;
    }
+    
+    // Went over the limit, so reset
+    if (result.length() > max)
+    {
+      result = "";
+      cur = kgram(1, wildcardQuery);
+      cuts = 0;
+      
+      while (!open_delimiters.empty())
+      {
+        open_delimiters.pop();
+      }
+    }
  }
  
  // Remove the trailing space
diff --git a/kgramstats.h b/kgramstats.h
index 4acde65..a024184 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -97,7 +97,7 @@ class kgramstats
 {
 public:
        kgramstats(std::string corpus, int maxK);
-        std::string randomSentence(int n);
+        std::string randomSentence(int max);
        
 private:
        struct token_data
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-03-08 14:37:16 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-03-08 14:37:16 -0500
commit	73821856c9648d030f4d148d2bc50f07f43ad369 (patch)
tree	98f554e8cd55c859b71d5b2de5b9762baae6d563
parent	a791091a4da2335ee45f3716cfe68466e5ebd679 (diff)
download	rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.tar.gz rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.tar.bz2 rawr-ebooks-73821856c9648d030f4d148d2bc50f07f43ad369.zip