From ba73e189ec5dcd41f6c85e9d52eacae4da0c949e Mon Sep 17 00:00:00 2001
From: Kelly Rauchenberger
Date: Sat, 6 Feb 2016 18:58:02 -0500
Subject: Changed how kgram cutting works

Whereas cutting occurred randomly before, now a token will be cut from
the search kgram whenever the previously generated token was guaranteed
by its search kgram (that is, it was the only token that could follow
that specific query).
---
 kgramstats.cpp | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

(limited to 'kgramstats.cpp')

diff --git a/kgramstats.cpp b/kgramstats.cpp
index e6048d9..da8c326 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -423,7 +423,7 @@ std::string kgramstats::randomSentence(int n)
 {
   std::string result;
   kgram cur(1, wildcardQuery);
-  int cuts = 0;
+  bool cut = false;
   std::stack open_delimiters;
   
   for (int i=0; i<n; i++)
[... second hunk header and its leading context lines were lost in extraction ...]
-    if (cur.size() > 0)
+    if ((cur.size() > 0) && cut)
     {
-      if (rand() % (maxK - cur.size() + 1) == 0)
-      {
-        while (cur.size() > 2)
-        {
-          if ((rand() % (n)) < cuts)
-          {
-            cur.pop_front();
-            cuts--;
-          } else {
-            break;
-          }
-        }
-      }
-      
-      cuts++;
+      cur.pop_front();
+      cut = false;
     }
     
     // Gotta circumvent the last line of the input corpus
@@ -569,6 +556,11 @@ std::string kgramstats::randomSentence(int n)
     {
       break;
     }
+    
+    if (next.all == max)
+    {
+      cut = true;
+    }
   }
   
   // Remove the trailing space
-- 
cgit 1.4.1