diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-06 18:58:02 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-06 18:58:02 -0500 |
commit | ba73e189ec5dcd41f6c85e9d52eacae4da0c949e (patch) | |
tree | 3fea9a8a563e2dd178ee730058f3858b3a69cf7a | |
parent | 814858d7a30fbbe8aa16e16c2297bca47497e754 (diff) | |
download | rawr-ebooks-ba73e189ec5dcd41f6c85e9d52eacae4da0c949e.tar.gz rawr-ebooks-ba73e189ec5dcd41f6c85e9d52eacae4da0c949e.tar.bz2 rawr-ebooks-ba73e189ec5dcd41f6c85e9d52eacae4da0c949e.zip |
Changed how kgram cutting works
Previously, tokens were cut from the search kgram at random. Now, a token is cut from the front of the search kgram whenever the previously generated token was guaranteed by its search kgram (that is, it was the only token that could follow that specific query).
-rw-r--r-- | kgramstats.cpp | 26 |
1 file changed, 9 insertions, 17 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index e6048d9..da8c326 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -423,7 +423,7 @@ std::string kgramstats::randomSentence(int n) | |||
423 | { | 423 | { |
424 | std::string result; | 424 | std::string result; |
425 | kgram cur(1, wildcardQuery); | 425 | kgram cur(1, wildcardQuery); |
426 | int cuts = 0; | 426 | bool cut = false; |
427 | std::stack<parentype> open_delimiters; | 427 | std::stack<parentype> open_delimiters; |
428 | 428 | ||
429 | for (int i=0; i<n; i++) | 429 | for (int i=0; i<n; i++) |
@@ -433,23 +433,10 @@ std::string kgramstats::randomSentence(int n) | |||
433 | cur.pop_front(); | 433 | cur.pop_front(); |
434 | } | 434 | } |
435 | 435 | ||
436 | if (cur.size() > 0) | 436 | if ((cur.size() > 0) && cut) |
437 | { | 437 | { |
438 | if (rand() % (maxK - cur.size() + 1) == 0) | 438 | cur.pop_front(); |
439 | { | 439 | cut = false; |
440 | while (cur.size() > 2) | ||
441 | { | ||
442 | if ((rand() % (n)) < cuts) | ||
443 | { | ||
444 | cur.pop_front(); | ||
445 | cuts--; | ||
446 | } else { | ||
447 | break; | ||
448 | } | ||
449 | } | ||
450 | } | ||
451 | |||
452 | cuts++; | ||
453 | } | 440 | } |
454 | 441 | ||
455 | // Gotta circumvent the last line of the input corpus | 442 | // Gotta circumvent the last line of the input corpus |
@@ -569,6 +556,11 @@ std::string kgramstats::randomSentence(int n) | |||
569 | { | 556 | { |
570 | break; | 557 | break; |
571 | } | 558 | } |
559 | |||
560 | if (next.all == max) | ||
561 | { | ||
562 | cut = true; | ||
563 | } | ||
572 | } | 564 | } |
573 | 565 | ||
574 | // Remove the trailing space | 566 | // Remove the trailing space |