about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-06 18:58:02 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-06 18:58:02 -0500
commit: ba73e189ec5dcd41f6c85e9d52eacae4da0c949e (patch)
tree: 3fea9a8a563e2dd178ee730058f3858b3a69cf7a /kgramstats.cpp
parent: 814858d7a30fbbe8aa16e16c2297bca47497e754 (diff)
download: rawr-ebooks-ba73e189ec5dcd41f6c85e9d52eacae4da0c949e.tar.gz
rawr-ebooks-ba73e189ec5dcd41f6c85e9d52eacae4da0c949e.tar.bz2
rawr-ebooks-ba73e189ec5dcd41f6c85e9d52eacae4da0c949e.zip
Changed how kgram cutting works
Previously, cutting occurred randomly. Now a token is cut from the front of the search kgram whenever the previously generated token was guaranteed by its search kgram (that is, it was the only token that could follow that specific query).
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- kgramstats.cpp 26
1 file changed, 9 insertions, 17 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index e6048d9..da8c326 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -423,7 +423,7 @@ std::string kgramstats::randomSentence(int n)
423{ 423{
424 std::string result; 424 std::string result;
425 kgram cur(1, wildcardQuery); 425 kgram cur(1, wildcardQuery);
426 int cuts = 0; 426 bool cut = false;
427 std::stack<parentype> open_delimiters; 427 std::stack<parentype> open_delimiters;
428 428
429 for (int i=0; i<n; i++) 429 for (int i=0; i<n; i++)
@@ -433,23 +433,10 @@ std::string kgramstats::randomSentence(int n)
433 cur.pop_front(); 433 cur.pop_front();
434 } 434 }
435 435
436 if (cur.size() > 0) 436 if ((cur.size() > 0) && cut)
437 { 437 {
438 if (rand() % (maxK - cur.size() + 1) == 0) 438 cur.pop_front();
439 { 439 cut = false;
440 while (cur.size() > 2)
441 {
442 if ((rand() % (n)) < cuts)
443 {
444 cur.pop_front();
445 cuts--;
446 } else {
447 break;
448 }
449 }
450 }
451
452 cuts++;
453 } 440 }
454 441
455 // Gotta circumvent the last line of the input corpus 442 // Gotta circumvent the last line of the input corpus
@@ -569,6 +556,11 @@ std::string kgramstats::randomSentence(int n)
569 { 556 {
570 break; 557 break;
571 } 558 }
559
560 if (next.all == max)
561 {
562 cut = true;
563 }
572 } 564 }
573 565
574 // Remove the trailing space 566 // Remove the trailing space