diff options
Diffstat (limited to 'sentence.cpp')
| -rw-r--r-- | sentence.cpp | 139 |
1 file changed, 59 insertions, 80 deletions
| diff --git a/sentence.cpp b/sentence.cpp index 5f1b03a..a9b40c2 100644 --- a/sentence.cpp +++ b/sentence.cpp | |||
| @@ -9,6 +9,19 @@ sentence::sentence( | |||
| 9 | database_(database), | 9 | database_(database), |
| 10 | rng_(rng) | 10 | rng_(rng) |
| 11 | { | 11 | { |
| 12 | verbly::filter blacklist; | ||
| 13 | |||
| 14 | for (std::string word : { | ||
| 15 | "raped", "Negro" | ||
| 16 | }) | ||
| 17 | { | ||
| 18 | blacklist |= (verbly::form::text == word); | ||
| 19 | } | ||
| 20 | |||
| 21 | badWords_ = !blacklist; | ||
| 22 | |||
| 23 | // Blacklist ethnic slurs | ||
| 24 | badWords_ &= !(verbly::word::usageDomains %= (verbly::notion::wnid == 106718862)); | ||
| 12 | } | 25 | } |
| 13 | 26 | ||
| 14 | std::string sentence::generate() const | 27 | std::string sentence::generate() const |
| @@ -51,54 +64,18 @@ std::string sentence::generate() const | |||
| 51 | form << secondSyn; | 64 | form << secondSyn; |
| 52 | } | 65 | } |
| 53 | 66 | ||
| 54 | // Attempt to compile the form, restarting if a bad word is generated. | 67 | // Compile the form. |
| 55 | std::set<std::string> badWords = {"raped"}; | 68 | verbly::token tok = verbly::token::capitalize( |
| 69 | verbly::token::casing::title_case, form); | ||
| 56 | 70 | ||
| 57 | verbly::token tok = form; | 71 | while (!tok.isComplete()) |
| 58 | std::list<std::string> words; | ||
| 59 | for (;;) | ||
| 60 | { | 72 | { |
| 61 | // Compile the form. | 73 | visit(tok); |
| 62 | while (!tok.isComplete()) | ||
| 63 | { | ||
| 64 | visit(tok); | ||
| 65 | } | ||
| 66 | |||
| 67 | std::string compiled = tok.compile(); | ||
| 68 | words = verbly::split<std::list<std::string>>(compiled, " "); | ||
| 69 | |||
| 70 | // Ensure that there are no bad words in the output. | ||
| 71 | if (!std::any_of(std::begin(words), std::end(words), [&badWords] (const std::string& word) { | ||
| 72 | std::string canonWord; | ||
| 73 | |||
| 74 | for (char ch : word) | ||
| 75 | { | ||
| 76 | if (std::isalpha(ch)) | ||
| 77 | { | ||
| 78 | canonWord.push_back(std::tolower(ch)); | ||
| 79 | } | ||
| 80 | } | ||
| 81 | |||
| 82 | return (badWords.count(canonWord) == 1); | ||
| 83 | })) { | ||
| 84 | break; | ||
| 85 | } else { | ||
| 86 | std::cout << "Bad word generated." << std::endl; | ||
| 87 | } | ||
| 88 | } | 74 | } |
| 89 | 75 | ||
| 90 | // Put the form into title case. | 76 | std::string compiled = tok.compile(); |
| 91 | for (std::string& word : words) | ||
| 92 | { | ||
| 93 | if ((word[0] == '"') && (word.length() > 1)) | ||
| 94 | { | ||
| 95 | word[1] = std::toupper(word[1]); | ||
| 96 | } else { | ||
| 97 | word[0] = std::toupper(word[0]); | ||
| 98 | } | ||
| 99 | } | ||
| 100 | 77 | ||
| 101 | return verbly::implode(std::begin(words), std::end(words), " "); | 78 | return compiled; |
| 102 | } | 79 | } |
| 103 | 80 | ||
| 104 | bool sentence::chooseSelrestr(std::set<std::string> selrestrs, std::set<std::string> choices) const | 81 | bool sentence::chooseSelrestr(std::set<std::string> selrestrs, std::set<std::string> choices) const |
| @@ -111,7 +88,7 @@ bool sentence::chooseSelrestr(std::set<std::string> selrestrs, std::set<std::str | |||
| 111 | validChoices++; | 88 | validChoices++; |
| 112 | } | 89 | } |
| 113 | } | 90 | } |
| 114 | 91 | ||
| 115 | return std::bernoulli_distribution(static_cast<double>(validChoices)/static_cast<double>(selrestrs.size()))(rng_); | 92 | return std::bernoulli_distribution(static_cast<double>(validChoices)/static_cast<double>(selrestrs.size()))(rng_); |
| 116 | } | 93 | } |
| 117 | 94 | ||
| @@ -131,7 +108,7 @@ verbly::word sentence::generateStandardNoun( | |||
| 131 | //&& (verbly::form::complexity == 1) | 108 | //&& (verbly::form::complexity == 1) |
| 132 | // && (verbly::word::tagCount >= tagdist(rng_)) // Favor more common words | 109 | // && (verbly::word::tagCount >= tagdist(rng_)) // Favor more common words |
| 133 | && (verbly::word::tagCount >= 1) | 110 | && (verbly::word::tagCount >= 1) |
| 134 | && !(verbly::word::usageDomains %= (verbly::notion::wnid == 106718862)); // Blacklist ethnic slurs | 111 | && badWords_; |
| 135 | 112 | ||
| 136 | // Only use selection restrictions for a first attempt. | 113 | // Only use selection restrictions for a first attempt. |
| 137 | if (trySelection) | 114 | if (trySelection) |
| @@ -248,7 +225,7 @@ verbly::word sentence::generateStandardNoun( | |||
| 248 | selection += (verbly::notion::wnid == 103670849); // line | 225 | selection += (verbly::notion::wnid == 103670849); // line |
| 249 | } | 226 | } |
| 250 | } | 227 | } |
| 251 | 228 | ||
| 252 | if (selection.compact().getType() != verbly::filter::type::empty) | 229 | if (selection.compact().getType() != verbly::filter::type::empty) |
| 253 | { | 230 | { |
| 254 | condition &= (verbly::notion::fullHypernyms %= std::move(selection)); | 231 | condition &= (verbly::notion::fullHypernyms %= std::move(selection)); |
| @@ -281,18 +258,7 @@ verbly::token sentence::generateStandardNounPhrase( | |||
| 281 | bool definite) const | 258 | bool definite) const |
| 282 | { | 259 | { |
| 283 | verbly::token utter; | 260 | verbly::token utter; |
| 284 | verbly::word sounder = noun; | 261 | bool indefiniteArticle = false; |
| 285 | verbly::word descript; | ||
| 286 | |||
| 287 | if (std::bernoulli_distribution(1.0/8.0)(rng_)) | ||
| 288 | { | ||
| 289 | std::geometric_distribution<int> tagdist(0.2); | ||
| 290 | descript = database_.words( | ||
| 291 | (verbly::word::tagCount >= tagdist(rng_)) | ||
| 292 | && (verbly::notion::partOfSpeech == verbly::part_of_speech::adjective)).first(); | ||
| 293 | |||
| 294 | sounder = descript; | ||
| 295 | } | ||
| 296 | 262 | ||
| 297 | if ((std::bernoulli_distribution(1.0/3.0)(rng_)) && (definite)) | 263 | if ((std::bernoulli_distribution(1.0/3.0)(rng_)) && (definite)) |
| 298 | { | 264 | { |
| @@ -307,18 +273,18 @@ verbly::token sentence::generateStandardNounPhrase( | |||
| 307 | { | 273 | { |
| 308 | utter << "your"; | 274 | utter << "your"; |
| 309 | } else if (!plural) { | 275 | } else if (!plural) { |
| 310 | if (sounder.getBaseForm().startsWithVowelSound()) | 276 | indefiniteArticle = true; |
| 311 | { | ||
| 312 | utter << "an"; | ||
| 313 | } else { | ||
| 314 | utter << "a"; | ||
| 315 | } | ||
| 316 | } | 277 | } |
| 317 | } | 278 | } |
| 318 | 279 | ||
| 319 | if (descript.isValid()) | 280 | if (std::bernoulli_distribution(1.0/8.0)(rng_)) |
| 320 | { | 281 | { |
| 321 | utter << descript; | 282 | std::geometric_distribution<int> tagdist(0.2); |
| 283 | |||
| 284 | utter << database_.words( | ||
| 285 | (verbly::word::tagCount >= tagdist(rng_)) | ||
| 286 | && (verbly::notion::partOfSpeech == verbly::part_of_speech::adjective) | ||
| 287 | && badWords_).first(); | ||
| 322 | } | 288 | } |
| 323 | 289 | ||
| 324 | if (plural && noun.hasInflection(verbly::inflection::plural)) | 290 | if (plural && noun.hasInflection(verbly::inflection::plural)) |
| @@ -328,7 +294,12 @@ verbly::token sentence::generateStandardNounPhrase( | |||
| 328 | utter << noun; | 294 | utter << noun; |
| 329 | } | 295 | } |
| 330 | 296 | ||
| 331 | return utter; | 297 | if (indefiniteArticle) |
| 298 | { | ||
| 299 | return verbly::token::indefiniteArticle(utter); | ||
| 300 | } else { | ||
| 301 | return utter; | ||
| 302 | } | ||
| 332 | } | 303 | } |
| 333 | 304 | ||
| 334 | verbly::token sentence::generateClause( | 305 | verbly::token sentence::generateClause( |
| @@ -360,7 +331,8 @@ verbly::token sentence::generateClause( | |||
| 360 | 331 | ||
| 361 | verbly::filter verbCondition = | 332 | verbly::filter verbCondition = |
| 362 | (verbly::notion::partOfSpeech == verbly::part_of_speech::verb) | 333 | (verbly::notion::partOfSpeech == verbly::part_of_speech::verb) |
| 363 | && frameCondition; | 334 | && frameCondition |
| 335 | && badWords_; | ||
| 364 | 336 | ||
| 365 | if (it.hasSynrestr("participle_phrase")) | 337 | if (it.hasSynrestr("participle_phrase")) |
| 366 | { | 338 | { |
| @@ -501,20 +473,15 @@ verbly::token sentence::generateClause( | |||
| 501 | utter << generateClause(sentence); | 473 | utter << generateClause(sentence); |
| 502 | } else if (part.nounHasSynrestr("quotation")) | 474 | } else if (part.nounHasSynrestr("quotation")) |
| 503 | { | 475 | { |
| 504 | verbly::token sentence(std::set<std::string>({"participle_phrase"})); | 476 | utter << verbly::token::quote("\"", "\"", |
| 505 | while (!sentence.isComplete()) | 477 | verbly::token(std::set<std::string>({"past_participle"}))); |
| 506 | { | ||
| 507 | visit(sentence); | ||
| 508 | } | ||
| 509 | |||
| 510 | utter << ("\"" + sentence.compile() + "\""); | ||
| 511 | } else { | 478 | } else { |
| 512 | if (part.nounHasSynrestr("genitive")) | 479 | if (part.nounHasSynrestr("genitive")) |
| 513 | { | 480 | { |
| 514 | verbly::word noun = generateStandardNoun("Passive", {"animate"}); | 481 | verbly::word noun = generateStandardNoun("Passive", {"animate"}); |
| 515 | verbly::token owner = generateStandardNounPhrase(noun, "Passive", false, true); | 482 | verbly::token owner = generateStandardNounPhrase(noun, "Passive", false, true); |
| 516 | std::string ownerStr = owner.compile() + "'s"; | 483 | |
| 517 | utter << ownerStr; | 484 | utter << verbly::token::punctuation("'s", owner); |
| 518 | } | 485 | } |
| 519 | 486 | ||
| 520 | verbly::word noun = generateStandardNoun(part.getNounRole(), part.getNounSelrestrs()); | 487 | verbly::word noun = generateStandardNoun(part.getNounRole(), part.getNounSelrestrs()); |
| @@ -669,7 +636,8 @@ void sentence::visit(verbly::token& it) const | |||
| 669 | std::geometric_distribution<int> tagdist(0.2); | 636 | std::geometric_distribution<int> tagdist(0.2); |
| 670 | phrase << database_.words( | 637 | phrase << database_.words( |
| 671 | (verbly::word::tagCount >= tagdist(rng_)) | 638 | (verbly::word::tagCount >= tagdist(rng_)) |
| 672 | && (verbly::notion::partOfSpeech == verbly::part_of_speech::adjective)).first(); | 639 | && (verbly::notion::partOfSpeech == verbly::part_of_speech::adjective) |
| 640 | && badWords_).first(); | ||
| 673 | } | 641 | } |
| 674 | 642 | ||
| 675 | it = phrase; | 643 | it = phrase; |
| @@ -680,7 +648,7 @@ void sentence::visit(verbly::token& it) const | |||
| 680 | it = database_.words( | 648 | it = database_.words( |
| 681 | (verbly::notion::partOfSpeech == verbly::part_of_speech::adverb) | 649 | (verbly::notion::partOfSpeech == verbly::part_of_speech::adverb) |
| 682 | && (verbly::word::tagCount >= tagdist(rng_)) | 650 | && (verbly::word::tagCount >= tagdist(rng_)) |
| 683 | ).first(); | 651 | && badWords_).first(); |
| 684 | } else if (it.hasSynrestr("participle_phrase")) | 652 | } else if (it.hasSynrestr("participle_phrase")) |
| 685 | { | 653 | { |
| 686 | if (std::bernoulli_distribution(1.0/2.0)(rng_)) | 654 | if (std::bernoulli_distribution(1.0/2.0)(rng_)) |
| @@ -688,11 +656,15 @@ void sentence::visit(verbly::token& it) const | |||
| 688 | it = verbly::token( | 656 | it = verbly::token( |
| 689 | database_.words( | 657 | database_.words( |
| 690 | (verbly::notion::partOfSpeech == verbly::part_of_speech::verb) | 658 | (verbly::notion::partOfSpeech == verbly::part_of_speech::verb) |
| 691 | && (verbly::word::forms(verbly::inflection::ing_form))).first(), | 659 | && (verbly::word::forms(verbly::inflection::ing_form)) |
| 660 | && badWords_).first(), | ||
| 692 | verbly::inflection::ing_form); | 661 | verbly::inflection::ing_form); |
| 693 | } else { | 662 | } else { |
| 694 | it = generateClause(it); | 663 | it = generateClause(it); |
| 695 | } | 664 | } |
| 665 | } else if (it.hasSynrestr("past_participle")) | ||
| 666 | { | ||
| 667 | it = generateClause(it); | ||
| 696 | } else { | 668 | } else { |
| 697 | it = "*the reality of the situation*"; | 669 | it = "*the reality of the situation*"; |
| 698 | } | 670 | } |
| @@ -700,6 +672,13 @@ void sentence::visit(verbly::token& it) const | |||
| 700 | break; | 672 | break; |
| 701 | } | 673 | } |
| 702 | 674 | ||
| 675 | case verbly::token::type::transform: | ||
| 676 | { | ||
| 677 | visit(it.getInnerToken()); | ||
| 678 | |||
| 679 | break; | ||
| 680 | } | ||
| 681 | |||
| 703 | case verbly::token::type::word: | 682 | case verbly::token::type::word: |
| 704 | case verbly::token::type::literal: | 683 | case verbly::token::type::literal: |
| 705 | case verbly::token::type::part: | 684 | case verbly::token::type::part: |
