summary refs log tree commit diff stats
path: root/sentence.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'sentence.cpp')
-rw-r--r--sentence.cpp754
1 files changed, 754 insertions, 0 deletions
diff --git a/sentence.cpp b/sentence.cpp new file mode 100644 index 0000000..421aaf6 --- /dev/null +++ b/sentence.cpp
@@ -0,0 +1,754 @@
#include "sentence.h"

#include <algorithm>
#include <cctype>
#include <iostream>
#include <list>
#include <random>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>
5
6sentence::sentence(
7 const verbly::database& database,
8 std::mt19937& rng) :
9 database_(database),
10 rng_(rng)
11{
12}
13
14std::string sentence::generate() const
15{
16 // Generate the form that the title should take.
17 verbly::token form;
18 std::set<std::string> synrestrs {"infinitive_phrase", "bare", "subjectless"};
19 std::set<std::string> secondSyn {"participle_phrase", "subjectless"};
20 std::set<std::string> adjSyn {"adjective_phrase"};
21
22 if (std::bernoulli_distribution(1.0/6.0)(rng_))
23 {
24 form << "not";
25 }
26
27 if (std::bernoulli_distribution(1.0/6.0)(rng_))
28 {
29 form << "be";
30 form << adjSyn;
31 } else {
32 if (std::bernoulli_distribution(1.0/6.0)(rng_))
33 {
34 form << "get";
35 synrestrs.insert("experiencer");
36 synrestrs.insert("past_participle");
37 }
38
39 form << synrestrs;
40 }
41
42 if (std::bernoulli_distribution(1.0/5.0)(rng_))
43 {
44 if (std::bernoulli_distribution(1.0/4.0)(rng_))
45 {
46 form << "without";
47 } else {
48 form << "while";
49 }
50
51 form << secondSyn;
52 }
53
54 // Attempt to compile the form, restarting if a bad word is generated.
55 std::set<std::string> badWords = {"raped"};
56
57 verbly::token tok = form;
58 std::list<std::string> words;
59 for (;;)
60 {
61 // Compile the form.
62 while (!tok.isComplete())
63 {
64 visit(tok);
65 }
66
67 std::string compiled = tok.compile();
68 words = verbly::split<std::list<std::string>>(compiled, " ");
69
70 // Ensure that there are no bad words in the output.
71 if (!std::any_of(std::begin(words), std::end(words), [&badWords] (const std::string& word) {
72 std::string canonWord;
73
74 for (char ch : word)
75 {
76 if (std::isalpha(ch))
77 {
78 canonWord.push_back(std::tolower(ch));
79 }
80 }
81
82 return (badWords.count(canonWord) == 1);
83 })) {
84 break;
85 } else {
86 std::cout << "Bad word generated." << std::endl;
87 }
88 }
89
90 // Put the form into title case.
91 for (std::string& word : words)
92 {
93 if ((word[0] == '"') && (word.length() > 1))
94 {
95 word[1] = std::toupper(word[1]);
96 } else {
97 word[0] = std::toupper(word[0]);
98 }
99 }
100
101 return verbly::implode(std::begin(words), std::end(words), " ");
102}
103
104verbly::filter sentence::parseSelrestrs(
105 verbly::selrestr selrestr) const
106{
107 switch (selrestr.getType())
108 {
109 case verbly::selrestr::type::empty:
110 {
111 return {};
112 }
113
114 case verbly::selrestr::type::singleton:
115 {
116 verbly::filter result;
117
118 if (selrestr.getRestriction() == "concrete")
119 {
120 result = (verbly::notion::wnid == 100001930); // physical entity
121 } else if (selrestr.getRestriction() == "time")
122 {
123 result = (verbly::notion::wnid == 100028270); // time
124 } else if (selrestr.getRestriction() == "state")
125 {
126 result = (verbly::notion::wnid == 100024720); // state
127 } else if (selrestr.getRestriction() == "abstract")
128 {
129 result = (verbly::notion::wnid == 100002137); // abstract entity
130 } else if (selrestr.getRestriction() == "scalar")
131 {
132 result = (verbly::notion::wnid == 103835412); // number
133 } else if (selrestr.getRestriction() == "currency")
134 {
135 result = (verbly::notion::wnid == 105050379); // currency
136 } else if (selrestr.getRestriction() == "location")
137 {
138 result = (verbly::notion::wnid == 100027167); // location
139 } else if (selrestr.getRestriction() == "organization")
140 {
141 result = (verbly::notion::wnid == 100237078); // organization
142 } else if (selrestr.getRestriction() == "int_control")
143 {
144 result = (verbly::notion::wnid == 100007347); // causal agent
145 } else if (selrestr.getRestriction() == "natural")
146 {
147 result = (verbly::notion::wnid == 100019128); // natural object
148 } else if (selrestr.getRestriction() == "phys_obj")
149 {
150 result = (verbly::notion::wnid == 100002684); // physical object
151 } else if (selrestr.getRestriction() == "solid")
152 {
153 result = (verbly::notion::wnid == 113860793); // solid
154 } else if (selrestr.getRestriction() == "shape")
155 {
156 result = (verbly::notion::wnid == 100027807); // shape
157 } else if (selrestr.getRestriction() == "substance")
158 {
159 result = (verbly::notion::wnid == 100019613); // substance
160 } else if (selrestr.getRestriction() == "idea")
161 {
162 result = (verbly::notion::wnid == 105803379); // idea
163 } else if (selrestr.getRestriction() == "sound")
164 {
165 result = (verbly::notion::wnid == 107111047); // sound
166 } else if (selrestr.getRestriction() == "communication")
167 {
168 result = (verbly::notion::wnid == 100033020); // communication
169 } else if (selrestr.getRestriction() == "region")
170 {
171 result = (verbly::notion::wnid == 105221895); // region
172 } else if (selrestr.getRestriction() == "place")
173 {
174 result = (verbly::notion::wnid == 100586262); // place
175 } else if (selrestr.getRestriction() == "machine")
176 {
177 result = (verbly::notion::wnid == 102958343); // machine
178 } else if (selrestr.getRestriction() == "animate")
179 {
180 result = (verbly::notion::wnid == 100004258); // animate thing
181 } else if (selrestr.getRestriction() == "plant")
182 {
183 result = (verbly::notion::wnid == 103956922); // plant
184 } else if (selrestr.getRestriction() == "comestible")
185 {
186 result = (verbly::notion::wnid == 100021265); // food
187 } else if (selrestr.getRestriction() == "artifact")
188 {
189 result = (verbly::notion::wnid == 100021939); // artifact
190 } else if (selrestr.getRestriction() == "vehicle")
191 {
192 result = (verbly::notion::wnid == 104524313); // vehicle
193 } else if (selrestr.getRestriction() == "human")
194 {
195 result = (verbly::notion::wnid == 100007846); // person
196 } else if (selrestr.getRestriction() == "animal")
197 {
198 result = (verbly::notion::wnid == 100015388); // animal
199 } else if (selrestr.getRestriction() == "body_part")
200 {
201 result = (verbly::notion::wnid == 105220461); // body part
202 } else if (selrestr.getRestriction() == "garment")
203 {
204 result = (verbly::notion::wnid == 103051540); // clothing
205 } else if (selrestr.getRestriction() == "tool")
206 {
207 result = (verbly::notion::wnid == 104451818); // tool
208 } else {
209 return {};
210 }
211
212 std::cout << selrestr.getRestriction() << " (" << selrestr.getPos() << ")" << std::endl;
213
214 if (selrestr.getPos())
215 {
216 return (verbly::notion::fullHypernyms %= result);
217 } else {
218 return !(verbly::notion::fullHypernyms %= result);
219 }
220 }
221
222 case verbly::selrestr::type::group:
223 {
224 std::cout << "or: " << selrestr.getOrlogic() << std::endl;
225 verbly::filter ret(selrestr.getOrlogic());
226
227 for (const verbly::selrestr& child : selrestr)
228 {
229 ret += parseSelrestrs(child);
230 }
231
232 return ret;
233 }
234 }
235}
236
237bool sentence::requiresSelrestr(
238 std::string restriction,
239 verbly::selrestr selrestr) const
240{
241 switch (selrestr.getType())
242 {
243 case verbly::selrestr::type::empty:
244 {
245 return false;
246 }
247
248 case verbly::selrestr::type::singleton:
249 {
250 if (selrestr.getRestriction() == restriction)
251 {
252 return selrestr.getPos();
253 } else {
254 return false;
255 }
256 }
257
258 case verbly::selrestr::type::group:
259 {
260 if (selrestr.getOrlogic())
261 {
262 return std::all_of(std::begin(selrestr), std::end(selrestr), [=] (const verbly::selrestr& s) {
263 return requiresSelrestr(restriction, s);
264 });
265 } else {
266 return std::any_of(std::begin(selrestr), std::end(selrestr), [=] (const verbly::selrestr& s) {
267 return requiresSelrestr(restriction, s);
268 });
269 }
270 }
271 }
272}
273
274verbly::word sentence::generateStandardNoun(
275 std::string role,
276 verbly::selrestr selrestrs) const
277{
278 std::geometric_distribution<int> tagdist(0.5); // 0.06
279 std::vector<verbly::word> result;
280 bool trySelection = true;
281
282 while (result.empty())
283 {
284 verbly::filter condition =
285 (verbly::notion::partOfSpeech == verbly::part_of_speech::noun)
286 && (verbly::form::proper == false)
287 //&& (verbly::form::complexity == 1)
288 // && (verbly::word::tagCount >= tagdist(rng_)) // Favor more common words
289 && (verbly::word::tagCount >= 1)
290 && !(verbly::word::usageDomains %= (verbly::notion::wnid == 106718862)); // Blacklist ethnic slurs
291
292 // Only use selection restrictions for a first attempt.
293 if (trySelection)
294 {
295 verbly::filter selrestrCondition = parseSelrestrs(selrestrs).compact();
296
297 if (selrestrCondition.getType() != verbly::filter::type::empty)
298 {
299 condition &= std::move(selrestrCondition);
300 } else if (role == "Attribute")
301 {
302 condition &= (verbly::notion::fullHypernyms %= (verbly::notion::wnid == 100024264)); // attribute
303 } else if (role == "Instrument")
304 {
305 condition &= (verbly::notion::fullHypernyms %= (verbly::notion::wnid == 104451818)); // tool
306 } else if (role == "Agent")
307 {
308 condition &= (verbly::notion::fullHypernyms %= (verbly::notion::wnid == 100007347)); // causal agent
309 }
310
311 trySelection = false;
312 } else {
313 std::cout << "Selection failed" << std::endl;
314 }
315
316 result = database_.words(condition).all();
317 }
318
319 return result.front();
320}
321
322verbly::token sentence::generateStandardNounPhrase(
323 const verbly::word& noun,
324 std::string role,
325 bool plural,
326 bool definite) const
327{
328 verbly::token utter;
329 verbly::word sounder = noun;
330 verbly::word descript;
331
332 if (std::bernoulli_distribution(1.0/8.0)(rng_))
333 {
334 std::geometric_distribution<int> tagdist(0.2);
335 descript = database_.words(
336 (verbly::word::tagCount >= tagdist(rng_))
337 && (verbly::notion::partOfSpeech == verbly::part_of_speech::adjective)).first();
338
339 sounder = descript;
340 }
341
342 if ((std::bernoulli_distribution(1.0/3.0)(rng_)) && (definite))
343 {
344 utter << "the";
345
346 if (std::bernoulli_distribution(1.0/2.0)(rng_))
347 {
348 plural = true;
349 }
350 } else {
351 if ((role != "Theme") && (role != "Attribute") && std::bernoulli_distribution(1.0/2.0)(rng_))
352 {
353 utter << "your";
354 } else if (!plural) {
355 if (sounder.getLemma().getBaseForm().startsWithVowelSound())
356 {
357 utter << "an";
358 } else {
359 utter << "a";
360 }
361 }
362 }
363
364 if (descript)
365 {
366 utter << descript;
367 }
368
369 if (plural && noun.getLemma().hasInflection(verbly::inflection::plural))
370 {
371 utter << verbly::token(noun, verbly::inflection::plural);
372 } else {
373 utter << noun;
374 }
375
376 return utter;
377}
378
379verbly::token sentence::generateClause(
380 const verbly::token& it) const
381{
382 verbly::token utter;
383 std::geometric_distribution<int> tagdist(0.07);
384 std::vector<verbly::word> verbDataset;
385
386 verbly::filter frameCondition =
387 (verbly::frame::length >= 2)
388 && (verbly::frame::part(0) %= (
389 (verbly::part::type == verbly::part_type::noun_phrase)
390 && (verbly::part::role == "Agent"))
391 && !(verbly::frame::part() %= (
392 verbly::part::synrestr %= "adjp")));
393
394 if (it.hasSynrestr("experiencer"))
395 {
396 frameCondition &=
397 (verbly::frame::part(2) %=
398 (verbly::part::type == verbly::part_type::noun_phrase)
399 && !(verbly::part::synrestr %= "genitive")
400 && ((verbly::part::role == "Patient")
401 || (verbly::part::role == "Experiencer")));
402 }
403
404 verbly::filter verbCondition =
405 (verbly::notion::partOfSpeech == verbly::part_of_speech::verb)
406 && frameCondition;
407
408 if (it.hasSynrestr("participle_phrase"))
409 {
410 verbCondition &= (verbly::lemma::form(verbly::inflection::ing_form));
411 } else if (it.hasSynrestr("progressive"))
412 {
413 verbCondition &= (verbly::lemma::form(verbly::inflection::s_form));
414 } else if (it.hasSynrestr("past_participle"))
415 {
416 verbCondition &= (verbly::lemma::form(verbly::inflection::past_participle));
417 }
418
419 // Because of the tag distribution, it's possible (albeit extremely unlikely)
420 // for the verb query to fail, so we loop until it succeeds.
421 while (verbDataset.empty())
422 {
423 verbDataset = database_.words(
424 verbCondition
425 && (verbly::word::tagCount >= tagdist(rng_))
426 ).all();
427 }
428
429 verbly::word verb = verbDataset.front();
430 verbly::frame frame = database_.frames(frameCondition && verb).first();
431 std::list<verbly::part> parts(std::begin(frame.getParts()), std::end(frame.getParts()));
432
433 if (it.hasSynrestr("experiencer"))
434 {
435 // Ignore the direct object.
436 parts.erase(std::next(parts.begin(), 2));
437 }
438
439 if (it.hasSynrestr("subjectless"))
440 {
441 // Ignore the subject.
442 parts.pop_front();
443 }
444
445 for (const verbly::part& part : parts)
446 {
447 switch (part.getType())
448 {
449 case verbly::part_type::noun_phrase:
450 {
451 std::cout << "NP: ";
452 for (auto& s : part.getNounSynrestrs())
453 {
454 std::cout << s << " ";
455 }
456 std::cout << std::endl;
457
458 if (requiresSelrestr("currency", part.getNounSelrestrs()))
459 {
460 int lead = std::uniform_int_distribution<int>(1,9)(rng_);
461 int tail = std::uniform_int_distribution<int>(0,6)(rng_);
462 std::string tailStr(tail, '0');
463
464 utter << ("$" + std::to_string(lead) + tailStr);
465 } else if (part.nounHasSynrestr("adjp"))
466 {
467 utter << std::set<std::string>({"adjective_phrase"});
468 } else if ((part.nounHasSynrestr("be_sc_ing"))
469 || (part.nounHasSynrestr("ac_ing"))
470 || (part.nounHasSynrestr("sc_ing"))
471 || (part.nounHasSynrestr("np_omit_ing"))
472 || (part.nounHasSynrestr("oc_ing")))
473 {
474 utter << std::set<std::string>({"participle_phrase", "subjectless"});
475 } else if ((part.nounHasSynrestr("poss_ing"))
476 || (part.nounHasSynrestr("possing"))
477 || (part.nounHasSynrestr("pos_ing")))
478 {
479 utter << "your";
480 utter << std::set<std::string>({"participle_phrase", "subjectless"});
481 } else if (part.nounHasSynrestr("genitive"))
482 {
483 utter << "your";
484 } else if (part.nounHasSynrestr("adv_loc"))
485 {
486 if (std::bernoulli_distribution(1.0/2.0)(rng_))
487 {
488 utter << "here";
489 } else {
490 utter << "there";
491 }
492 } else if (part.nounHasSynrestr("refl"))
493 {
494 utter << "yourself";
495 } else if ((part.nounHasSynrestr("sc_to_inf"))
496 || (part.nounHasSynrestr("ac_to_inf"))
497 || (part.nounHasSynrestr("vc_to_inf"))
498 || (part.nounHasSynrestr("rs_to_inf"))
499 || (part.nounHasSynrestr("oc_to_inf")))
500 {
501 utter << std::set<std::string>({"infinitive_phrase", "subjectless"});
502 } else if (part.nounHasSynrestr("oc_bare_inf"))
503 {
504 utter << std::set<std::string>({"infinitive_phrase", "bare", "subjectless"});
505 } else if (part.nounHasSynrestr("wh_comp"))
506 {
507 utter << "whether";
508
509 verbly::token sentence(std::set<std::string>({"progressive"}));
510 utter << generateClause(sentence);
511 } else if (part.nounHasSynrestr("that_comp"))
512 {
513 utter << "that";
514 utter << "they";
515
516 verbly::token sentence(std::set<std::string>({"subjectless"}));
517 utter << generateClause(sentence);
518 } else if (part.nounHasSynrestr("what_extract"))
519 {
520 utter << "what";
521
522 verbly::token sentence(std::set<std::string>({"progressive", "experiencer"}));
523 utter << generateClause(sentence);
524 } else if (part.nounHasSynrestr("how_extract"))
525 {
526 utter << "how";
527
528 verbly::token sentence(std::set<std::string>({"progressive"}));
529 utter << generateClause(sentence);
530 } else if (part.nounHasSynrestr("wh_inf"))
531 {
532 utter << "how";
533
534 verbly::token sentence(std::set<std::string>({"infinitive_phrase", "subjectless"}));
535 utter << generateClause(sentence);
536 } else if (part.nounHasSynrestr("what_inf"))
537 {
538 utter << "what";
539
540 verbly::token sentence(std::set<std::string>({"infinitive_phrase", "subjectless", "experiencer"}));
541 utter << generateClause(sentence);
542 } else if (part.nounHasSynrestr("wheth_inf"))
543 {
544 utter << "whether";
545
546 verbly::token sentence(std::set<std::string>({"infinitive_phrase", "subjectless"}));
547 utter << generateClause(sentence);
548 } else if (part.nounHasSynrestr("quotation"))
549 {
550 verbly::token sentence(std::set<std::string>({"participle_phrase"}));
551 while (!sentence.isComplete())
552 {
553 visit(sentence);
554 }
555
556 utter << ("\"" + sentence.compile() + "\"");
557 } else {
558 verbly::word noun = generateStandardNoun(part.getNounRole(), part.getNounSelrestrs());
559
560 bool plural = part.nounHasSynrestr("plural");
561 if (!plural)
562 {
563 plural = requiresSelrestr("plural", part.getNounSelrestrs());
564 }
565
566 utter << generateStandardNounPhrase(
567 noun,
568 part.getNounRole(),
569 plural,
570 part.nounHasSynrestr("definite"));
571
572 if (part.nounHasSynrestr("acc_ing") || part.nounHasSynrestr("ac_ing"))
573 {
574 utter << std::set<std::string>({"participle_phrase", "subjectless"});
575 }
576 }
577
578 break;
579 }
580
581 case verbly::part_type::verb:
582 {
583 std::cout << "V: " << verb.getBaseForm() << std::endl;
584
585 if (it.hasSynrestr("progressive"))
586 {
587 utter << verbly::token(verb, verbly::inflection::s_form);
588 } else if (it.hasSynrestr("past_participle"))
589 {
590 utter << verbly::token(verb, verbly::inflection::past_participle);
591 } else if (it.hasSynrestr("infinitive_phrase"))
592 {
593 if (!it.hasSynrestr("bare"))
594 {
595 utter << "to";
596 }
597
598 utter << verb;
599 } else if (it.hasSynrestr("participle_phrase"))
600 {
601 utter << verbly::token(verb, verbly::inflection::ing_form);
602 } else {
603 utter << verb;
604 }
605
606 break;
607 }
608
609 case verbly::part_type::preposition:
610 {
611 std::cout << "PREP" << std::endl;
612
613 if (part.isPrepositionLiteral())
614 {
615 int choiceIndex = std::uniform_int_distribution<int>(0, part.getPrepositionChoices().size()-1)(rng_);
616 utter << part.getPrepositionChoices()[choiceIndex];
617 } else {
618 verbly::filter pgf(true);
619 for (const std::string& choice : part.getPrepositionChoices())
620 {
621 pgf += (verbly::notion::prepositionGroup == choice);
622 }
623
624 utter << database_.words(pgf && (verbly::notion::partOfSpeech == verbly::part_of_speech::preposition)).first();
625 }
626
627 break;
628 }
629
630 case verbly::part_type::adjective:
631 {
632 std::cout << "ADJ" << std::endl;
633
634 utter << std::set<std::string>({"adjective_phrase"});
635
636 break;
637 }
638
639 case verbly::part_type::adverb:
640 {
641 std::cout << "ADV" << std::endl;
642
643 utter << std::set<std::string>({"adverb_phrase"});
644
645 break;
646 }
647
648 case verbly::part_type::literal:
649 {
650 std::cout << "LIT" << std::endl;
651
652 utter << part.getLiteralValue();
653
654 break;
655 }
656
657 case verbly::part_type::invalid:
658 {
659 // Nope
660
661 break;
662 }
663 }
664 }
665
666 if ((parts.size() == 1) && (std::bernoulli_distribution(1.0/4.0)(rng_)))
667 {
668 utter << std::set<std::string>({"adverb_phrase"});
669 }
670
671 return utter;
672}
673
674void sentence::visit(verbly::token& it) const
675{
676 switch (it.getType())
677 {
678 case verbly::token::type::utterance:
679 {
680 for (verbly::token& token : it)
681 {
682 if (!token.isComplete())
683 {
684 visit(token);
685
686 break;
687 }
688 }
689
690 break;
691 }
692
693 case verbly::token::type::fillin:
694 {
695 if (it.hasSynrestr("infinitive_phrase"))
696 {
697 it = generateClause(it);
698 } else if (it.hasSynrestr("adjective_phrase"))
699 {
700 verbly::token phrase;
701
702 if (std::bernoulli_distribution(1.0/6.0)(rng_))
703 {
704 phrase << std::set<std::string>({"adverb_phrase"});
705 }
706
707 if (std::bernoulli_distribution(1.0/4.0)(rng_))
708 {
709 phrase << std::set<std::string>({"participle_phrase", "subjectless"});
710 } else {
711 std::geometric_distribution<int> tagdist(0.2);
712 phrase << database_.words(
713 (verbly::word::tagCount >= tagdist(rng_))
714 && (verbly::notion::partOfSpeech == verbly::part_of_speech::adjective)).first();
715 }
716
717 it = phrase;
718 } else if (it.hasSynrestr("adverb_phrase"))
719 {
720 std::geometric_distribution<int> tagdist(1.0/23.0);
721
722 it = database_.words(
723 (verbly::notion::partOfSpeech == verbly::part_of_speech::adverb)
724 && (verbly::word::tagCount >= tagdist(rng_))
725 ).first();
726 } else if (it.hasSynrestr("participle_phrase"))
727 {
728 if (std::bernoulli_distribution(1.0/2.0)(rng_))
729 {
730 it = verbly::token(
731 database_.words(
732 (verbly::notion::partOfSpeech == verbly::part_of_speech::verb)
733 && (verbly::lemma::form(verbly::inflection::ing_form))).first(),
734 verbly::inflection::ing_form);
735 } else {
736 it = generateClause(it);
737 }
738 } else {
739 it = "*the reality of the situation*";
740 }
741
742 break;
743 }
744
745 case verbly::token::type::word:
746 case verbly::token::type::literal:
747 case verbly::token::type::part:
748 {
749 // Nope
750
751 break;
752 }
753 }
754}