diff options
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- | generator/generator.cpp | 275 |
1 files changed, 185 insertions, 90 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index e34ca69..785ec87 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -1,16 +1,14 @@ | |||
1 | #include "generator.h" | 1 | #include "generator.h" |
2 | #include <cassert> | ||
3 | #include <stdexcept> | 2 | #include <stdexcept> |
4 | #include <iostream> | 3 | #include <iostream> |
5 | #include <regex> | 4 | #include <regex> |
6 | #include <dirent.h> | 5 | #include <dirent.h> |
7 | #include <fstream> | 6 | #include <fstream> |
8 | #include "../lib/enums.h" | 7 | #include <hkutil/string.h> |
9 | #include "progress.h" | 8 | #include <hkutil/progress.h> |
10 | #include "role.h" | 9 | #include "role.h" |
11 | #include "part.h" | 10 | #include "part.h" |
12 | #include "field.h" | 11 | #include "../lib/enums.h" |
13 | #include "../lib/util.h" | ||
14 | #include "../lib/version.h" | 12 | #include "../lib/version.h" |
15 | 13 | ||
16 | namespace verbly { | 14 | namespace verbly { |
@@ -28,7 +26,7 @@ namespace verbly { | |||
28 | wordNetPath_(wordNetPath), | 26 | wordNetPath_(wordNetPath), |
29 | cmudictPath_(cmudictPath), | 27 | cmudictPath_(cmudictPath), |
30 | imageNetPath_(imageNetPath), | 28 | imageNetPath_(imageNetPath), |
31 | db_(outputPath) | 29 | db_(outputPath, hatkirby::dbmode::create) |
32 | { | 30 | { |
33 | // Ensure VerbNet directory exists | 31 | // Ensure VerbNet directory exists |
34 | DIR* dir; | 32 | DIR* dir; |
@@ -53,7 +51,8 @@ namespace verbly { | |||
53 | 51 | ||
54 | // Ensure WordNet tables exist | 52 | // Ensure WordNet tables exist |
55 | for (std::string table : { | 53 | for (std::string table : { |
56 | "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" | 54 | "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", |
55 | "sa", "sim", "syntax" | ||
57 | }) | 56 | }) |
58 | { | 57 | { |
59 | if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) | 58 | if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) |
@@ -166,13 +165,15 @@ namespace verbly { | |||
166 | void generator::readWordNetSynsets() | 165 | void generator::readWordNetSynsets() |
167 | { | 166 | { |
168 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); | 167 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); |
169 | progress ppgs("Reading synsets from WordNet...", lines.size()); | 168 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); |
170 | 169 | ||
171 | for (std::string line : lines) | 170 | for (std::string line : lines) |
172 | { | 171 | { |
173 | ppgs.update(); | 172 | ppgs.update(); |
174 | 173 | ||
175 | std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); | 174 | std::regex relation( |
175 | "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); | ||
176 | |||
176 | std::smatch relation_data; | 177 | std::smatch relation_data; |
177 | if (!std::regex_search(line, relation_data, relation)) | 178 | if (!std::regex_search(line, relation_data, relation)) |
178 | { | 179 | { |
@@ -206,7 +207,10 @@ namespace verbly { | |||
206 | void generator::readAdjectivePositioning() | 207 | void generator::readAdjectivePositioning() |
207 | { | 208 | { |
208 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl")); | 209 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl")); |
209 | progress ppgs("Reading adjective positionings from WordNet...", lines.size()); | 210 | |
211 | hatkirby::progress ppgs( | ||
212 | "Reading adjective positionings from WordNet...", | ||
213 | lines.size()); | ||
210 | 214 | ||
211 | for (std::string line : lines) | 215 | for (std::string line : lines) |
212 | { | 216 | { |
@@ -279,7 +283,10 @@ namespace verbly { | |||
279 | void generator::readWordNetSenseKeys() | 283 | void generator::readWordNetSenseKeys() |
280 | { | 284 | { |
281 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl")); | 285 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl")); |
282 | progress ppgs("Reading sense keys from WordNet...", lines.size()); | 286 | |
287 | hatkirby::progress ppgs( | ||
288 | "Reading sense keys from WordNet...", | ||
289 | lines.size()); | ||
283 | 290 | ||
284 | for (std::string line : lines) | 291 | for (std::string line : lines) |
285 | { | 292 | { |
@@ -350,7 +357,8 @@ namespace verbly { | |||
350 | } | 357 | } |
351 | 358 | ||
352 | xmlNodePtr top = xmlDocGetRootElement(doc); | 359 | xmlNodePtr top = xmlDocGetRootElement(doc); |
353 | if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS")))) | 360 | if ((top == nullptr) || |
361 | (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS")))) | ||
354 | { | 362 | { |
355 | throw std::logic_error("Bad VerbNet file format: " + filename); | 363 | throw std::logic_error("Bad VerbNet file format: " + filename); |
356 | } | 364 | } |
@@ -360,7 +368,8 @@ namespace verbly { | |||
360 | createGroup(top); | 368 | createGroup(top); |
361 | } catch (const std::exception& e) | 369 | } catch (const std::exception& e) |
362 | { | 370 | { |
363 | std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); | 371 | std::throw_with_nested( |
372 | std::logic_error("Error parsing VerbNet file: " + filename)); | ||
364 | } | 373 | } |
365 | } | 374 | } |
366 | 375 | ||
@@ -370,7 +379,7 @@ namespace verbly { | |||
370 | void generator::readAgidInflections() | 379 | void generator::readAgidInflections() |
371 | { | 380 | { |
372 | std::list<std::string> lines(readFile(agidPath_)); | 381 | std::list<std::string> lines(readFile(agidPath_)); |
373 | progress ppgs("Reading inflections from AGID...", lines.size()); | 382 | hatkirby::progress ppgs("Reading inflections from AGID...", lines.size()); |
374 | 383 | ||
375 | for (std::string line : lines) | 384 | for (std::string line : lines) |
376 | { | 385 | { |
@@ -395,12 +404,17 @@ namespace verbly { | |||
395 | 404 | ||
396 | lemma& curLemma = lookupOrCreateLemma(infinitive); | 405 | lemma& curLemma = lookupOrCreateLemma(infinitive); |
397 | 406 | ||
407 | auto inflWordList = | ||
408 | hatkirby::split<std::list<std::string>>(line, " | "); | ||
409 | |||
398 | std::vector<std::list<std::string>> agidForms; | 410 | std::vector<std::list<std::string>> agidForms; |
399 | for (std::string inflForms : split<std::list<std::string>>(line, " | ")) | 411 | for (std::string inflForms : inflWordList) |
400 | { | 412 | { |
401 | std::list<std::string> forms; | 413 | auto inflFormList = |
414 | hatkirby::split<std::list<std::string>>(std::move(inflForms), ", "); | ||
402 | 415 | ||
403 | for (std::string inflForm : split<std::list<std::string>>(std::move(inflForms), ", ")) | 416 | std::list<std::string> forms; |
417 | for (std::string inflForm : inflFormList) | ||
404 | { | 418 | { |
405 | int sympos = inflForm.find_first_of("~<!? "); | 419 | int sympos = inflForm.find_first_of("~<!? "); |
406 | if (sympos != std::string::npos) | 420 | if (sympos != std::string::npos) |
@@ -443,7 +457,8 @@ namespace verbly { | |||
443 | // - may and shall do not conjugate the way we want them to | 457 | // - may and shall do not conjugate the way we want them to |
444 | // - methinks only has a past tense and is an outlier | 458 | // - methinks only has a past tense and is an outlier |
445 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | 459 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now |
446 | std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | 460 | std::cout << " Ignoring verb \"" << infinitive |
461 | << "\" due to non-standard number of forms." << std::endl; | ||
447 | } | 462 | } |
448 | 463 | ||
449 | // For verbs in particular, we sometimes create a notion and a word | 464 | // For verbs in particular, we sometimes create a notion and a word |
@@ -452,9 +467,13 @@ namespace verbly { | |||
452 | // that this verb appears in the AGID data but not in either WordNet | 467 | // that this verb appears in the AGID data but not in either WordNet |
453 | // or VerbNet. | 468 | // or VerbNet. |
454 | if (!wordsByBaseForm_.count(infinitive) | 469 | if (!wordsByBaseForm_.count(infinitive) |
455 | || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) { | 470 | || !std::any_of( |
456 | return w->getNotion().getPartOfSpeech() == part_of_speech::verb; | 471 | std::begin(wordsByBaseForm_.at(infinitive)), |
457 | })) | 472 | std::end(wordsByBaseForm_.at(infinitive)), |
473 | [] (word* w) { | ||
474 | return (w->getNotion().getPartOfSpeech() == | ||
475 | part_of_speech::verb); | ||
476 | })) | ||
458 | { | 477 | { |
459 | notion& n = createNotion(part_of_speech::verb); | 478 | notion& n = createNotion(part_of_speech::verb); |
460 | createWord(n, curLemma); | 479 | createWord(n, curLemma); |
@@ -471,7 +490,8 @@ namespace verbly { | |||
471 | mappedForms[inflection::superlative] = agidForms[1]; | 490 | mappedForms[inflection::superlative] = agidForms[1]; |
472 | } else { | 491 | } else { |
473 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | 492 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" |
474 | std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | 493 | std::cout << " Ignoring adjective/adverb \"" << infinitive |
494 | << "\" due to non-standard number of forms." << std::endl; | ||
475 | } | 495 | } |
476 | 496 | ||
477 | break; | 497 | break; |
@@ -484,7 +504,8 @@ namespace verbly { | |||
484 | mappedForms[inflection::plural] = agidForms[0]; | 504 | mappedForms[inflection::plural] = agidForms[0]; |
485 | } else { | 505 | } else { |
486 | // As of AGID 2014.08.11, this is non-existent. | 506 | // As of AGID 2014.08.11, this is non-existent. |
487 | std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | 507 | std::cout << " Ignoring noun \"" << infinitive |
508 | << "\" due to non-standard number of forms." << std::endl; | ||
488 | } | 509 | } |
489 | 510 | ||
490 | break; | 511 | break; |
@@ -496,7 +517,9 @@ namespace verbly { | |||
496 | { | 517 | { |
497 | for (std::string infl : std::move(mapping.second)) | 518 | for (std::string infl : std::move(mapping.second)) |
498 | { | 519 | { |
499 | curLemma.addInflection(mapping.first, lookupOrCreateForm(std::move(infl))); | 520 | curLemma.addInflection( |
521 | mapping.first, | ||
522 | lookupOrCreateForm(std::move(infl))); | ||
500 | } | 523 | } |
501 | } | 524 | } |
502 | } | 525 | } |
@@ -505,7 +528,7 @@ namespace verbly { | |||
505 | void generator::readPrepositions() | 528 | void generator::readPrepositions() |
506 | { | 529 | { |
507 | std::list<std::string> lines(readFile("prepositions.txt")); | 530 | std::list<std::string> lines(readFile("prepositions.txt")); |
508 | progress ppgs("Reading prepositions...", lines.size()); | 531 | hatkirby::progress ppgs("Reading prepositions...", lines.size()); |
509 | 532 | ||
510 | for (std::string line : lines) | 533 | for (std::string line : lines) |
511 | { | 534 | { |
@@ -515,7 +538,9 @@ namespace verbly { | |||
515 | std::smatch relation_data; | 538 | std::smatch relation_data; |
516 | std::regex_search(line, relation_data, relation); | 539 | std::regex_search(line, relation_data, relation); |
517 | std::string prep = relation_data[1]; | 540 | std::string prep = relation_data[1]; |
518 | auto groups = split<std::list<std::string>>(relation_data[2], ", "); | 541 | |
542 | auto groups = | ||
543 | hatkirby::split<std::list<std::string>>(relation_data[2], ", "); | ||
519 | 544 | ||
520 | notion& n = createNotion(part_of_speech::preposition); | 545 | notion& n = createNotion(part_of_speech::preposition); |
521 | lemma& l = lookupOrCreateLemma(prep); | 546 | lemma& l = lookupOrCreateLemma(prep); |
@@ -528,7 +553,10 @@ namespace verbly { | |||
528 | void generator::readCmudictPronunciations() | 553 | void generator::readCmudictPronunciations() |
529 | { | 554 | { |
530 | std::list<std::string> lines(readFile(cmudictPath_)); | 555 | std::list<std::string> lines(readFile(cmudictPath_)); |
531 | progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); | 556 | |
557 | hatkirby::progress ppgs( | ||
558 | "Reading pronunciations from CMUDICT...", | ||
559 | lines.size()); | ||
532 | 560 | ||
533 | for (std::string line : lines) | 561 | for (std::string line : lines) |
534 | { | 562 | { |
@@ -538,8 +566,7 @@ namespace verbly { | |||
538 | std::smatch phoneme_data; | 566 | std::smatch phoneme_data; |
539 | if (std::regex_search(line, phoneme_data, phoneme)) | 567 | if (std::regex_search(line, phoneme_data, phoneme)) |
540 | { | 568 | { |
541 | std::string canonical(phoneme_data[1]); | 569 | std::string canonical = hatkirby::lowercase(phoneme_data[1]); |
542 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
543 | 570 | ||
544 | if (!formByText_.count(canonical)) | 571 | if (!formByText_.count(canonical)) |
545 | { | 572 | { |
@@ -575,13 +602,14 @@ namespace verbly { | |||
575 | } | 602 | } |
576 | 603 | ||
577 | std::string schema = schemaBuilder.str(); | 604 | std::string schema = schemaBuilder.str(); |
578 | auto queries = split<std::list<std::string>>(schema, ";"); | 605 | auto queries = hatkirby::split<std::list<std::string>>(schema, ";"); |
579 | progress ppgs("Writing database schema...", queries.size()); | 606 | |
607 | hatkirby::progress ppgs("Writing database schema...", queries.size()); | ||
580 | for (std::string query : queries) | 608 | for (std::string query : queries) |
581 | { | 609 | { |
582 | if (!queries.empty()) | 610 | if (!queries.empty()) |
583 | { | 611 | { |
584 | db_.runQuery(query); | 612 | db_.execute(query); |
585 | } | 613 | } |
586 | 614 | ||
587 | ppgs.update(); | 615 | ppgs.update(); |
@@ -590,10 +618,6 @@ namespace verbly { | |||
590 | 618 | ||
591 | void generator::writeVersion() | 619 | void generator::writeVersion() |
592 | { | 620 | { |
593 | std::list<field> fields; | ||
594 | fields.emplace_back("major", DATABASE_MAJOR_VERSION); | ||
595 | fields.emplace_back("minor", DATABASE_MINOR_VERSION); | ||
596 | |||
597 | db_.insertIntoTable( | 621 | db_.insertIntoTable( |
598 | "version", | 622 | "version", |
599 | { | 623 | { |
@@ -605,7 +629,7 @@ namespace verbly { | |||
605 | void generator::dumpObjects() | 629 | void generator::dumpObjects() |
606 | { | 630 | { |
607 | { | 631 | { |
608 | progress ppgs("Writing notions...", notions_.size()); | 632 | hatkirby::progress ppgs("Writing notions...", notions_.size()); |
609 | 633 | ||
610 | for (notion& n : notions_) | 634 | for (notion& n : notions_) |
611 | { | 635 | { |
@@ -616,7 +640,7 @@ namespace verbly { | |||
616 | } | 640 | } |
617 | 641 | ||
618 | { | 642 | { |
619 | progress ppgs("Writing words...", words_.size()); | 643 | hatkirby::progress ppgs("Writing words...", words_.size()); |
620 | 644 | ||
621 | for (word& w : words_) | 645 | for (word& w : words_) |
622 | { | 646 | { |
@@ -627,7 +651,7 @@ namespace verbly { | |||
627 | } | 651 | } |
628 | 652 | ||
629 | { | 653 | { |
630 | progress ppgs("Writing lemmas...", lemmas_.size()); | 654 | hatkirby::progress ppgs("Writing lemmas...", lemmas_.size()); |
631 | 655 | ||
632 | for (lemma& l : lemmas_) | 656 | for (lemma& l : lemmas_) |
633 | { | 657 | { |
@@ -638,7 +662,7 @@ namespace verbly { | |||
638 | } | 662 | } |
639 | 663 | ||
640 | { | 664 | { |
641 | progress ppgs("Writing forms...", forms_.size()); | 665 | hatkirby::progress ppgs("Writing forms...", forms_.size()); |
642 | 666 | ||
643 | for (form& f : forms_) | 667 | for (form& f : forms_) |
644 | { | 668 | { |
@@ -649,7 +673,7 @@ namespace verbly { | |||
649 | } | 673 | } |
650 | 674 | ||
651 | { | 675 | { |
652 | progress ppgs("Writing pronunciations...", pronunciations_.size()); | 676 | hatkirby::progress ppgs("Writing pronunciations...", pronunciations_.size()); |
653 | 677 | ||
654 | for (pronunciation& p : pronunciations_) | 678 | for (pronunciation& p : pronunciations_) |
655 | { | 679 | { |
@@ -660,7 +684,7 @@ namespace verbly { | |||
660 | } | 684 | } |
661 | 685 | ||
662 | { | 686 | { |
663 | progress ppgs("Writing verb frames...", groups_.size()); | 687 | hatkirby::progress ppgs("Writing verb frames...", groups_.size()); |
664 | 688 | ||
665 | for (group& g : groups_) | 689 | for (group& g : groups_) |
666 | { | 690 | { |
@@ -674,22 +698,30 @@ namespace verbly { | |||
674 | void generator::readWordNetAntonymy() | 698 | void generator::readWordNetAntonymy() |
675 | { | 699 | { |
676 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); | 700 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); |
677 | progress ppgs("Writing antonyms...", lines.size()); | 701 | hatkirby::progress ppgs("Writing antonyms...", lines.size()); |
678 | for (auto line : lines) | 702 | for (auto line : lines) |
679 | { | 703 | { |
680 | ppgs.update(); | 704 | ppgs.update(); |
681 | 705 | ||
682 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | 706 | std::regex relation( |
707 | "^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | ||
708 | |||
683 | std::smatch relation_data; | 709 | std::smatch relation_data; |
684 | if (!std::regex_search(line, relation_data, relation)) | 710 | if (!std::regex_search(line, relation_data, relation)) |
685 | { | 711 | { |
686 | continue; | 712 | continue; |
687 | } | 713 | } |
688 | 714 | ||
689 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | 715 | std::pair<int, int> lookup1( |
690 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | 716 | std::stoi(relation_data[1]), |
717 | std::stoi(relation_data[2])); | ||
718 | |||
719 | std::pair<int, int> lookup2( | ||
720 | std::stoi(relation_data[3]), | ||
721 | std::stoi(relation_data[4])); | ||
691 | 722 | ||
692 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | 723 | if (wordByWnidAndWnum_.count(lookup1) && |
724 | wordByWnidAndWnum_.count(lookup2)) | ||
693 | { | 725 | { |
694 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | 726 | word& word1 = *wordByWnidAndWnum_.at(lookup1); |
695 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | 727 | word& word2 = *wordByWnidAndWnum_.at(lookup2); |
@@ -707,7 +739,7 @@ namespace verbly { | |||
707 | void generator::readWordNetVariation() | 739 | void generator::readWordNetVariation() |
708 | { | 740 | { |
709 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl")); | 741 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl")); |
710 | progress ppgs("Writing variation...", lines.size()); | 742 | hatkirby::progress ppgs("Writing variation...", lines.size()); |
711 | for (auto line : lines) | 743 | for (auto line : lines) |
712 | { | 744 | { |
713 | ppgs.update(); | 745 | ppgs.update(); |
@@ -730,7 +762,7 @@ namespace verbly { | |||
730 | db_.insertIntoTable( | 762 | db_.insertIntoTable( |
731 | "variation", | 763 | "variation", |
732 | { | 764 | { |
733 | { "noun_id", notion1.getId() } | 765 | { "noun_id", notion1.getId() }, |
734 | { "adjective_id", notion2.getId() } | 766 | { "adjective_id", notion2.getId() } |
735 | }); | 767 | }); |
736 | } | 768 | } |
@@ -740,20 +772,32 @@ namespace verbly { | |||
740 | void generator::readWordNetClasses() | 772 | void generator::readWordNetClasses() |
741 | { | 773 | { |
742 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); | 774 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); |
743 | progress ppgs("Writing usage, topicality, and regionality...", lines.size()); | 775 | |
776 | hatkirby::progress ppgs( | ||
777 | "Writing usage, topicality, and regionality...", | ||
778 | lines.size()); | ||
779 | |||
744 | for (auto line : lines) | 780 | for (auto line : lines) |
745 | { | 781 | { |
746 | ppgs.update(); | 782 | ppgs.update(); |
747 | 783 | ||
748 | std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); | 784 | std::regex relation( |
785 | "^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); | ||
786 | |||
749 | std::smatch relation_data; | 787 | std::smatch relation_data; |
750 | if (!std::regex_search(line, relation_data, relation)) | 788 | if (!std::regex_search(line, relation_data, relation)) |
751 | { | 789 | { |
752 | continue; | 790 | continue; |
753 | } | 791 | } |
754 | 792 | ||
755 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | 793 | std::pair<int, int> lookup1( |
756 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | 794 | std::stoi(relation_data[1]), |
795 | std::stoi(relation_data[2])); | ||
796 | |||
797 | std::pair<int, int> lookup2( | ||
798 | std::stoi(relation_data[3]), | ||
799 | std::stoi(relation_data[4])); | ||
800 | |||
757 | std::string class_type = relation_data[5]; | 801 | std::string class_type = relation_data[5]; |
758 | 802 | ||
759 | std::string table_name; | 803 | std::string table_name; |
@@ -773,18 +817,30 @@ namespace verbly { | |||
773 | 817 | ||
774 | if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) | 818 | if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) |
775 | { | 819 | { |
776 | std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { | 820 | auto& wordSet = wordsByWnid_.at(lookup1.first); |
777 | return w->getId(); | 821 | |
778 | }); | 822 | std::transform( |
823 | std::begin(wordSet), | ||
824 | std::end(wordSet), | ||
825 | std::back_inserter(leftJoin), | ||
826 | [] (word* w) { | ||
827 | return w->getId(); | ||
828 | }); | ||
779 | } else if (wordByWnidAndWnum_.count(lookup1)) { | 829 | } else if (wordByWnidAndWnum_.count(lookup1)) { |
780 | leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); | 830 | leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); |
781 | } | 831 | } |
782 | 832 | ||
783 | if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) | 833 | if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) |
784 | { | 834 | { |
785 | std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { | 835 | auto& wordSet = wordsByWnid_.at(lookup2.first); |
786 | return w->getId(); | 836 | |
787 | }); | 837 | std::transform( |
838 | std::begin(wordSet), | ||
839 | std::end(wordSet), | ||
840 | std::back_inserter(rightJoin), | ||
841 | [] (word* w) { | ||
842 | return w->getId(); | ||
843 | }); | ||
788 | } else if (wordByWnidAndWnum_.count(lookup2)) { | 844 | } else if (wordByWnidAndWnum_.count(lookup2)) { |
789 | rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); | 845 | rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); |
790 | } | 846 | } |
@@ -807,7 +863,7 @@ namespace verbly { | |||
807 | void generator::readWordNetCausality() | 863 | void generator::readWordNetCausality() |
808 | { | 864 | { |
809 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl")); | 865 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl")); |
810 | progress ppgs("Writing causality...", lines.size()); | 866 | hatkirby::progress ppgs("Writing causality...", lines.size()); |
811 | for (auto line : lines) | 867 | for (auto line : lines) |
812 | { | 868 | { |
813 | ppgs.update(); | 869 | ppgs.update(); |
@@ -840,7 +896,7 @@ namespace verbly { | |||
840 | void generator::readWordNetEntailment() | 896 | void generator::readWordNetEntailment() |
841 | { | 897 | { |
842 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl")); | 898 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl")); |
843 | progress ppgs("Writing entailment...", lines.size()); | 899 | hatkirby::progress ppgs("Writing entailment...", lines.size()); |
844 | for (auto line : lines) | 900 | for (auto line : lines) |
845 | { | 901 | { |
846 | ppgs.update(); | 902 | ppgs.update(); |
@@ -873,7 +929,7 @@ namespace verbly { | |||
873 | void generator::readWordNetHypernymy() | 929 | void generator::readWordNetHypernymy() |
874 | { | 930 | { |
875 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl")); | 931 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl")); |
876 | progress ppgs("Writing hypernymy...", lines.size()); | 932 | hatkirby::progress ppgs("Writing hypernymy...", lines.size()); |
877 | for (auto line : lines) | 933 | for (auto line : lines) |
878 | { | 934 | { |
879 | ppgs.update(); | 935 | ppgs.update(); |
@@ -906,7 +962,7 @@ namespace verbly { | |||
906 | void generator::readWordNetInstantiation() | 962 | void generator::readWordNetInstantiation() |
907 | { | 963 | { |
908 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl")); | 964 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl")); |
909 | progress ppgs("Writing instantiation...", lines.size()); | 965 | hatkirby::progress ppgs("Writing instantiation...", lines.size()); |
910 | for (auto line : lines) | 966 | for (auto line : lines) |
911 | { | 967 | { |
912 | ppgs.update(); | 968 | ppgs.update(); |
@@ -939,7 +995,7 @@ namespace verbly { | |||
939 | void generator::readWordNetMemberMeronymy() | 995 | void generator::readWordNetMemberMeronymy() |
940 | { | 996 | { |
941 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl")); | 997 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl")); |
942 | progress ppgs("Writing member meronymy...", lines.size()); | 998 | hatkirby::progress ppgs("Writing member meronymy...", lines.size()); |
943 | for (auto line : lines) | 999 | for (auto line : lines) |
944 | { | 1000 | { |
945 | ppgs.update(); | 1001 | ppgs.update(); |
@@ -972,7 +1028,7 @@ namespace verbly { | |||
972 | void generator::readWordNetPartMeronymy() | 1028 | void generator::readWordNetPartMeronymy() |
973 | { | 1029 | { |
974 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl")); | 1030 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl")); |
975 | progress ppgs("Writing part meronymy...", lines.size()); | 1031 | hatkirby::progress ppgs("Writing part meronymy...", lines.size()); |
976 | for (auto line : lines) | 1032 | for (auto line : lines) |
977 | { | 1033 | { |
978 | ppgs.update(); | 1034 | ppgs.update(); |
@@ -1005,7 +1061,7 @@ namespace verbly { | |||
1005 | void generator::readWordNetSubstanceMeronymy() | 1061 | void generator::readWordNetSubstanceMeronymy() |
1006 | { | 1062 | { |
1007 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl")); | 1063 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl")); |
1008 | progress ppgs("Writing substance meronymy...", lines.size()); | 1064 | hatkirby::progress ppgs("Writing substance meronymy...", lines.size()); |
1009 | for (auto line : lines) | 1065 | for (auto line : lines) |
1010 | { | 1066 | { |
1011 | ppgs.update(); | 1067 | ppgs.update(); |
@@ -1038,27 +1094,40 @@ namespace verbly { | |||
1038 | void generator::readWordNetPertainymy() | 1094 | void generator::readWordNetPertainymy() |
1039 | { | 1095 | { |
1040 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); | 1096 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); |
1041 | progress ppgs("Writing pertainymy and mannernymy...", lines.size()); | 1097 | |
1098 | hatkirby::progress ppgs( | ||
1099 | "Writing pertainymy and mannernymy...", | ||
1100 | lines.size()); | ||
1101 | |||
1042 | for (auto line : lines) | 1102 | for (auto line : lines) |
1043 | { | 1103 | { |
1044 | ppgs.update(); | 1104 | ppgs.update(); |
1045 | 1105 | ||
1046 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); | 1106 | std::regex relation( |
1107 | "^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); | ||
1108 | |||
1047 | std::smatch relation_data; | 1109 | std::smatch relation_data; |
1048 | if (!std::regex_search(line, relation_data, relation)) | 1110 | if (!std::regex_search(line, relation_data, relation)) |
1049 | { | 1111 | { |
1050 | continue; | 1112 | continue; |
1051 | } | 1113 | } |
1052 | 1114 | ||
1053 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | 1115 | std::pair<int, int> lookup1( |
1054 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | 1116 | std::stoi(relation_data[1]), |
1117 | std::stoi(relation_data[2])); | ||
1055 | 1118 | ||
1056 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | 1119 | std::pair<int, int> lookup2( |
1120 | std::stoi(relation_data[3]), | ||
1121 | std::stoi(relation_data[4])); | ||
1122 | |||
1123 | if (wordByWnidAndWnum_.count(lookup1) && | ||
1124 | wordByWnidAndWnum_.count(lookup2)) | ||
1057 | { | 1125 | { |
1058 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | 1126 | word& word1 = *wordByWnidAndWnum_.at(lookup1); |
1059 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | 1127 | word& word2 = *wordByWnidAndWnum_.at(lookup2); |
1060 | 1128 | ||
1061 | if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) | 1129 | if (word1.getNotion().getPartOfSpeech() == |
1130 | part_of_speech::adjective) | ||
1062 | { | 1131 | { |
1063 | db_.insertIntoTable( | 1132 | db_.insertIntoTable( |
1064 | "pertainymy", | 1133 | "pertainymy", |
@@ -1066,7 +1135,8 @@ namespace verbly { | |||
1066 | { "pertainym_id", word1.getId() }, | 1135 | { "pertainym_id", word1.getId() }, |
1067 | { "noun_id", word2.getId() } | 1136 | { "noun_id", word2.getId() } |
1068 | }); | 1137 | }); |
1069 | } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) | 1138 | } else if (word1.getNotion().getPartOfSpeech() == |
1139 | part_of_speech::adverb) | ||
1070 | { | 1140 | { |
1071 | db_.insertIntoTable( | 1141 | db_.insertIntoTable( |
1072 | "mannernymy", | 1142 | "mannernymy", |
@@ -1082,7 +1152,7 @@ namespace verbly { | |||
1082 | void generator::readWordNetSpecification() | 1152 | void generator::readWordNetSpecification() |
1083 | { | 1153 | { |
1084 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl")); | 1154 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl")); |
1085 | progress ppgs("Writing specifications...", lines.size()); | 1155 | hatkirby::progress ppgs("Writing specifications...", lines.size()); |
1086 | for (auto line : lines) | 1156 | for (auto line : lines) |
1087 | { | 1157 | { |
1088 | ppgs.update(); | 1158 | ppgs.update(); |
@@ -1094,10 +1164,17 @@ namespace verbly { | |||
1094 | continue; | 1164 | continue; |
1095 | } | 1165 | } |
1096 | 1166 | ||
1097 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | 1167 | std::pair<int, int> lookup1( |
1098 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | 1168 | std::stoi(relation_data[1]), |
1169 | std::stoi(relation_data[2])); | ||
1170 | |||
1171 | std::pair<int, int> lookup2( | ||
1172 | std::stoi(relation_data[3]), | ||
1173 | std::stoi(relation_data[4])); | ||
1099 | 1174 | ||
1100 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | 1175 | |
1176 | if (wordByWnidAndWnum_.count(lookup1) && | ||
1177 | wordByWnidAndWnum_.count(lookup2)) | ||
1101 | { | 1178 | { |
1102 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | 1179 | word& word1 = *wordByWnidAndWnum_.at(lookup1); |
1103 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | 1180 | word& word2 = *wordByWnidAndWnum_.at(lookup2); |
@@ -1115,7 +1192,7 @@ namespace verbly { | |||
1115 | void generator::readWordNetSimilarity() | 1192 | void generator::readWordNetSimilarity() |
1116 | { | 1193 | { |
1117 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl")); | 1194 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl")); |
1118 | progress ppgs("Writing adjective similarity...", lines.size()); | 1195 | hatkirby::progress ppgs("Writing adjective similarity...", lines.size()); |
1119 | for (auto line : lines) | 1196 | for (auto line : lines) |
1120 | { | 1197 | { |
1121 | ppgs.update(); | 1198 | ppgs.update(); |
@@ -1149,7 +1226,7 @@ namespace verbly { | |||
1149 | { | 1226 | { |
1150 | std::cout << "Analyzing data..." << std::endl; | 1227 | std::cout << "Analyzing data..." << std::endl; |
1151 | 1228 | ||
1152 | db_.runQuery("ANALYZE"); | 1229 | db_.execute("ANALYZE"); |
1153 | } | 1230 | } |
1154 | 1231 | ||
1155 | std::list<std::string> generator::readFile(std::string path) | 1232 | std::list<std::string> generator::readFile(std::string path) |
@@ -1183,7 +1260,8 @@ namespace verbly { | |||
1183 | case 2: return part_of_speech::verb; | 1260 | case 2: return part_of_speech::verb; |
1184 | case 3: return part_of_speech::adjective; | 1261 | case 3: return part_of_speech::adjective; |
1185 | case 4: return part_of_speech::adverb; | 1262 | case 4: return part_of_speech::adverb; |
1186 | default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); | 1263 | default: throw std::domain_error( |
1264 | "Invalid WordNet synset ID: " + std::to_string(wnid)); | ||
1187 | } | 1265 | } |
1188 | } | 1266 | } |
1189 | 1267 | ||
@@ -1296,20 +1374,30 @@ namespace verbly { | |||
1296 | std::string wnSenses(reinterpret_cast<const char*>(key)); | 1374 | std::string wnSenses(reinterpret_cast<const char*>(key)); |
1297 | xmlFree(key); | 1375 | xmlFree(key); |
1298 | 1376 | ||
1299 | auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " "); | 1377 | auto wnSenseKeys = |
1378 | hatkirby::split<std::list<std::string>>(wnSenses, " "); | ||
1379 | |||
1300 | if (!wnSenseKeys.empty()) | 1380 | if (!wnSenseKeys.empty()) |
1301 | { | 1381 | { |
1302 | std::list<std::string> tempKeys; | 1382 | std::list<std::string> tempKeys; |
1303 | 1383 | ||
1304 | std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { | 1384 | std::transform( |
1305 | return sense + "::"; | 1385 | std::begin(wnSenseKeys), |
1306 | }); | 1386 | std::end(wnSenseKeys), |
1387 | std::back_inserter(tempKeys), | ||
1388 | [] (std::string sense) { | ||
1389 | return sense + "::"; | ||
1390 | }); | ||
1307 | 1391 | ||
1308 | std::list<std::string> filteredKeys; | 1392 | std::list<std::string> filteredKeys; |
1309 | 1393 | ||
1310 | std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { | 1394 | std::remove_copy_if( |
1311 | return !wnSenseKeys_.count(sense); | 1395 | std::begin(tempKeys), |
1312 | }); | 1396 | std::end(tempKeys), |
1397 | std::back_inserter(filteredKeys), | ||
1398 | [&] (std::string sense) { | ||
1399 | return !wnSenseKeys_.count(sense); | ||
1400 | }); | ||
1313 | 1401 | ||
1314 | wnSenseKeys = std::move(filteredKeys); | 1402 | wnSenseKeys = std::move(filteredKeys); |
1315 | } | 1403 | } |
@@ -1431,10 +1519,15 @@ namespace verbly { | |||
1431 | std::string choicesStr = reinterpret_cast<const char*>(key); | 1519 | std::string choicesStr = reinterpret_cast<const char*>(key); |
1432 | xmlFree(key); | 1520 | xmlFree(key); |
1433 | 1521 | ||
1434 | for (std::string choice : split<std::list<std::string>>(choicesStr, " ")) | 1522 | auto choices = |
1523 | hatkirby::split<std::list<std::string>>( | ||
1524 | choicesStr, " "); | ||
1525 | |||
1526 | for (std::string choice : choices) | ||
1435 | { | 1527 | { |
1436 | int chloc; | 1528 | int chloc; |
1437 | while ((chloc = choice.find_first_of("_")) != std::string::npos) | 1529 | while ((chloc = choice.find_first_of("_")) |
1530 | != std::string::npos) | ||
1438 | { | 1531 | { |
1439 | choice.replace(chloc, 1, " "); | 1532 | choice.replace(chloc, 1, " "); |
1440 | } | 1533 | } |
@@ -1444,7 +1537,9 @@ namespace verbly { | |||
1444 | } else { | 1537 | } else { |
1445 | partLiteral = false; | 1538 | partLiteral = false; |
1446 | 1539 | ||
1447 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | 1540 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; |
1541 | npnode != nullptr; | ||
1542 | npnode = npnode->next) | ||
1448 | { | 1543 | { |
1449 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | 1544 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) |
1450 | { | 1545 | { |