summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2018-03-31 23:05:02 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2018-03-31 23:05:02 -0400
commit 75e947fa0021547f460496d1c3aef5b61af4c669 (patch)
tree 64559a5329b3e5983ffdfe7ee2ad65c7c938e98d /generator/generator.cpp
parent 3554df2e34e63364eea3a7998e0dfb0e6be65ca4 (diff)
download verbly-75e947fa0021547f460496d1c3aef5b61af4c669.tar.gz
verbly-75e947fa0021547f460496d1c3aef5b61af4c669.tar.bz2
verbly-75e947fa0021547f460496d1c3aef5b61af4c669.zip
Migrated generator to hkutil
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- generator/generator.cpp 275
1 files changed, 185 insertions, 90 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index e34ca69..785ec87 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -1,16 +1,14 @@
1#include "generator.h" 1#include "generator.h"
2#include <cassert>
3#include <stdexcept> 2#include <stdexcept>
4#include <iostream> 3#include <iostream>
5#include <regex> 4#include <regex>
6#include <dirent.h> 5#include <dirent.h>
7#include <fstream> 6#include <fstream>
8#include "../lib/enums.h" 7#include <hkutil/string.h>
9#include "progress.h" 8#include <hkutil/progress.h>
10#include "role.h" 9#include "role.h"
11#include "part.h" 10#include "part.h"
12#include "field.h" 11#include "../lib/enums.h"
13#include "../lib/util.h"
14#include "../lib/version.h" 12#include "../lib/version.h"
15 13
16namespace verbly { 14namespace verbly {
@@ -28,7 +26,7 @@ namespace verbly {
28 wordNetPath_(wordNetPath), 26 wordNetPath_(wordNetPath),
29 cmudictPath_(cmudictPath), 27 cmudictPath_(cmudictPath),
30 imageNetPath_(imageNetPath), 28 imageNetPath_(imageNetPath),
31 db_(outputPath) 29 db_(outputPath, hatkirby::dbmode::create)
32 { 30 {
33 // Ensure VerbNet directory exists 31 // Ensure VerbNet directory exists
34 DIR* dir; 32 DIR* dir;
@@ -53,7 +51,8 @@ namespace verbly {
53 51
54 // Ensure WordNet tables exist 52 // Ensure WordNet tables exist
55 for (std::string table : { 53 for (std::string table : {
56 "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" 54 "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per",
55 "sa", "sim", "syntax"
57 }) 56 })
58 { 57 {
59 if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) 58 if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl"))
@@ -166,13 +165,15 @@ namespace verbly {
166 void generator::readWordNetSynsets() 165 void generator::readWordNetSynsets()
167 { 166 {
168 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); 167 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
169 progress ppgs("Reading synsets from WordNet...", lines.size()); 168 hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size());
170 169
171 for (std::string line : lines) 170 for (std::string line : lines)
172 { 171 {
173 ppgs.update(); 172 ppgs.update();
174 173
175 std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); 174 std::regex relation(
175 "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");
176
176 std::smatch relation_data; 177 std::smatch relation_data;
177 if (!std::regex_search(line, relation_data, relation)) 178 if (!std::regex_search(line, relation_data, relation))
178 { 179 {
@@ -206,7 +207,10 @@ namespace verbly {
206 void generator::readAdjectivePositioning() 207 void generator::readAdjectivePositioning()
207 { 208 {
208 std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl")); 209 std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl"));
209 progress ppgs("Reading adjective positionings from WordNet...", lines.size()); 210
211 hatkirby::progress ppgs(
212 "Reading adjective positionings from WordNet...",
213 lines.size());
210 214
211 for (std::string line : lines) 215 for (std::string line : lines)
212 { 216 {
@@ -279,7 +283,10 @@ namespace verbly {
279 void generator::readWordNetSenseKeys() 283 void generator::readWordNetSenseKeys()
280 { 284 {
281 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl")); 285 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl"));
282 progress ppgs("Reading sense keys from WordNet...", lines.size()); 286
287 hatkirby::progress ppgs(
288 "Reading sense keys from WordNet...",
289 lines.size());
283 290
284 for (std::string line : lines) 291 for (std::string line : lines)
285 { 292 {
@@ -350,7 +357,8 @@ namespace verbly {
350 } 357 }
351 358
352 xmlNodePtr top = xmlDocGetRootElement(doc); 359 xmlNodePtr top = xmlDocGetRootElement(doc);
353 if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS")))) 360 if ((top == nullptr) ||
361 (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS"))))
354 { 362 {
355 throw std::logic_error("Bad VerbNet file format: " + filename); 363 throw std::logic_error("Bad VerbNet file format: " + filename);
356 } 364 }
@@ -360,7 +368,8 @@ namespace verbly {
360 createGroup(top); 368 createGroup(top);
361 } catch (const std::exception& e) 369 } catch (const std::exception& e)
362 { 370 {
363 std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); 371 std::throw_with_nested(
372 std::logic_error("Error parsing VerbNet file: " + filename));
364 } 373 }
365 } 374 }
366 375
@@ -370,7 +379,7 @@ namespace verbly {
370 void generator::readAgidInflections() 379 void generator::readAgidInflections()
371 { 380 {
372 std::list<std::string> lines(readFile(agidPath_)); 381 std::list<std::string> lines(readFile(agidPath_));
373 progress ppgs("Reading inflections from AGID...", lines.size()); 382 hatkirby::progress ppgs("Reading inflections from AGID...", lines.size());
374 383
375 for (std::string line : lines) 384 for (std::string line : lines)
376 { 385 {
@@ -395,12 +404,17 @@ namespace verbly {
395 404
396 lemma& curLemma = lookupOrCreateLemma(infinitive); 405 lemma& curLemma = lookupOrCreateLemma(infinitive);
397 406
407 auto inflWordList =
408 hatkirby::split<std::list<std::string>>(line, " | ");
409
398 std::vector<std::list<std::string>> agidForms; 410 std::vector<std::list<std::string>> agidForms;
399 for (std::string inflForms : split<std::list<std::string>>(line, " | ")) 411 for (std::string inflForms : inflWordList)
400 { 412 {
401 std::list<std::string> forms; 413 auto inflFormList =
414 hatkirby::split<std::list<std::string>>(std::move(inflForms), ", ");
402 415
403 for (std::string inflForm : split<std::list<std::string>>(std::move(inflForms), ", ")) 416 std::list<std::string> forms;
417 for (std::string inflForm : inflFormList)
404 { 418 {
405 int sympos = inflForm.find_first_of("~<!? "); 419 int sympos = inflForm.find_first_of("~<!? ");
406 if (sympos != std::string::npos) 420 if (sympos != std::string::npos)
@@ -443,7 +457,8 @@ namespace verbly {
443 // - may and shall do not conjugate the way we want them to 457 // - may and shall do not conjugate the way we want them to
444 // - methinks only has a past tense and is an outlier 458 // - methinks only has a past tense and is an outlier
445 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now 459 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
446 std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; 460 std::cout << " Ignoring verb \"" << infinitive
461 << "\" due to non-standard number of forms." << std::endl;
447 } 462 }
448 463
449 // For verbs in particular, we sometimes create a notion and a word 464 // For verbs in particular, we sometimes create a notion and a word
@@ -452,9 +467,13 @@ namespace verbly {
452 // that this verb appears in the AGID data but not in either WordNet 467 // that this verb appears in the AGID data but not in either WordNet
453 // or VerbNet. 468 // or VerbNet.
454 if (!wordsByBaseForm_.count(infinitive) 469 if (!wordsByBaseForm_.count(infinitive)
455 || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) { 470 || !std::any_of(
456 return w->getNotion().getPartOfSpeech() == part_of_speech::verb; 471 std::begin(wordsByBaseForm_.at(infinitive)),
457 })) 472 std::end(wordsByBaseForm_.at(infinitive)),
473 [] (word* w) {
474 return (w->getNotion().getPartOfSpeech() ==
475 part_of_speech::verb);
476 }))
458 { 477 {
459 notion& n = createNotion(part_of_speech::verb); 478 notion& n = createNotion(part_of_speech::verb);
460 createWord(n, curLemma); 479 createWord(n, curLemma);
@@ -471,7 +490,8 @@ namespace verbly {
471 mappedForms[inflection::superlative] = agidForms[1]; 490 mappedForms[inflection::superlative] = agidForms[1];
472 } else { 491 } else {
473 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" 492 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
474 std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; 493 std::cout << " Ignoring adjective/adverb \"" << infinitive
494 << "\" due to non-standard number of forms." << std::endl;
475 } 495 }
476 496
477 break; 497 break;
@@ -484,7 +504,8 @@ namespace verbly {
484 mappedForms[inflection::plural] = agidForms[0]; 504 mappedForms[inflection::plural] = agidForms[0];
485 } else { 505 } else {
486 // As of AGID 2014.08.11, this is non-existent. 506 // As of AGID 2014.08.11, this is non-existent.
487 std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; 507 std::cout << " Ignoring noun \"" << infinitive
508 << "\" due to non-standard number of forms." << std::endl;
488 } 509 }
489 510
490 break; 511 break;
@@ -496,7 +517,9 @@ namespace verbly {
496 { 517 {
497 for (std::string infl : std::move(mapping.second)) 518 for (std::string infl : std::move(mapping.second))
498 { 519 {
499 curLemma.addInflection(mapping.first, lookupOrCreateForm(std::move(infl))); 520 curLemma.addInflection(
521 mapping.first,
522 lookupOrCreateForm(std::move(infl)));
500 } 523 }
501 } 524 }
502 } 525 }
@@ -505,7 +528,7 @@ namespace verbly {
505 void generator::readPrepositions() 528 void generator::readPrepositions()
506 { 529 {
507 std::list<std::string> lines(readFile("prepositions.txt")); 530 std::list<std::string> lines(readFile("prepositions.txt"));
508 progress ppgs("Reading prepositions...", lines.size()); 531 hatkirby::progress ppgs("Reading prepositions...", lines.size());
509 532
510 for (std::string line : lines) 533 for (std::string line : lines)
511 { 534 {
@@ -515,7 +538,9 @@ namespace verbly {
515 std::smatch relation_data; 538 std::smatch relation_data;
516 std::regex_search(line, relation_data, relation); 539 std::regex_search(line, relation_data, relation);
517 std::string prep = relation_data[1]; 540 std::string prep = relation_data[1];
518 auto groups = split<std::list<std::string>>(relation_data[2], ", "); 541
542 auto groups =
543 hatkirby::split<std::list<std::string>>(relation_data[2], ", ");
519 544
520 notion& n = createNotion(part_of_speech::preposition); 545 notion& n = createNotion(part_of_speech::preposition);
521 lemma& l = lookupOrCreateLemma(prep); 546 lemma& l = lookupOrCreateLemma(prep);
@@ -528,7 +553,10 @@ namespace verbly {
528 void generator::readCmudictPronunciations() 553 void generator::readCmudictPronunciations()
529 { 554 {
530 std::list<std::string> lines(readFile(cmudictPath_)); 555 std::list<std::string> lines(readFile(cmudictPath_));
531 progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); 556
557 hatkirby::progress ppgs(
558 "Reading pronunciations from CMUDICT...",
559 lines.size());
532 560
533 for (std::string line : lines) 561 for (std::string line : lines)
534 { 562 {
@@ -538,8 +566,7 @@ namespace verbly {
538 std::smatch phoneme_data; 566 std::smatch phoneme_data;
539 if (std::regex_search(line, phoneme_data, phoneme)) 567 if (std::regex_search(line, phoneme_data, phoneme))
540 { 568 {
541 std::string canonical(phoneme_data[1]); 569 std::string canonical = hatkirby::lowercase(phoneme_data[1]);
542 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
543 570
544 if (!formByText_.count(canonical)) 571 if (!formByText_.count(canonical))
545 { 572 {
@@ -575,13 +602,14 @@ namespace verbly {
575 } 602 }
576 603
577 std::string schema = schemaBuilder.str(); 604 std::string schema = schemaBuilder.str();
578 auto queries = split<std::list<std::string>>(schema, ";"); 605 auto queries = hatkirby::split<std::list<std::string>>(schema, ";");
579 progress ppgs("Writing database schema...", queries.size()); 606
607 hatkirby::progress ppgs("Writing database schema...", queries.size());
580 for (std::string query : queries) 608 for (std::string query : queries)
581 { 609 {
582 if (!queries.empty()) 610 if (!queries.empty())
583 { 611 {
584 db_.runQuery(query); 612 db_.execute(query);
585 } 613 }
586 614
587 ppgs.update(); 615 ppgs.update();
@@ -590,10 +618,6 @@ namespace verbly {
590 618
591 void generator::writeVersion() 619 void generator::writeVersion()
592 { 620 {
593 std::list<field> fields;
594 fields.emplace_back("major", DATABASE_MAJOR_VERSION);
595 fields.emplace_back("minor", DATABASE_MINOR_VERSION);
596
597 db_.insertIntoTable( 621 db_.insertIntoTable(
598 "version", 622 "version",
599 { 623 {
@@ -605,7 +629,7 @@ namespace verbly {
605 void generator::dumpObjects() 629 void generator::dumpObjects()
606 { 630 {
607 { 631 {
608 progress ppgs("Writing notions...", notions_.size()); 632 hatkirby::progress ppgs("Writing notions...", notions_.size());
609 633
610 for (notion& n : notions_) 634 for (notion& n : notions_)
611 { 635 {
@@ -616,7 +640,7 @@ namespace verbly {
616 } 640 }
617 641
618 { 642 {
619 progress ppgs("Writing words...", words_.size()); 643 hatkirby::progress ppgs("Writing words...", words_.size());
620 644
621 for (word& w : words_) 645 for (word& w : words_)
622 { 646 {
@@ -627,7 +651,7 @@ namespace verbly {
627 } 651 }
628 652
629 { 653 {
630 progress ppgs("Writing lemmas...", lemmas_.size()); 654 hatkirby::progress ppgs("Writing lemmas...", lemmas_.size());
631 655
632 for (lemma& l : lemmas_) 656 for (lemma& l : lemmas_)
633 { 657 {
@@ -638,7 +662,7 @@ namespace verbly {
638 } 662 }
639 663
640 { 664 {
641 progress ppgs("Writing forms...", forms_.size()); 665 hatkirby::progress ppgs("Writing forms...", forms_.size());
642 666
643 for (form& f : forms_) 667 for (form& f : forms_)
644 { 668 {
@@ -649,7 +673,7 @@ namespace verbly {
649 } 673 }
650 674
651 { 675 {
652 progress ppgs("Writing pronunciations...", pronunciations_.size()); 676 hatkirby::progress ppgs("Writing pronunciations...", pronunciations_.size());
653 677
654 for (pronunciation& p : pronunciations_) 678 for (pronunciation& p : pronunciations_)
655 { 679 {
@@ -660,7 +684,7 @@ namespace verbly {
660 } 684 }
661 685
662 { 686 {
663 progress ppgs("Writing verb frames...", groups_.size()); 687 hatkirby::progress ppgs("Writing verb frames...", groups_.size());
664 688
665 for (group& g : groups_) 689 for (group& g : groups_)
666 { 690 {
@@ -674,22 +698,30 @@ namespace verbly {
674 void generator::readWordNetAntonymy() 698 void generator::readWordNetAntonymy()
675 { 699 {
676 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); 700 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl"));
677 progress ppgs("Writing antonyms...", lines.size()); 701 hatkirby::progress ppgs("Writing antonyms...", lines.size());
678 for (auto line : lines) 702 for (auto line : lines)
679 { 703 {
680 ppgs.update(); 704 ppgs.update();
681 705
682 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); 706 std::regex relation(
707 "^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
708
683 std::smatch relation_data; 709 std::smatch relation_data;
684 if (!std::regex_search(line, relation_data, relation)) 710 if (!std::regex_search(line, relation_data, relation))
685 { 711 {
686 continue; 712 continue;
687 } 713 }
688 714
689 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); 715 std::pair<int, int> lookup1(
690 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); 716 std::stoi(relation_data[1]),
717 std::stoi(relation_data[2]));
718
719 std::pair<int, int> lookup2(
720 std::stoi(relation_data[3]),
721 std::stoi(relation_data[4]));
691 722
692 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) 723 if (wordByWnidAndWnum_.count(lookup1) &&
724 wordByWnidAndWnum_.count(lookup2))
693 { 725 {
694 word& word1 = *wordByWnidAndWnum_.at(lookup1); 726 word& word1 = *wordByWnidAndWnum_.at(lookup1);
695 word& word2 = *wordByWnidAndWnum_.at(lookup2); 727 word& word2 = *wordByWnidAndWnum_.at(lookup2);
@@ -707,7 +739,7 @@ namespace verbly {
707 void generator::readWordNetVariation() 739 void generator::readWordNetVariation()
708 { 740 {
709 std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl")); 741 std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl"));
710 progress ppgs("Writing variation...", lines.size()); 742 hatkirby::progress ppgs("Writing variation...", lines.size());
711 for (auto line : lines) 743 for (auto line : lines)
712 { 744 {
713 ppgs.update(); 745 ppgs.update();
@@ -730,7 +762,7 @@ namespace verbly {
730 db_.insertIntoTable( 762 db_.insertIntoTable(
731 "variation", 763 "variation",
732 { 764 {
733 { "noun_id", notion1.getId() } 765 { "noun_id", notion1.getId() },
734 { "adjective_id", notion2.getId() } 766 { "adjective_id", notion2.getId() }
735 }); 767 });
736 } 768 }
@@ -740,20 +772,32 @@ namespace verbly {
740 void generator::readWordNetClasses() 772 void generator::readWordNetClasses()
741 { 773 {
742 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); 774 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl"));
743 progress ppgs("Writing usage, topicality, and regionality...", lines.size()); 775
776 hatkirby::progress ppgs(
777 "Writing usage, topicality, and regionality...",
778 lines.size());
779
744 for (auto line : lines) 780 for (auto line : lines)
745 { 781 {
746 ppgs.update(); 782 ppgs.update();
747 783
748 std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); 784 std::regex relation(
785 "^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\.");
786
749 std::smatch relation_data; 787 std::smatch relation_data;
750 if (!std::regex_search(line, relation_data, relation)) 788 if (!std::regex_search(line, relation_data, relation))
751 { 789 {
752 continue; 790 continue;
753 } 791 }
754 792
755 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); 793 std::pair<int, int> lookup1(
756 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); 794 std::stoi(relation_data[1]),
795 std::stoi(relation_data[2]));
796
797 std::pair<int, int> lookup2(
798 std::stoi(relation_data[3]),
799 std::stoi(relation_data[4]));
800
757 std::string class_type = relation_data[5]; 801 std::string class_type = relation_data[5];
758 802
759 std::string table_name; 803 std::string table_name;
@@ -773,18 +817,30 @@ namespace verbly {
773 817
774 if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) 818 if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first)))
775 { 819 {
776 std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { 820 auto& wordSet = wordsByWnid_.at(lookup1.first);
777 return w->getId(); 821
778 }); 822 std::transform(
823 std::begin(wordSet),
824 std::end(wordSet),
825 std::back_inserter(leftJoin),
826 [] (word* w) {
827 return w->getId();
828 });
779 } else if (wordByWnidAndWnum_.count(lookup1)) { 829 } else if (wordByWnidAndWnum_.count(lookup1)) {
780 leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); 830 leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId());
781 } 831 }
782 832
783 if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) 833 if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first)))
784 { 834 {
785 std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { 835 auto& wordSet = wordsByWnid_.at(lookup2.first);
786 return w->getId(); 836
787 }); 837 std::transform(
838 std::begin(wordSet),
839 std::end(wordSet),
840 std::back_inserter(rightJoin),
841 [] (word* w) {
842 return w->getId();
843 });
788 } else if (wordByWnidAndWnum_.count(lookup2)) { 844 } else if (wordByWnidAndWnum_.count(lookup2)) {
789 rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); 845 rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId());
790 } 846 }
@@ -807,7 +863,7 @@ namespace verbly {
807 void generator::readWordNetCausality() 863 void generator::readWordNetCausality()
808 { 864 {
809 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl")); 865 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl"));
810 progress ppgs("Writing causality...", lines.size()); 866 hatkirby::progress ppgs("Writing causality...", lines.size());
811 for (auto line : lines) 867 for (auto line : lines)
812 { 868 {
813 ppgs.update(); 869 ppgs.update();
@@ -840,7 +896,7 @@ namespace verbly {
840 void generator::readWordNetEntailment() 896 void generator::readWordNetEntailment()
841 { 897 {
842 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl")); 898 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl"));
843 progress ppgs("Writing entailment...", lines.size()); 899 hatkirby::progress ppgs("Writing entailment...", lines.size());
844 for (auto line : lines) 900 for (auto line : lines)
845 { 901 {
846 ppgs.update(); 902 ppgs.update();
@@ -873,7 +929,7 @@ namespace verbly {
873 void generator::readWordNetHypernymy() 929 void generator::readWordNetHypernymy()
874 { 930 {
875 std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl")); 931 std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl"));
876 progress ppgs("Writing hypernymy...", lines.size()); 932 hatkirby::progress ppgs("Writing hypernymy...", lines.size());
877 for (auto line : lines) 933 for (auto line : lines)
878 { 934 {
879 ppgs.update(); 935 ppgs.update();
@@ -906,7 +962,7 @@ namespace verbly {
906 void generator::readWordNetInstantiation() 962 void generator::readWordNetInstantiation()
907 { 963 {
908 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl")); 964 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl"));
909 progress ppgs("Writing instantiation...", lines.size()); 965 hatkirby::progress ppgs("Writing instantiation...", lines.size());
910 for (auto line : lines) 966 for (auto line : lines)
911 { 967 {
912 ppgs.update(); 968 ppgs.update();
@@ -939,7 +995,7 @@ namespace verbly {
939 void generator::readWordNetMemberMeronymy() 995 void generator::readWordNetMemberMeronymy()
940 { 996 {
941 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl")); 997 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl"));
942 progress ppgs("Writing member meronymy...", lines.size()); 998 hatkirby::progress ppgs("Writing member meronymy...", lines.size());
943 for (auto line : lines) 999 for (auto line : lines)
944 { 1000 {
945 ppgs.update(); 1001 ppgs.update();
@@ -972,7 +1028,7 @@ namespace verbly {
972 void generator::readWordNetPartMeronymy() 1028 void generator::readWordNetPartMeronymy()
973 { 1029 {
974 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl")); 1030 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl"));
975 progress ppgs("Writing part meronymy...", lines.size()); 1031 hatkirby::progress ppgs("Writing part meronymy...", lines.size());
976 for (auto line : lines) 1032 for (auto line : lines)
977 { 1033 {
978 ppgs.update(); 1034 ppgs.update();
@@ -1005,7 +1061,7 @@ namespace verbly {
1005 void generator::readWordNetSubstanceMeronymy() 1061 void generator::readWordNetSubstanceMeronymy()
1006 { 1062 {
1007 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl")); 1063 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl"));
1008 progress ppgs("Writing substance meronymy...", lines.size()); 1064 hatkirby::progress ppgs("Writing substance meronymy...", lines.size());
1009 for (auto line : lines) 1065 for (auto line : lines)
1010 { 1066 {
1011 ppgs.update(); 1067 ppgs.update();
@@ -1038,27 +1094,40 @@ namespace verbly {
1038 void generator::readWordNetPertainymy() 1094 void generator::readWordNetPertainymy()
1039 { 1095 {
1040 std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); 1096 std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl"));
1041 progress ppgs("Writing pertainymy and mannernymy...", lines.size()); 1097
1098 hatkirby::progress ppgs(
1099 "Writing pertainymy and mannernymy...",
1100 lines.size());
1101
1042 for (auto line : lines) 1102 for (auto line : lines)
1043 { 1103 {
1044 ppgs.update(); 1104 ppgs.update();
1045 1105
1046 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); 1106 std::regex relation(
1107 "^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\.");
1108
1047 std::smatch relation_data; 1109 std::smatch relation_data;
1048 if (!std::regex_search(line, relation_data, relation)) 1110 if (!std::regex_search(line, relation_data, relation))
1049 { 1111 {
1050 continue; 1112 continue;
1051 } 1113 }
1052 1114
1053 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); 1115 std::pair<int, int> lookup1(
1054 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); 1116 std::stoi(relation_data[1]),
1117 std::stoi(relation_data[2]));
1055 1118
1056 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) 1119 std::pair<int, int> lookup2(
1120 std::stoi(relation_data[3]),
1121 std::stoi(relation_data[4]));
1122
1123 if (wordByWnidAndWnum_.count(lookup1) &&
1124 wordByWnidAndWnum_.count(lookup2))
1057 { 1125 {
1058 word& word1 = *wordByWnidAndWnum_.at(lookup1); 1126 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1059 word& word2 = *wordByWnidAndWnum_.at(lookup2); 1127 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1060 1128
1061 if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) 1129 if (word1.getNotion().getPartOfSpeech() ==
1130 part_of_speech::adjective)
1062 { 1131 {
1063 db_.insertIntoTable( 1132 db_.insertIntoTable(
1064 "pertainymy", 1133 "pertainymy",
@@ -1066,7 +1135,8 @@ namespace verbly {
1066 { "pertainym_id", word1.getId() }, 1135 { "pertainym_id", word1.getId() },
1067 { "noun_id", word2.getId() } 1136 { "noun_id", word2.getId() }
1068 }); 1137 });
1069 } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) 1138 } else if (word1.getNotion().getPartOfSpeech() ==
1139 part_of_speech::adverb)
1070 { 1140 {
1071 db_.insertIntoTable( 1141 db_.insertIntoTable(
1072 "mannernymy", 1142 "mannernymy",
@@ -1082,7 +1152,7 @@ namespace verbly {
1082 void generator::readWordNetSpecification() 1152 void generator::readWordNetSpecification()
1083 { 1153 {
1084 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl")); 1154 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl"));
1085 progress ppgs("Writing specifications...", lines.size()); 1155 hatkirby::progress ppgs("Writing specifications...", lines.size());
1086 for (auto line : lines) 1156 for (auto line : lines)
1087 { 1157 {
1088 ppgs.update(); 1158 ppgs.update();
@@ -1094,10 +1164,17 @@ namespace verbly {
1094 continue; 1164 continue;
1095 } 1165 }
1096 1166
1097 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); 1167 std::pair<int, int> lookup1(
1098 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); 1168 std::stoi(relation_data[1]),
1169 std::stoi(relation_data[2]));
1170
1171 std::pair<int, int> lookup2(
1172 std::stoi(relation_data[3]),
1173 std::stoi(relation_data[4]));
1099 1174
1100 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) 1175
1176 if (wordByWnidAndWnum_.count(lookup1) &&
1177 wordByWnidAndWnum_.count(lookup2))
1101 { 1178 {
1102 word& word1 = *wordByWnidAndWnum_.at(lookup1); 1179 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1103 word& word2 = *wordByWnidAndWnum_.at(lookup2); 1180 word& word2 = *wordByWnidAndWnum_.at(lookup2);
@@ -1115,7 +1192,7 @@ namespace verbly {
1115 void generator::readWordNetSimilarity() 1192 void generator::readWordNetSimilarity()
1116 { 1193 {
1117 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl")); 1194 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl"));
1118 progress ppgs("Writing adjective similarity...", lines.size()); 1195 hatkirby::progress ppgs("Writing adjective similarity...", lines.size());
1119 for (auto line : lines) 1196 for (auto line : lines)
1120 { 1197 {
1121 ppgs.update(); 1198 ppgs.update();
@@ -1149,7 +1226,7 @@ namespace verbly {
1149 { 1226 {
1150 std::cout << "Analyzing data..." << std::endl; 1227 std::cout << "Analyzing data..." << std::endl;
1151 1228
1152 db_.runQuery("ANALYZE"); 1229 db_.execute("ANALYZE");
1153 } 1230 }
1154 1231
1155 std::list<std::string> generator::readFile(std::string path) 1232 std::list<std::string> generator::readFile(std::string path)
@@ -1183,7 +1260,8 @@ namespace verbly {
1183 case 2: return part_of_speech::verb; 1260 case 2: return part_of_speech::verb;
1184 case 3: return part_of_speech::adjective; 1261 case 3: return part_of_speech::adjective;
1185 case 4: return part_of_speech::adverb; 1262 case 4: return part_of_speech::adverb;
1186 default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); 1263 default: throw std::domain_error(
1264 "Invalid WordNet synset ID: " + std::to_string(wnid));
1187 } 1265 }
1188 } 1266 }
1189 1267
@@ -1296,20 +1374,30 @@ namespace verbly {
1296 std::string wnSenses(reinterpret_cast<const char*>(key)); 1374 std::string wnSenses(reinterpret_cast<const char*>(key));
1297 xmlFree(key); 1375 xmlFree(key);
1298 1376
1299 auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " "); 1377 auto wnSenseKeys =
1378 hatkirby::split<std::list<std::string>>(wnSenses, " ");
1379
1300 if (!wnSenseKeys.empty()) 1380 if (!wnSenseKeys.empty())
1301 { 1381 {
1302 std::list<std::string> tempKeys; 1382 std::list<std::string> tempKeys;
1303 1383
1304 std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { 1384 std::transform(
1305 return sense + "::"; 1385 std::begin(wnSenseKeys),
1306 }); 1386 std::end(wnSenseKeys),
1387 std::back_inserter(tempKeys),
1388 [] (std::string sense) {
1389 return sense + "::";
1390 });
1307 1391
1308 std::list<std::string> filteredKeys; 1392 std::list<std::string> filteredKeys;
1309 1393
1310 std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { 1394 std::remove_copy_if(
1311 return !wnSenseKeys_.count(sense); 1395 std::begin(tempKeys),
1312 }); 1396 std::end(tempKeys),
1397 std::back_inserter(filteredKeys),
1398 [&] (std::string sense) {
1399 return !wnSenseKeys_.count(sense);
1400 });
1313 1401
1314 wnSenseKeys = std::move(filteredKeys); 1402 wnSenseKeys = std::move(filteredKeys);
1315 } 1403 }
@@ -1431,10 +1519,15 @@ namespace verbly {
1431 std::string choicesStr = reinterpret_cast<const char*>(key); 1519 std::string choicesStr = reinterpret_cast<const char*>(key);
1432 xmlFree(key); 1520 xmlFree(key);
1433 1521
1434 for (std::string choice : split<std::list<std::string>>(choicesStr, " ")) 1522 auto choices =
1523 hatkirby::split<std::list<std::string>>(
1524 choicesStr, " ");
1525
1526 for (std::string choice : choices)
1435 { 1527 {
1436 int chloc; 1528 int chloc;
1437 while ((chloc = choice.find_first_of("_")) != std::string::npos) 1529 while ((chloc = choice.find_first_of("_"))
1530 != std::string::npos)
1438 { 1531 {
1439 choice.replace(chloc, 1, " "); 1532 choice.replace(chloc, 1, " ");
1440 } 1533 }
@@ -1444,7 +1537,9 @@ namespace verbly {
1444 } else { 1537 } else {
1445 partLiteral = false; 1538 partLiteral = false;
1446 1539
1447 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) 1540 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode;
1541 npnode != nullptr;
1542 npnode = npnode->next)
1448 { 1543 {
1449 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) 1544 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1450 { 1545 {