summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- generator/generator.cpp 190
1 files changed, 156 insertions, 34 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 6fbbfb8..3201154 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -76,12 +76,24 @@ struct group_t {
76 std::list<std::list<framepart_t>> frames; 76 std::list<std::list<framepart_t>> frames;
77}; 77};
78 78
// One pronunciation of a word: the full phoneme string plus two pieces
// derived from it by the pronunciation-parsing code in main().
79struct pronunciation_t {
// Full space-separated phoneme string, stored exactly as read from the
// pronunciation file.
80 std::string phonemes;
// The phoneme immediately preceding the start of `rhyme`, copied as-is
// (digits are not stripped). Empty when `rhyme` starts at the first phoneme
// or when `rhyme` itself is empty.
81 std::string prerhyme;
// Phonemes from the first one containing the digit '1' through the end,
// with digits removed and joined by single spaces. Empty when no phoneme
// contains '1'.
// NOTE(review): '1' is presumably the CMUDICT primary-stress marker — confirm.
82 std::string rhyme;
83
// Orders by `phonemes` only. This is the comparison used by the
// std::set<pronunciation_t> values of `pronunciations`, so two entries with
// the same phoneme string are treated as the same element regardless of
// `prerhyme` / `rhyme`.
84 bool operator<(const pronunciation_t& other) const
85 {
86 return phonemes < other.phonemes;
87 }
88};
89
79std::map<std::string, group_t> groups; 90std::map<std::string, group_t> groups;
80std::map<std::string, verb_t> verbs; 91std::map<std::string, verb_t> verbs;
81std::map<std::string, adjective_t> adjectives; 92std::map<std::string, adjective_t> adjectives;
82std::map<std::string, noun_t> nouns; 93std::map<std::string, noun_t> nouns;
83std::map<int, std::map<int, int>> wn; 94std::map<int, std::map<int, int>> wn;
84std::map<std::string, std::set<std::string>> pronunciations; 95std::map<int, int> images;
96std::map<std::string, std::set<pronunciation_t>> pronunciations;
85 97
86void print_usage() 98void print_usage()
87{ 99{
@@ -89,10 +101,10 @@ void print_usage()
89 std::cout << "-------------------------" << std::endl; 101 std::cout << "-------------------------" << std::endl;
90 std::cout << "Requires exactly six arguments." << std::endl; 102 std::cout << "Requires exactly six arguments." << std::endl;
91 std::cout << "1. The path to a VerbNet data directory." << std::endl; 103 std::cout << "1. The path to a VerbNet data directory." << std::endl;
92 std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; 104 std::cout << "2. The path to an AGID infl.txt file." << std::endl;
93 std::cout << "3. The path to an AGID infl.txt file." << std::endl; 105 std::cout << "3. The path to a WordNet prolog data directory." << std::endl;
94 std::cout << "4. The path to a WordNet prolog data directory." << std::endl; 106 std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl;
95 std::cout << "5. The path to a CMUDICT pronunciation file." << std::endl; 107 std::cout << "5. The path to an ImageNet urls.txt file." << std::endl;
96 std::cout << "6. Datafile output path." << std::endl; 108 std::cout << "6. Datafile output path." << std::endl;
97 109
98 exit(1); 110 exit(1);
@@ -431,10 +443,10 @@ int main(int argc, char** argv)
431 // Get verbs from AGID 443 // Get verbs from AGID
432 std::cout << "Reading inflections..." << std::endl; 444 std::cout << "Reading inflections..." << std::endl;
433 445
434 std::ifstream agidfile(argv[3]); 446 std::ifstream agidfile(argv[2]);
435 if (!agidfile.is_open()) 447 if (!agidfile.is_open())
436 { 448 {
437 std::cout << "Could not open AGID file: " << argv[3] << std::endl; 449 std::cout << "Could not open AGID file: " << argv[2] << std::endl;
438 print_usage(); 450 print_usage();
439 } 451 }
440 452
@@ -562,10 +574,10 @@ int main(int argc, char** argv)
562 // Pronounciations 574 // Pronounciations
563 std::cout << "Reading pronunciations..." << std::endl; 575 std::cout << "Reading pronunciations..." << std::endl;
564 576
565 std::ifstream pronfile(argv[5]); 577 std::ifstream pronfile(argv[4]);
566 if (!pronfile.is_open()) 578 if (!pronfile.is_open())
567 { 579 {
568 std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl; 580 std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl;
569 print_usage(); 581 print_usage();
570 } 582 }
571 583
@@ -589,10 +601,80 @@ int main(int argc, char** argv)
589 std::string canonical(phoneme_data[1]); 601 std::string canonical(phoneme_data[1]);
590 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); 602 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
591 603
592 pronunciations[canonical].insert(phoneme_data[2]); 604 std::string phonemes = phoneme_data[2];
605 auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " ");
606 auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) {
607 return phoneme.find("1") != std::string::npos;
608 });
609
610 pronunciation_t p;
611 p.phonemes = phonemes;
612 if (phemstrt != std::end(phoneme_set))
613 {
614 std::stringstream rhymer;
615 for (auto it = phemstrt; it != std::end(phoneme_set); it++)
616 {
617 std::string naked;
618 std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) {
619 return isdigit(ch);
620 });
621
622 if (it != phemstrt)
623 {
624 rhymer << " ";
625 }
626
627 rhymer << naked;
628 }
629
630 p.rhyme = rhymer.str();
631
632 if (phemstrt != std::begin(phoneme_set))
633 {
634 phemstrt--;
635 p.prerhyme = *phemstrt;
636 } else {
637 p.prerhyme = "";
638 }
639 } else {
640 p.prerhyme = "";
641 p.rhyme = "";
642 }
643
644 pronunciations[canonical].insert(p);
645 }
646 }
647
648 // Images
649 std::cout << "Reading images..." << std::endl;
650
651 std::ifstream imagefile(argv[5]);
652 if (!imagefile.is_open())
653 {
654 std::cout << "Could not open ImageNet file: " << argv[5] << std::endl;
655 print_usage();
656 }
657
658 for (;;)
659 {
660 std::string line;
661 if (!getline(imagefile, line))
662 {
663 break;
593 } 664 }
665
666 if (line.back() == '\r')
667 {
668 line.pop_back();
669 }
670
671 std::string wnid_s = line.substr(1, 8);
672 int wnid = stoi(wnid_s) + 100000000;
673 images[wnid]++;
594 } 674 }
595 675
676 imagefile.close();
677
596 // Start writing output 678 // Start writing output
597 std::cout << "Writing schema..." << std::endl; 679 std::cout << "Writing schema..." << std::endl;
598 680
@@ -689,7 +771,7 @@ int main(int argc, char** argv)
689 db_error(ppdb, query); 771 db_error(ppdb, query);
690 } 772 }
691 773
692 sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_STATIC); 774 sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT);
693 775
694 if (sqlite3_step(ppstmt) != SQLITE_DONE) 776 if (sqlite3_step(ppstmt) != SQLITE_DONE)
695 { 777 {
@@ -721,7 +803,7 @@ int main(int argc, char** argv)
721 } 803 }
722 804
723 sqlite3_bind_int(ppstmt, 1, rowid); 805 sqlite3_bind_int(ppstmt, 1, rowid);
724 sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_STATIC); 806 sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT);
725 807
726 if (sqlite3_step(ppstmt) != SQLITE_DONE) 808 if (sqlite3_step(ppstmt) != SQLITE_DONE)
727 { 809 {
@@ -744,11 +826,11 @@ int main(int argc, char** argv)
744 db_error(ppdb, query); 826 db_error(ppdb, query);
745 } 827 }
746 828
747 sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC); 829 sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT);
748 sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC); 830 sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT);
749 sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC); 831 sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT);
750 sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC); 832 sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT);
751 sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC); 833 sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT);
752 834
753 if (sqlite3_step(ppstmt) != SQLITE_DONE) 835 if (sqlite3_step(ppstmt) != SQLITE_DONE)
754 { 836 {
@@ -780,14 +862,26 @@ int main(int argc, char** argv)
780 862
781 for (auto pronunciation : pronunciations[canonical]) 863 for (auto pronunciation : pronunciations[canonical])
782 { 864 {
783 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation) VALUES (?, ?)"; 865 if (!pronunciation.rhyme.empty())
866 {
867 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, prerhyme, rhyme) VALUES (?, ?, ?, ?)";
868 } else {
869 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation) VALUES (?, ?)";
870 }
871
784 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) 872 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
785 { 873 {
786 db_error(ppdb, query); 874 db_error(ppdb, query);
787 } 875 }
788 876
789 sqlite3_bind_int(ppstmt, 1, rowid); 877 sqlite3_bind_int(ppstmt, 1, rowid);
790 sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC); 878 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
879
880 if (!pronunciation.rhyme.empty())
881 {
882 sqlite3_bind_text(ppstmt, 3, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
883 sqlite3_bind_text(ppstmt, 4, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
884 }
791 885
792 if (sqlite3_step(ppstmt) != SQLITE_DONE) 886 if (sqlite3_step(ppstmt) != SQLITE_DONE)
793 { 887 {
@@ -825,7 +919,7 @@ int main(int argc, char** argv)
825 db_error(ppdb, query); 919 db_error(ppdb, query);
826 } 920 }
827 921
828 sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_STATIC); 922 sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT);
829 923
830 if (sqlite3_step(ppstmt) != SQLITE_DONE) 924 if (sqlite3_step(ppstmt) != SQLITE_DONE)
831 { 925 {
@@ -918,7 +1012,7 @@ int main(int argc, char** argv)
918 } 1012 }
919 1013
920 sqlite3_bind_int(ppstmt, 1, gid); 1014 sqlite3_bind_int(ppstmt, 1, gid);
921 sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_STATIC); 1015 sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT);
922 1016
923 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1017 if (sqlite3_step(ppstmt) != SQLITE_DONE)
924 { 1018 {
@@ -972,7 +1066,7 @@ int main(int argc, char** argv)
972 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) 1066 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
973 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) 1067 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
974 // - syntax: positioning flags for some adjectives 1068 // - syntax: positioning flags for some adjectives
975 std::string wnpref {argv[4]}; 1069 std::string wnpref {argv[3]};
976 if (wnpref.back() != '/') 1070 if (wnpref.back() != '/')
977 { 1071 {
978 wnpref += '/'; 1072 wnpref += '/';
@@ -1009,7 +1103,7 @@ int main(int argc, char** argv)
1009 { 1103 {
1010 ppgs.update(); 1104 ppgs.update();
1011 1105
1012 std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',"); 1106 std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$");
1013 std::smatch relation_data; 1107 std::smatch relation_data;
1014 if (!std::regex_search(line, relation_data, relation)) 1108 if (!std::regex_search(line, relation_data, relation))
1015 { 1109 {
@@ -1019,6 +1113,11 @@ int main(int argc, char** argv)
1019 int synset_id = stoi(relation_data[1]); 1113 int synset_id = stoi(relation_data[1]);
1020 int wnum = stoi(relation_data[2]); 1114 int wnum = stoi(relation_data[2]);
1021 std::string word = relation_data[3]; 1115 std::string word = relation_data[3];
1116 size_t word_it;
1117 while ((word_it = word.find("''")) != std::string::npos)
1118 {
1119 word.erase(word_it, 1);
1120 }
1022 1121
1023 std::string query; 1122 std::string query;
1024 switch (synset_id / 100000000) 1123 switch (synset_id / 100000000)
@@ -1027,9 +1126,9 @@ int main(int argc, char** argv)
1027 { 1126 {
1028 if (nouns.count(word) == 1) 1127 if (nouns.count(word) == 1)
1029 { 1128 {
1030 query = "INSERT INTO nouns (singular, proper, complexity, plural) VALUES (?, ?, ?, ?)"; 1129 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)";
1031 } else { 1130 } else {
1032 query = "INSERT INTO nouns (singular, proper, complexity) VALUES (?, ?, ?)"; 1131 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)";
1033 } 1132 }
1034 1133
1035 break; 1134 break;
@@ -1073,7 +1172,7 @@ int main(int argc, char** argv)
1073 db_error(ppdb, query); 1172 db_error(ppdb, query);
1074 } 1173 }
1075 1174
1076 sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC); 1175 sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT);
1077 switch (synset_id / 100000000) 1176 switch (synset_id / 100000000)
1078 { 1177 {
1079 case 1: // Noun 1178 case 1: // Noun
@@ -1083,10 +1182,12 @@ int main(int argc, char** argv)
1083 }) ? 1 : 0)); 1182 }) ? 1 : 0));
1084 1183
1085 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); 1184 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size());
1185 sqlite3_bind_int(ppstmt, 4, images[synset_id]);
1186 sqlite3_bind_int(ppstmt, 5, synset_id);
1086 1187
1087 if (nouns.count(word) == 1) 1188 if (nouns.count(word) == 1)
1088 { 1189 {
1089 sqlite3_bind_text(ppstmt, 4, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); 1190 sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT);
1090 } 1191 }
1091 1192
1092 break; 1193 break;
@@ -1099,8 +1200,8 @@ int main(int argc, char** argv)
1099 1200
1100 if (adjectives.count(word) == 1) 1201 if (adjectives.count(word) == 1)
1101 { 1202 {
1102 sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_STATIC); 1203 sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT);
1103 sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_STATIC); 1204 sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT);
1104 } 1205 }
1105 1206
1106 break; 1207 break;
@@ -1140,21 +1241,36 @@ int main(int argc, char** argv)
1140 { 1241 {
1141 case 1: // Noun 1242 case 1: // Noun
1142 { 1243 {
1143 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation) VALUES (?, ?)"; 1244 if (!pronunciation.rhyme.empty())
1245 {
1246 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, prerhyme, rhyme) VALUES (?, ?, ?, ?)";
1247 } else {
1248 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation) VALUES (?, ?)";
1249 }
1144 1250
1145 break; 1251 break;
1146 } 1252 }
1147 1253
1148 case 3: // Adjective 1254 case 3: // Adjective
1149 { 1255 {
1150 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation) VALUES (?, ?)"; 1256 if (!pronunciation.rhyme.empty())
1257 {
1258 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, prerhyme, rhyme) VALUES (?, ?, ?, ?)";
1259 } else {
1260 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation) VALUES (?, ?)";
1261 }
1151 1262
1152 break; 1263 break;
1153 } 1264 }
1154 1265
1155 case 4: // Adverb 1266 case 4: // Adverb
1156 { 1267 {
1157 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation) VALUES (?, ?)"; 1268 if (!pronunciation.rhyme.empty())
1269 {
1270 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, prerhyme, rhyme) VALUES (?, ?, ?, ?)";
1271 } else {
1272 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation) VALUES (?, ?)";
1273 }
1158 1274
1159 break; 1275 break;
1160 } 1276 }
@@ -1166,7 +1282,13 @@ int main(int argc, char** argv)
1166 } 1282 }
1167 1283
1168 sqlite3_bind_int(ppstmt, 1, rowid); 1284 sqlite3_bind_int(ppstmt, 1, rowid);
1169 sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC); 1285 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
1286
1287 if (!pronunciation.rhyme.empty())
1288 {
1289 sqlite3_bind_text(ppstmt, 3, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
1290 sqlite3_bind_text(ppstmt, 4, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
1291 }
1170 1292
1171 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1293 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1172 { 1294 {
@@ -2155,7 +2277,7 @@ int main(int argc, char** argv)
2155 db_error(ppdb, query); 2277 db_error(ppdb, query);
2156 } 2278 }
2157 2279
2158 sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_STATIC); 2280 sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT);
2159 sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); 2281 sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]);
2160 2282
2161 if (sqlite3_step(ppstmt) != SQLITE_DONE) 2283 if (sqlite3_step(ppstmt) != SQLITE_DONE)