diff options
Diffstat (limited to 'generator')
-rw-r--r-- | generator/generator.cpp | 57 | ||||
-rw-r--r-- | generator/schema.sql | 4 |
2 files changed, 48 insertions, 13 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 6fbbfb8..e67bda7 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -81,6 +81,7 @@ std::map<std::string, verb_t> verbs; | |||
81 | std::map<std::string, adjective_t> adjectives; | 81 | std::map<std::string, adjective_t> adjectives; |
82 | std::map<std::string, noun_t> nouns; | 82 | std::map<std::string, noun_t> nouns; |
83 | std::map<int, std::map<int, int>> wn; | 83 | std::map<int, std::map<int, int>> wn; |
84 | std::map<int, int> images; | ||
84 | std::map<std::string, std::set<std::string>> pronunciations; | 85 | std::map<std::string, std::set<std::string>> pronunciations; |
85 | 86 | ||
86 | void print_usage() | 87 | void print_usage() |
@@ -89,10 +90,10 @@ void print_usage() | |||
89 | std::cout << "-------------------------" << std::endl; | 90 | std::cout << "-------------------------" << std::endl; |
90 | std::cout << "Requires exactly six arguments." << std::endl; | 91 | std::cout << "Requires exactly six arguments." << std::endl; |
91 | std::cout << "1. The path to a VerbNet data directory." << std::endl; | 92 | std::cout << "1. The path to a VerbNet data directory." << std::endl; |
92 | std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; | 93 | std::cout << "2. The path to an AGID infl.txt file." << std::endl; |
93 | std::cout << "3. The path to an AGID infl.txt file." << std::endl; | 94 | std::cout << "3. The path to a WordNet prolog data directory." << std::endl; |
94 | std::cout << "4. The path to a WordNet prolog data directory." << std::endl; | 95 | std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl; |
95 | std::cout << "5. The path to a CMUDICT pronunciation file." << std::endl; | 96 | std::cout << "5. The path to an ImageNet urls.txt file." << std::endl; |
96 | std::cout << "6. Datafile output path." << std::endl; | 97 | std::cout << "6. Datafile output path." << std::endl; |
97 | 98 | ||
98 | exit(1); | 99 | exit(1); |
@@ -431,10 +432,10 @@ int main(int argc, char** argv) | |||
431 | // Get verbs from AGID | 432 | // Get verbs from AGID |
432 | std::cout << "Reading inflections..." << std::endl; | 433 | std::cout << "Reading inflections..." << std::endl; |
433 | 434 | ||
434 | std::ifstream agidfile(argv[3]); | 435 | std::ifstream agidfile(argv[2]); |
435 | if (!agidfile.is_open()) | 436 | if (!agidfile.is_open()) |
436 | { | 437 | { |
437 | std::cout << "Could not open AGID file: " << argv[3] << std::endl; | 438 | std::cout << "Could not open AGID file: " << argv[2] << std::endl; |
438 | print_usage(); | 439 | print_usage(); |
439 | } | 440 | } |
440 | 441 | ||
@@ -562,10 +563,10 @@ int main(int argc, char** argv) | |||
562 | // Pronounciations | 563 | // Pronounciations |
563 | std::cout << "Reading pronunciations..." << std::endl; | 564 | std::cout << "Reading pronunciations..." << std::endl; |
564 | 565 | ||
565 | std::ifstream pronfile(argv[5]); | 566 | std::ifstream pronfile(argv[4]); |
566 | if (!pronfile.is_open()) | 567 | if (!pronfile.is_open()) |
567 | { | 568 | { |
568 | std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl; | 569 | std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl; |
569 | print_usage(); | 570 | print_usage(); |
570 | } | 571 | } |
571 | 572 | ||
@@ -593,6 +594,36 @@ int main(int argc, char** argv) | |||
593 | } | 594 | } |
594 | } | 595 | } |
595 | 596 | ||
597 | // Images | ||
598 | std::cout << "Reading images..." << std::endl; | ||
599 | |||
600 | std::ifstream imagefile(argv[5]); | ||
601 | if (!imagefile.is_open()) | ||
602 | { | ||
603 | std::cout << "Could not open ImageNet file: " << argv[5] << std::endl; | ||
604 | print_usage(); | ||
605 | } | ||
606 | |||
607 | for (;;) | ||
608 | { | ||
609 | std::string line; | ||
610 | if (!getline(imagefile, line)) | ||
611 | { | ||
612 | break; | ||
613 | } | ||
614 | |||
615 | if (line.back() == '\r') | ||
616 | { | ||
617 | line.pop_back(); | ||
618 | } | ||
619 | |||
620 | std::string wnid_s = line.substr(1, 8); | ||
621 | int wnid = stoi(wnid_s) + 100000000; | ||
622 | images[wnid]++; | ||
623 | } | ||
624 | |||
625 | imagefile.close(); | ||
626 | |||
596 | // Start writing output | 627 | // Start writing output |
597 | std::cout << "Writing schema..." << std::endl; | 628 | std::cout << "Writing schema..." << std::endl; |
598 | 629 | ||
@@ -972,7 +1003,7 @@ int main(int argc, char** argv) | |||
972 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) | 1003 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) |
973 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) | 1004 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) |
974 | // - syntax: positioning flags for some adjectives | 1005 | // - syntax: positioning flags for some adjectives |
975 | std::string wnpref {argv[4]}; | 1006 | std::string wnpref {argv[3]}; |
976 | if (wnpref.back() != '/') | 1007 | if (wnpref.back() != '/') |
977 | { | 1008 | { |
978 | wnpref += '/'; | 1009 | wnpref += '/'; |
@@ -1027,9 +1058,9 @@ int main(int argc, char** argv) | |||
1027 | { | 1058 | { |
1028 | if (nouns.count(word) == 1) | 1059 | if (nouns.count(word) == 1) |
1029 | { | 1060 | { |
1030 | query = "INSERT INTO nouns (singular, proper, complexity, plural) VALUES (?, ?, ?, ?)"; | 1061 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)"; |
1031 | } else { | 1062 | } else { |
1032 | query = "INSERT INTO nouns (singular, proper, complexity) VALUES (?, ?, ?)"; | 1063 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)"; |
1033 | } | 1064 | } |
1034 | 1065 | ||
1035 | break; | 1066 | break; |
@@ -1083,10 +1114,12 @@ int main(int argc, char** argv) | |||
1083 | }) ? 1 : 0)); | 1114 | }) ? 1 : 0)); |
1084 | 1115 | ||
1085 | sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); | 1116 | sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); |
1117 | sqlite3_bind_int(ppstmt, 4, images[synset_id]); | ||
1118 | sqlite3_bind_int(ppstmt, 5, synset_id); | ||
1086 | 1119 | ||
1087 | if (nouns.count(word) == 1) | 1120 | if (nouns.count(word) == 1) |
1088 | { | 1121 | { |
1089 | sqlite3_bind_text(ppstmt, 4, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); | 1122 | sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); |
1090 | } | 1123 | } |
1091 | 1124 | ||
1092 | break; | 1125 | break; |
diff --git a/generator/schema.sql b/generator/schema.sql index f2445f0..9a39944 100644 --- a/generator/schema.sql +++ b/generator/schema.sql | |||
@@ -55,7 +55,9 @@ CREATE TABLE `nouns` ( | |||
55 | `singular` VARCHAR(32) NOT NULL, | 55 | `singular` VARCHAR(32) NOT NULL, |
56 | `plural` VARCHAR(32), | 56 | `plural` VARCHAR(32), |
57 | `proper` INTEGER(1) NOT NULL, | 57 | `proper` INTEGER(1) NOT NULL, |
58 | `complexity` INTEGER NOT NULL | 58 | `complexity` INTEGER NOT NULL, |
59 | `images` INTEGER NOT NULL, | ||
60 | `wnid` INTEGER NOT NULL | ||
59 | ); | 61 | ); |
60 | 62 | ||
61 | DROP TABLE IF EXISTS `hypernymy`; | 63 | DROP TABLE IF EXISTS `hypernymy`; |