summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r--generator/generator.cpp57
1 files changed, 45 insertions, 12 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 6fbbfb8..e67bda7 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -81,6 +81,7 @@ std::map<std::string, verb_t> verbs;
81std::map<std::string, adjective_t> adjectives; 81std::map<std::string, adjective_t> adjectives;
82std::map<std::string, noun_t> nouns; 82std::map<std::string, noun_t> nouns;
83std::map<int, std::map<int, int>> wn; 83std::map<int, std::map<int, int>> wn;
84std::map<int, int> images;
84std::map<std::string, std::set<std::string>> pronunciations; 85std::map<std::string, std::set<std::string>> pronunciations;
85 86
86void print_usage() 87void print_usage()
@@ -89,10 +90,10 @@ void print_usage()
89 std::cout << "-------------------------" << std::endl; 90 std::cout << "-------------------------" << std::endl;
90 std::cout << "Requires exactly six arguments." << std::endl; 91 std::cout << "Requires exactly six arguments." << std::endl;
91 std::cout << "1. The path to a VerbNet data directory." << std::endl; 92 std::cout << "1. The path to a VerbNet data directory." << std::endl;
92 std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; 93 std::cout << "2. The path to an AGID infl.txt file." << std::endl;
93 std::cout << "3. The path to an AGID infl.txt file." << std::endl; 94 std::cout << "3. The path to a WordNet prolog data directory." << std::endl;
94 std::cout << "4. The path to a WordNet prolog data directory." << std::endl; 95 std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl;
95 std::cout << "5. The path to a CMUDICT pronunciation file." << std::endl; 96 std::cout << "5. The path to an ImageNet urls.txt file." << std::endl;
96 std::cout << "6. Datafile output path." << std::endl; 97 std::cout << "6. Datafile output path." << std::endl;
97 98
98 exit(1); 99 exit(1);
@@ -431,10 +432,10 @@ int main(int argc, char** argv)
431 // Get verbs from AGID 432 // Get verbs from AGID
432 std::cout << "Reading inflections..." << std::endl; 433 std::cout << "Reading inflections..." << std::endl;
433 434
434 std::ifstream agidfile(argv[3]); 435 std::ifstream agidfile(argv[2]);
435 if (!agidfile.is_open()) 436 if (!agidfile.is_open())
436 { 437 {
437 std::cout << "Could not open AGID file: " << argv[3] << std::endl; 438 std::cout << "Could not open AGID file: " << argv[2] << std::endl;
438 print_usage(); 439 print_usage();
439 } 440 }
440 441
@@ -562,10 +563,10 @@ int main(int argc, char** argv)
562 // Pronounciations 563 // Pronounciations
563 std::cout << "Reading pronunciations..." << std::endl; 564 std::cout << "Reading pronunciations..." << std::endl;
564 565
565 std::ifstream pronfile(argv[5]); 566 std::ifstream pronfile(argv[4]);
566 if (!pronfile.is_open()) 567 if (!pronfile.is_open())
567 { 568 {
568 std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl; 569 std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl;
569 print_usage(); 570 print_usage();
570 } 571 }
571 572
@@ -593,6 +594,36 @@ int main(int argc, char** argv)
593 } 594 }
594 } 595 }
595 596
597 // Images
598 std::cout << "Reading images..." << std::endl;
599
600 std::ifstream imagefile(argv[5]);
601 if (!imagefile.is_open())
602 {
603 std::cout << "Could not open ImageNet file: " << argv[5] << std::endl;
604 print_usage();
605 }
606
607 for (;;)
608 {
609 std::string line;
610 if (!getline(imagefile, line))
611 {
612 break;
613 }
614
615 if (line.back() == '\r')
616 {
617 line.pop_back();
618 }
619
620 std::string wnid_s = line.substr(1, 8);
621 int wnid = stoi(wnid_s) + 100000000;
622 images[wnid]++;
623 }
624
625 imagefile.close();
626
596 // Start writing output 627 // Start writing output
597 std::cout << "Writing schema..." << std::endl; 628 std::cout << "Writing schema..." << std::endl;
598 629
@@ -972,7 +1003,7 @@ int main(int argc, char** argv)
972 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) 1003 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
973 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) 1004 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
974 // - syntax: positioning flags for some adjectives 1005 // - syntax: positioning flags for some adjectives
975 std::string wnpref {argv[4]}; 1006 std::string wnpref {argv[3]};
976 if (wnpref.back() != '/') 1007 if (wnpref.back() != '/')
977 { 1008 {
978 wnpref += '/'; 1009 wnpref += '/';
@@ -1027,9 +1058,9 @@ int main(int argc, char** argv)
1027 { 1058 {
1028 if (nouns.count(word) == 1) 1059 if (nouns.count(word) == 1)
1029 { 1060 {
1030 query = "INSERT INTO nouns (singular, proper, complexity, plural) VALUES (?, ?, ?, ?)"; 1061 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)";
1031 } else { 1062 } else {
1032 query = "INSERT INTO nouns (singular, proper, complexity) VALUES (?, ?, ?)"; 1063 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)";
1033 } 1064 }
1034 1065
1035 break; 1066 break;
@@ -1083,10 +1114,12 @@ int main(int argc, char** argv)
1083 }) ? 1 : 0)); 1114 }) ? 1 : 0));
1084 1115
1085 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); 1116 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size());
1117 sqlite3_bind_int(ppstmt, 4, images[synset_id]);
1118 sqlite3_bind_int(ppstmt, 5, synset_id);
1086 1119
1087 if (nouns.count(word) == 1) 1120 if (nouns.count(word) == 1)
1088 { 1121 {
1089 sqlite3_bind_text(ppstmt, 4, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); 1122 sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC);
1090 } 1123 }
1091 1124
1092 break; 1125 break;