summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-04-15 17:24:44 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-04-15 17:24:44 -0400
commit 040ee58fecdc9c478004bc2e554e1ae126ec4602 (patch)
tree 672a75690952ba8055ab9765ba0a475e056e35d4 /generator/generator.cpp
parent 3a225f5eb709262b9d44d49519136ea9a2a71000 (diff)
download verbly-040ee58fecdc9c478004bc2e554e1ae126ec4602.tar.gz
verbly-040ee58fecdc9c478004bc2e554e1ae126ec4602.tar.bz2
verbly-040ee58fecdc9c478004bc2e554e1ae126ec4602.zip
Added support for ImageNet and fixed bug with query interface
Datafile change: nouns now know how many images are associated with them on ImageNet, and also have their WordNet synset ID saved so that you can query for images of that noun via the ImageNet API. So far, verbly only exposes the ImageNet API URL, and doesn't actually interact with it itself. This may be changed in the future.

The query interface had a huge issue in which multiple instances of the same condition would overwrite each other. This has been fixed.
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- generator/generator.cpp 57
1 files changed, 45 insertions, 12 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 6fbbfb8..e67bda7 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -81,6 +81,7 @@ std::map<std::string, verb_t> verbs;
81std::map<std::string, adjective_t> adjectives; 81std::map<std::string, adjective_t> adjectives;
82std::map<std::string, noun_t> nouns; 82std::map<std::string, noun_t> nouns;
83std::map<int, std::map<int, int>> wn; 83std::map<int, std::map<int, int>> wn;
84std::map<int, int> images;
84std::map<std::string, std::set<std::string>> pronunciations; 85std::map<std::string, std::set<std::string>> pronunciations;
85 86
86void print_usage() 87void print_usage()
@@ -89,10 +90,10 @@ void print_usage()
89 std::cout << "-------------------------" << std::endl; 90 std::cout << "-------------------------" << std::endl;
90 std::cout << "Requires exactly six arguments." << std::endl; 91 std::cout << "Requires exactly six arguments." << std::endl;
91 std::cout << "1. The path to a VerbNet data directory." << std::endl; 92 std::cout << "1. The path to a VerbNet data directory." << std::endl;
92 std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; 93 std::cout << "2. The path to an AGID infl.txt file." << std::endl;
93 std::cout << "3. The path to an AGID infl.txt file." << std::endl; 94 std::cout << "3. The path to a WordNet prolog data directory." << std::endl;
94 std::cout << "4. The path to a WordNet prolog data directory." << std::endl; 95 std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl;
95 std::cout << "5. The path to a CMUDICT pronunciation file." << std::endl; 96 std::cout << "5. The path to an ImageNet urls.txt file." << std::endl;
96 std::cout << "6. Datafile output path." << std::endl; 97 std::cout << "6. Datafile output path." << std::endl;
97 98
98 exit(1); 99 exit(1);
@@ -431,10 +432,10 @@ int main(int argc, char** argv)
431 // Get verbs from AGID 432 // Get verbs from AGID
432 std::cout << "Reading inflections..." << std::endl; 433 std::cout << "Reading inflections..." << std::endl;
433 434
434 std::ifstream agidfile(argv[3]); 435 std::ifstream agidfile(argv[2]);
435 if (!agidfile.is_open()) 436 if (!agidfile.is_open())
436 { 437 {
437 std::cout << "Could not open AGID file: " << argv[3] << std::endl; 438 std::cout << "Could not open AGID file: " << argv[2] << std::endl;
438 print_usage(); 439 print_usage();
439 } 440 }
440 441
@@ -562,10 +563,10 @@ int main(int argc, char** argv)
562 // Pronounciations 563 // Pronounciations
563 std::cout << "Reading pronunciations..." << std::endl; 564 std::cout << "Reading pronunciations..." << std::endl;
564 565
565 std::ifstream pronfile(argv[5]); 566 std::ifstream pronfile(argv[4]);
566 if (!pronfile.is_open()) 567 if (!pronfile.is_open())
567 { 568 {
568 std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl; 569 std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl;
569 print_usage(); 570 print_usage();
570 } 571 }
571 572
@@ -593,6 +594,36 @@ int main(int argc, char** argv)
593 } 594 }
594 } 595 }
595 596
597 // Images
598 std::cout << "Reading images..." << std::endl;
599
600 std::ifstream imagefile(argv[5]);
601 if (!imagefile.is_open())
602 {
603 std::cout << "Could not open ImageNet file: " << argv[5] << std::endl;
604 print_usage();
605 }
606
607 for (;;)
608 {
609 std::string line;
610 if (!getline(imagefile, line))
611 {
612 break;
613 }
614
615 if (line.back() == '\r')
616 {
617 line.pop_back();
618 }
619
620 std::string wnid_s = line.substr(1, 8);
621 int wnid = stoi(wnid_s) + 100000000;
622 images[wnid]++;
623 }
624
625 imagefile.close();
626
596 // Start writing output 627 // Start writing output
597 std::cout << "Writing schema..." << std::endl; 628 std::cout << "Writing schema..." << std::endl;
598 629
@@ -972,7 +1003,7 @@ int main(int argc, char** argv)
972 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) 1003 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
973 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) 1004 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
974 // - syntax: positioning flags for some adjectives 1005 // - syntax: positioning flags for some adjectives
975 std::string wnpref {argv[4]}; 1006 std::string wnpref {argv[3]};
976 if (wnpref.back() != '/') 1007 if (wnpref.back() != '/')
977 { 1008 {
978 wnpref += '/'; 1009 wnpref += '/';
@@ -1027,9 +1058,9 @@ int main(int argc, char** argv)
1027 { 1058 {
1028 if (nouns.count(word) == 1) 1059 if (nouns.count(word) == 1)
1029 { 1060 {
1030 query = "INSERT INTO nouns (singular, proper, complexity, plural) VALUES (?, ?, ?, ?)"; 1061 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)";
1031 } else { 1062 } else {
1032 query = "INSERT INTO nouns (singular, proper, complexity) VALUES (?, ?, ?)"; 1063 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)";
1033 } 1064 }
1034 1065
1035 break; 1066 break;
@@ -1083,10 +1114,12 @@ int main(int argc, char** argv)
1083 }) ? 1 : 0)); 1114 }) ? 1 : 0));
1084 1115
1085 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); 1116 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size());
1117 sqlite3_bind_int(ppstmt, 4, images[synset_id]);
1118 sqlite3_bind_int(ppstmt, 5, synset_id);
1086 1119
1087 if (nouns.count(word) == 1) 1120 if (nouns.count(word) == 1)
1088 { 1121 {
1089 sqlite3_bind_text(ppstmt, 4, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); 1122 sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC);
1090 } 1123 }
1091 1124
1092 break; 1125 break;