diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-04-15 17:24:44 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-04-15 17:24:44 -0400 |
commit | 040ee58fecdc9c478004bc2e554e1ae126ec4602 (patch) | |
tree | 672a75690952ba8055ab9765ba0a475e056e35d4 /generator/generator.cpp | |
parent | 3a225f5eb709262b9d44d49519136ea9a2a71000 (diff) | |
download | verbly-040ee58fecdc9c478004bc2e554e1ae126ec4602.tar.gz verbly-040ee58fecdc9c478004bc2e554e1ae126ec4602.tar.bz2 verbly-040ee58fecdc9c478004bc2e554e1ae126ec4602.zip |
Added support for ImageNet and fixed bug with query interface
Datafile change: nouns now know how many images are associated with them on ImageNet, and also have their WordNet synset ID saved so that you can query for images of that noun via the ImageNet API. So far, verbly only exposes the ImageNet API URL, and doesn't actually interact with it itself. This may be changed in the future. The query interface had a huge issue in which multiple instances of the same condition would overwrite each other. This has been fixed.
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- | generator/generator.cpp | 57 |
1 files changed, 45 insertions, 12 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 6fbbfb8..e67bda7 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -81,6 +81,7 @@ std::map<std::string, verb_t> verbs; | |||
81 | std::map<std::string, adjective_t> adjectives; | 81 | std::map<std::string, adjective_t> adjectives; |
82 | std::map<std::string, noun_t> nouns; | 82 | std::map<std::string, noun_t> nouns; |
83 | std::map<int, std::map<int, int>> wn; | 83 | std::map<int, std::map<int, int>> wn; |
84 | std::map<int, int> images; | ||
84 | std::map<std::string, std::set<std::string>> pronunciations; | 85 | std::map<std::string, std::set<std::string>> pronunciations; |
85 | 86 | ||
86 | void print_usage() | 87 | void print_usage() |
@@ -89,10 +90,10 @@ void print_usage() | |||
89 | std::cout << "-------------------------" << std::endl; | 90 | std::cout << "-------------------------" << std::endl; |
90 | std::cout << "Requires exactly six arguments." << std::endl; | 91 | std::cout << "Requires exactly six arguments." << std::endl; |
91 | std::cout << "1. The path to a VerbNet data directory." << std::endl; | 92 | std::cout << "1. The path to a VerbNet data directory." << std::endl; |
92 | std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; | 93 | std::cout << "2. The path to an AGID infl.txt file." << std::endl; |
93 | std::cout << "3. The path to an AGID infl.txt file." << std::endl; | 94 | std::cout << "3. The path to a WordNet prolog data directory." << std::endl; |
94 | std::cout << "4. The path to a WordNet prolog data directory." << std::endl; | 95 | std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl; |
95 | std::cout << "5. The path to a CMUDICT pronunciation file." << std::endl; | 96 | std::cout << "5. The path to an ImageNet urls.txt file." << std::endl; |
96 | std::cout << "6. Datafile output path." << std::endl; | 97 | std::cout << "6. Datafile output path." << std::endl; |
97 | 98 | ||
98 | exit(1); | 99 | exit(1); |
@@ -431,10 +432,10 @@ int main(int argc, char** argv) | |||
431 | // Get verbs from AGID | 432 | // Get verbs from AGID |
432 | std::cout << "Reading inflections..." << std::endl; | 433 | std::cout << "Reading inflections..." << std::endl; |
433 | 434 | ||
434 | std::ifstream agidfile(argv[3]); | 435 | std::ifstream agidfile(argv[2]); |
435 | if (!agidfile.is_open()) | 436 | if (!agidfile.is_open()) |
436 | { | 437 | { |
437 | std::cout << "Could not open AGID file: " << argv[3] << std::endl; | 438 | std::cout << "Could not open AGID file: " << argv[2] << std::endl; |
438 | print_usage(); | 439 | print_usage(); |
439 | } | 440 | } |
440 | 441 | ||
@@ -562,10 +563,10 @@ int main(int argc, char** argv) | |||
562 | // Pronunciations | 563 | // Pronunciations |
563 | std::cout << "Reading pronunciations..." << std::endl; | 564 | std::cout << "Reading pronunciations..." << std::endl; |
564 | 565 | ||
565 | std::ifstream pronfile(argv[5]); | 566 | std::ifstream pronfile(argv[4]); |
566 | if (!pronfile.is_open()) | 567 | if (!pronfile.is_open()) |
567 | { | 568 | { |
568 | std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl; | 569 | std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl; |
569 | print_usage(); | 570 | print_usage(); |
570 | } | 571 | } |
571 | 572 | ||
@@ -593,6 +594,36 @@ int main(int argc, char** argv) | |||
593 | } | 594 | } |
594 | } | 595 | } |
595 | 596 | ||
597 | // Images | ||
598 | std::cout << "Reading images..." << std::endl; | ||
599 | |||
600 | std::ifstream imagefile(argv[5]); | ||
601 | if (!imagefile.is_open()) | ||
602 | { | ||
603 | std::cout << "Could not open ImageNet file: " << argv[5] << std::endl; | ||
604 | print_usage(); | ||
605 | } | ||
606 | |||
607 | for (;;) | ||
608 | { | ||
609 | std::string line; | ||
610 | if (!getline(imagefile, line)) | ||
611 | { | ||
612 | break; | ||
613 | } | ||
614 | |||
615 | if (line.back() == '\r') | ||
616 | { | ||
617 | line.pop_back(); | ||
618 | } | ||
619 | |||
620 | std::string wnid_s = line.substr(1, 8); | ||
621 | int wnid = stoi(wnid_s) + 100000000; | ||
622 | images[wnid]++; | ||
623 | } | ||
624 | |||
625 | imagefile.close(); | ||
626 | |||
596 | // Start writing output | 627 | // Start writing output |
597 | std::cout << "Writing schema..." << std::endl; | 628 | std::cout << "Writing schema..." << std::endl; |
598 | 629 | ||
@@ -972,7 +1003,7 @@ int main(int argc, char** argv) | |||
972 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) | 1003 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) |
973 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) | 1004 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) |
974 | // - syntax: positioning flags for some adjectives | 1005 | // - syntax: positioning flags for some adjectives |
975 | std::string wnpref {argv[4]}; | 1006 | std::string wnpref {argv[3]}; |
976 | if (wnpref.back() != '/') | 1007 | if (wnpref.back() != '/') |
977 | { | 1008 | { |
978 | wnpref += '/'; | 1009 | wnpref += '/'; |
@@ -1027,9 +1058,9 @@ int main(int argc, char** argv) | |||
1027 | { | 1058 | { |
1028 | if (nouns.count(word) == 1) | 1059 | if (nouns.count(word) == 1) |
1029 | { | 1060 | { |
1030 | query = "INSERT INTO nouns (singular, proper, complexity, plural) VALUES (?, ?, ?, ?)"; | 1061 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)"; |
1031 | } else { | 1062 | } else { |
1032 | query = "INSERT INTO nouns (singular, proper, complexity) VALUES (?, ?, ?)"; | 1063 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)"; |
1033 | } | 1064 | } |
1034 | 1065 | ||
1035 | break; | 1066 | break; |
@@ -1083,10 +1114,12 @@ int main(int argc, char** argv) | |||
1083 | }) ? 1 : 0)); | 1114 | }) ? 1 : 0)); |
1084 | 1115 | ||
1085 | sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); | 1116 | sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); |
1117 | sqlite3_bind_int(ppstmt, 4, images[synset_id]); | ||
1118 | sqlite3_bind_int(ppstmt, 5, synset_id); | ||
1086 | 1119 | ||
1087 | if (nouns.count(word) == 1) | 1120 | if (nouns.count(word) == 1) |
1088 | { | 1121 | { |
1089 | sqlite3_bind_text(ppstmt, 4, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); | 1122 | sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); |
1090 | } | 1123 | } |
1091 | 1124 | ||
1092 | break; | 1125 | break; |