From e1be2716746e75cf6ed37e86461a7f580a964564 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Thu, 10 Mar 2016 21:34:55 -0500 Subject: Started implementing verbly data generator Currently, the generator: - Uses AGID to create entries for verb words and their inflections - Uses WordNet to create entries for adjective, adverb, and noun senses --- generator.cpp | 451 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 451 insertions(+) create mode 100644 generator.cpp (limited to 'generator.cpp') diff --git a/generator.cpp b/generator.cpp new file mode 100644 index 0000000..c389963 --- /dev/null +++ b/generator.cpp @@ -0,0 +1,451 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct verb { + std::string infinitive; + std::string past_tense; + std::string past_participle; + std::string ing_form; + std::string s_form; +}; + +struct group { + std::string id; + std::set members; +}; + +std::map groups; +std::map verbs; +std::map> wn; + +void print_usage() +{ + std::cout << "Verbly Datafile Generator" << std::endl; + std::cout << "-------------------------" << std::endl; + std::cout << "Requires exactly four arguments." << std::endl; + std::cout << "1. The path to a VerbNet data directory." << std::endl; + std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; + std::cout << "3. The path to an AGID infl.txt file." << std::endl; + std::cout << "4. The path to a WordNet prolog data directory." << std::endl; + std::cout << "5. Datafile output path." << std::endl; + + exit(1); +} +/* +void parse_group(xmlNodePtr top, std::string filename) +{ + xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); + if (key == 0) + { + std::cout << "Bad VerbNet file format: " << filename << std::endl; + print_usage(); + } + std::string vnid = key; + vnid = vnid.substr(vnid.find_first_of("-")+1); + xmlFree(key); + + group g; + g.id = vnid; + + for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) + { + if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) + { + for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) + { + if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) + { + key = xmlGetProp(member, (xmlChar*) "name"); + g.members.insert(key); + xmlFree(key); + } + } + } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) + { + for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) + { + if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) + { + for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) + { + + } + } + } + } + } +}*/ + +int main(int argc, char** argv) +{ + if (argc != 6) + { + print_usage(); + } + + /*DIR* dir; + if ((dir = opendir(argv[1])) == nullptr) + { + std::cout << "Invalid VerbNet data directory." << std::endl; + + print_usage(); + } + + struct dirent* ent; + while ((ent = readdir(dir)) != nullptr) + { + std::string filename(argv[1]); + if (filename.back() != '/') + { + filename += '/'; + } + + filename += ent->d_name; + //std::cout << ent->d_name << std::endl; + + if (filename.rfind(".xml") != filename.size() - 4) + { + continue; + } + + xmlDocPtr doc = xmlParseFile(filename.c_str()); + if (doc == nullptr) + { + std::cout << "Error opening " << filename << std::endl; + print_usage(); + } + + xmlNodePtr top = xmlDocGetRootElement(doc); + if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) + { + std::cout << "Bad VerbNet file format: " << filename << std::endl; + print_usage(); + } + + parse_group(top, filename); + } + + closedir(dir);*/ + + // Get verbs from AGID + std::cout << "Reading verb inflection..." << std::endl; + + std::ifstream agidfile(argv[3]); + if (!agidfile.is_open()) + { + std::cout << "Could not open AGID file: " << argv[3] << std::endl; + print_usage(); + } + + for (;;) + { + std::string line; + if (!getline(agidfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + int divider = line.find_first_of(" "); + std::string word = line.substr(0, divider); + line = line.substr(divider+1); + + if (line[0] != 'V') + { + continue; + } + + if (line[1] == '?') + { + line.erase(0, 4); + } else { + line.erase(0, 3); + } + + std::vector forms; + while (!line.empty()) + { + std::string inflection; + if ((divider = line.find(" | ")) != std::string::npos) + { + inflection = line.substr(0, divider); + line = line.substr(divider + 3); + } else { + inflection = line; + line = ""; + } + + if ((divider = inflection.find_first_of(",?")) != std::string::npos) + { + inflection = inflection.substr(0, divider); + } + + forms.push_back(inflection); + } + + verb v; + v.infinitive = word; + if (forms.size() == 4) + { + v.past_tense = forms[0]; + v.past_participle = forms[1]; + v.ing_form = forms[2]; + v.s_form = forms[3]; + } else if (forms.size() == 3) + { + v.past_tense = forms[0]; + v.past_participle = forms[0]; + v.ing_form = forms[1]; + v.s_form = forms[2]; + } else if (forms.size() == 8) + { + // As of AGID 2014.08.11, this is only "to be" + v.past_tense = forms[0]; + v.past_participle = forms[2]; + v.ing_form = forms[3]; + v.s_form = forms[4]; + } else { + // Words that don't fit the cases above as of AGID 2014.08.11: + // - may and shall do not conjugate the way we want them to + // - methinks only has a past tense and is an outlier + // - wit has five forms, and is archaic/obscure enough that we can ignore it for now + std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; + } + + verbs[word] = v; + } + + // Start writing output + std::cout << "Writing output..." << std::endl; + + sqlite3* ppdb; + if (sqlite3_open_v2(argv[5], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) + { + std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; + print_usage(); + } + + std::ifstream schemafile("schema.sql"); + if (!schemafile.is_open()) + { + std::cout << "Could not find schema file" << std::endl; + print_usage(); + } + + std::stringstream schemabuilder; + for (;;) + { + std::string line; + if (!getline(schemafile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + schemabuilder << line << std::endl; + } + + std::string schema = schemabuilder.str(); + while (!schema.empty()) + { + std::string query; + int divider = schema.find(";"); + if (divider != std::string::npos) + { + query = schema.substr(0, divider+1); + schema = schema.substr(divider+2); + } else { + break; + } + + sqlite3_stmt* schmstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) + { + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); + } + + if (sqlite3_step(schmstmt) != SQLITE_DONE) + { + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); + } + + sqlite3_finalize(schmstmt); + } + + std::cout << "Writing verbs..." << std::endl; + for (auto& mapping : verbs) + { + sqlite3_stmt* ppstmt; + std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); + } + + sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); + } + + sqlite3_finalize(ppstmt); + } + + // Get nouns/adjectives/adverbs from WordNet + // Useful relations: + // - s: master list + // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) + // - at: variation (e.g. a measurement can be standard or nonstandard) + // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) + // - ins: instantiation (do we need this? let's see) + // - mm: member meronymy/holonymy (e.g. family/mother, family/child) + // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) + // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) + // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) + // mannernymy (e.g. something done quickly is done in a manner that is quick) + // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) + // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) + // - syntax: positioning flags for some adjectives + std::string wnpref {argv[4]}; + if (wnpref.back() != '/') + { + wnpref += '/'; + } + + std::cout << "Reading words from WordNet..." << std::endl; + std::ifstream wnsfile(wnpref + "wn_s.pl"); + if (!wnsfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + for (;;) + { + std::string line; + if (!getline(wnsfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',"); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id = stoi(relation_data[1]); + int wnum = stoi(relation_data[2]); + std::string word = relation_data[3]; + + std::string query; + switch (synset_id / 100000000) + { + case 1: // Noun + { + query = "INSERT INTO nouns (form) VALUES (?)"; + + break; + } + + case 2: // Verb + { + // Ignore + + break; + } + + case 3: // Adjective + { + query = "INSERT INTO adjectives (form) VALUES (?)"; + + break; + } + + case 4: // Adverb + { + query = "INSERT INTO adverbs (form) VALUES (?)"; + + break; + } + } + + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); + } + + sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); + } + + sqlite3_finalize(ppstmt); + + query = "SELECT last_insert_rowid()"; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); + } + + if (sqlite3_step(ppstmt) != SQLITE_ROW) + { + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); + } + + wn[synset_id][wnum] = sqlite3_column_int(ppstmt, 0); + + sqlite3_finalize(ppstmt); + } + + sqlite3_close_v2(ppdb); + + std::cout << "Done." << std::endl; +} \ No newline at end of file -- cgit 1.4.1