From 6746da6edd7d9d50efe374eabbb79a3cac882d81 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Mon, 16 Jan 2017 18:02:50 -0500 Subject: Started structural rewrite The new object structure was designed to build on the existing WordNet structure, while also adding in all of the data that we get from other sources. More information about this can be found on the project wiki. The generator has already been completely rewritten to generate a datafile that uses the new structure. In addition, a number of indexes are created, which does double the size of the datafile, but also allows for much faster lookups. Finally, the new generator is written modularly and is a lot more readable than the old one. The verbly interface to the new object structure has mostly been completed, but has not been tested fully. There is a completely new search API which utilizes a lot of operator overloading; documentation on how to use it should go up at some point. Token processing and verb frames are currently unimplemented. Source for these has been left in the repository for now. 
--- generator/generator.cpp | 3227 +++++++++++++++++------------------------------ 1 file changed, 1192 insertions(+), 2035 deletions(-) (limited to 'generator/generator.cpp') diff --git a/generator/generator.cpp b/generator/generator.cpp index 6a16467..d88cb31 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -1,2320 +1,1477 @@ -#include +#include "generator.h" +#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "progress.h" -#include "../lib/util.h" - -using json = nlohmann::json; - -struct verb_t { - std::string infinitive; - std::string past_tense; - std::string past_participle; - std::string ing_form; - std::string s_form; - int id; -}; - -struct adjective_t { - std::string base; - std::string comparative; - std::string superlative; -}; - -struct noun_t { - std::string singular; - std::string plural; -}; - -struct selrestr_t { - enum class type_t { - singleton, - andlogic, - orlogic, - empty - }; - type_t type; - std::string restriction; - bool pos; - std::list subordinates; -}; - -struct framepart_t { - enum class type_t { - np, - v, - pp, - adj, - adv, - lex - }; - type_t type; - std::string role; - selrestr_t selrestrs; - std::set preprestrs; - std::set synrestrs; - std::list choices; - std::string lexval; -}; - -struct group_t { - std::string id; - std::string parent; - std::set members; - std::map roles; - std::list> frames; -}; - -struct pronunciation_t { - std::string phonemes; - std::string prerhyme; - std::string rhyme; - int syllables = 0; - std::string stress; - - bool operator<(const pronunciation_t& other) const - { - return phonemes < other.phonemes; - } -}; - -std::map groups; -std::map verbs; -std::map adjectives; -std::map nouns; -std::map> wn; -std::map images; -std::map> pronunciations; - -void print_usage() -{ - std::cout << "Verbly Datafile Generator" << std::endl; - std::cout << "-------------------------" << 
std::endl; - std::cout << "Requires exactly six arguments." << std::endl; - std::cout << "1. The path to a VerbNet data directory." << std::endl; - std::cout << "2. The path to an AGID infl.txt file." << std::endl; - std::cout << "3. The path to a WordNet prolog data directory." << std::endl; - std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl; - std::cout << "5. The path to an ImageNet urls.txt file." << std::endl; - std::cout << "6. Datafile output path." << std::endl; - - exit(1); -} - -void db_error(sqlite3* ppdb, std::string query) -{ - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - std::cout << query << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); -} - -json export_selrestrs(selrestr_t r) -{ - if (r.type == selrestr_t::type_t::empty) - { - return {}; - } else if (r.type == selrestr_t::type_t::singleton) - { - json result; - result["type"] = r.restriction; - result["pos"] = r.pos; - return result; - } else { - json result; - if (r.type == selrestr_t::type_t::andlogic) - { - result["logic"] = "and"; - } else { - result["logic"] = "or"; - } - - std::list outlist; - std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs); - result["children"] = outlist; - - return result; - } -} - -selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) -{ - selrestr_t r; - xmlChar* key; - - if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) - { - if (xmlChildElementCount(top) == 0) - { - r.type = selrestr_t::type_t::empty; - } else if (xmlChildElementCount(top) == 1) - { - r = parse_selrestrs(xmlFirstElementChild(top), filename); - } else { - r.type = selrestr_t::type_t::andlogic; - - if (xmlHasProp(top, (const xmlChar*) "logic")) - { - key = xmlGetProp(top, (const xmlChar*) "logic"); - if (!xmlStrcmp(key, (const xmlChar*) "or")) - { - r.type = selrestr_t::type_t::orlogic; - } - xmlFree(key); - } - - for (xmlNodePtr selrestr = 
top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) - { - if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) - { - r.subordinates.push_back(parse_selrestrs(selrestr, filename)); - } - } - } - } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) - { - r.type = selrestr_t::type_t::singleton; - - key = xmlGetProp(top, (xmlChar*) "Value"); - r.pos = (std::string((const char*)key) == "+"); - xmlFree(key); - - key = xmlGetProp(top, (xmlChar*) "type"); - r.restriction = (const char*) key; - xmlFree(key); - } else { - // Invalid - std::cout << "Bad VerbNet file format: " << filename << std::endl; - print_usage(); - } - - return r; -} - -group_t& parse_group(xmlNodePtr top, std::string filename) -{ - xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); - if (key == 0) - { - std::cout << "Bad VerbNet file format: " << filename << std::endl; - print_usage(); - } - std::string vnid = (const char*)key; - vnid = vnid.substr(vnid.find_first_of("-")+1); - xmlFree(key); - - group_t g; - g.id = vnid; - - for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) - { - if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES")) - { - for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) - { - if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) - { - auto& sg = parse_group(subclass, filename); - sg.parent = vnid; - - for (auto member : sg.members) - { - g.members.insert(member); - } - - // The schema requires that subclasses appear after role definitions, so we can do this now - for (auto role : g.roles) - { - if (sg.roles.count(role.first) == 0) - { - sg.roles[role.first] = role.second; - } - } - } - } - } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) - { - for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) - { - if (!xmlStrcmp(member->name, (const xmlChar*) 
"MEMBER")) - { - key = xmlGetProp(member, (xmlChar*) "name"); - g.members.insert((const char*)key); - xmlFree(key); - } - } - } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) - { - for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) - { - if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) - { - selrestr_t r; - r.type = selrestr_t::type_t::empty; - - key = xmlGetProp(role, (const xmlChar*) "type"); - std::string type = (const char*)key; - xmlFree(key); - - for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) - { - if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS")) - { - r = parse_selrestrs(rolenode, filename); - } - } - - g.roles[type] = r; - } - } - } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) - { - for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) - { - if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) - { - std::list f; - - for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) - { - if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX")) - { - for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) - { - framepart_t fp; - - if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP")) - { - fp.type = framepart_t::type_t::np; - - key = xmlGetProp(syntaxnode, (xmlChar*) "value"); - fp.role = (const char*)key; - xmlFree(key); - - fp.selrestrs.type = selrestr_t::type_t::empty; - - for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) - { - if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS")) - { - for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) - { - if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR")) - { - key = xmlGetProp(synrestr, (xmlChar*) "type"); - fp.synrestrs.insert(std::string((const 
char*)key)); - xmlFree(key); - } - } - } - - if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) - { - fp.selrestrs = parse_selrestrs(npnode, filename); - } - } - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB")) - { - fp.type = framepart_t::type_t::v; - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP")) - { - fp.type = framepart_t::type_t::pp; - - if (xmlHasProp(syntaxnode, (xmlChar*) "value")) - { - key = xmlGetProp(syntaxnode, (xmlChar*) "value"); - std::string choices = (const char*)key; - xmlFree(key); - - fp.choices = verbly::split>(choices, " "); - } - - for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) - { - if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) - { - for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) - { - if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR")) - { - key = xmlGetProp(synrestr, (xmlChar*) "type"); - fp.preprestrs.insert(std::string((const char*)key)); - xmlFree(key); - } - } - } - } - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ")) - { - fp.type = framepart_t::type_t::adj; - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV")) - { - fp.type = framepart_t::type_t::adv; - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX")) - { - fp.type = framepart_t::type_t::lex; - - key = xmlGetProp(syntaxnode, (xmlChar*) "value"); - fp.lexval = (const char*)key; - xmlFree(key); - } else { - continue; - } - - f.push_back(fp); - } - - g.frames.push_back(f); - } - } - } - } - } - } - - groups[vnid] = g; - - return groups[vnid]; -} - -int main(int argc, char** argv) -{ - if (argc != 7) - { - print_usage(); - } - - // VerbNet data - std::cout << "Reading verb frames..." << std::endl; - - DIR* dir; - if ((dir = opendir(argv[1])) == nullptr) - { - std::cout << "Invalid VerbNet data directory." 
<< std::endl; - - print_usage(); - } - - struct dirent* ent; - while ((ent = readdir(dir)) != nullptr) - { - std::string filename(argv[1]); - if (filename.back() != '/') - { - filename += '/'; - } - - filename += ent->d_name; - //std::cout << ent->d_name << std::endl; - - if (filename.rfind(".xml") != filename.size() - 4) - { - continue; - } - - xmlDocPtr doc = xmlParseFile(filename.c_str()); - if (doc == nullptr) - { - std::cout << "Error opening " << filename << std::endl; - print_usage(); - } - - xmlNodePtr top = xmlDocGetRootElement(doc); - if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) - { - std::cout << "Bad VerbNet file format: " << filename << std::endl; - print_usage(); - } - - parse_group(top, filename); - } - - closedir(dir); - - // Get verbs from AGID - std::cout << "Reading inflections..." << std::endl; - - std::ifstream agidfile(argv[2]); - if (!agidfile.is_open()) - { - std::cout << "Could not open AGID file: " << argv[2] << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(agidfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - int divider = line.find_first_of(" "); - std::string word = line.substr(0, divider); - line = line.substr(divider+1); - char type = line[0]; - - if (line[1] == '?') - { - line.erase(0, 4); - } else { - line.erase(0, 3); - } - - std::vector forms; - while (!line.empty()) - { - std::string inflection; - if ((divider = line.find(" | ")) != std::string::npos) - { - inflection = line.substr(0, divider); - line = line.substr(divider + 3); - } else { - inflection = line; - line = ""; - } - - if ((divider = inflection.find_first_of(",?")) != std::string::npos) - { - inflection = inflection.substr(0, divider); - } - - forms.push_back(inflection); - } - - switch (type) - { - case 'V': - { - verb_t v; - v.infinitive = word; - if (forms.size() == 4) - { - v.past_tense = forms[0]; - v.past_participle = forms[1]; - v.ing_form = forms[2]; - 
v.s_form = forms[3]; - } else if (forms.size() == 3) - { - v.past_tense = forms[0]; - v.past_participle = forms[0]; - v.ing_form = forms[1]; - v.s_form = forms[2]; - } else if (forms.size() == 8) - { - // As of AGID 2014.08.11, this is only "to be" - v.past_tense = forms[0]; - v.past_participle = forms[2]; - v.ing_form = forms[3]; - v.s_form = forms[4]; - } else { - // Words that don't fit the cases above as of AGID 2014.08.11: - // - may and shall do not conjugate the way we want them to - // - methinks only has a past tense and is an outlier - // - wit has five forms, and is archaic/obscure enough that we can ignore it for now - std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - verbs[word] = v; - - break; - } - - case 'A': - { - adjective_t adj; - adj.base = word; - if (forms.size() == 2) - { - adj.comparative = forms[0]; - adj.superlative = forms[1]; - } else { - // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" - std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - adjectives[word] = adj; - - break; - } - - case 'N': - { - noun_t n; - n.singular = word; - if (forms.size() == 1) - { - n.plural = forms[0]; - } else { - // As of AGID 2014.08.11, this is non-existent. - std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - nouns[word] = n; - - break; - } - } - } - - // Pronounciations - std::cout << "Reading pronunciations..." << std::endl; - - std::ifstream pronfile(argv[4]); - if (!pronfile.is_open()) - { - std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(pronfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? 
([A-Z 0-9]+)"); - std::smatch phoneme_data; - if (std::regex_search(line, phoneme_data, phoneme)) - { - std::string canonical(phoneme_data[1]); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - - std::string phonemes = phoneme_data[2]; - auto phoneme_set = verbly::split>(phonemes, " "); - auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) { - return phoneme.find("1") != std::string::npos; - }); - - pronunciation_t p; - p.phonemes = phonemes; - - // Rhyme detection - if (phemstrt != std::end(phoneme_set)) - { - std::stringstream rhymer; - for (auto it = phemstrt; it != std::end(phoneme_set); it++) - { - std::string naked; - std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) { - return isdigit(ch); - }); - - if (it != phemstrt) - { - rhymer << " "; - } - - rhymer << naked; - } - - p.rhyme = rhymer.str(); - - if (phemstrt != std::begin(phoneme_set)) - { - phemstrt--; - p.prerhyme = *phemstrt; - } else { - p.prerhyme = ""; - } - } else { - p.prerhyme = ""; - p.rhyme = ""; - } - - // Syllable/stress - for (auto phm : phoneme_set) - { - if (isdigit(phm.back())) - { - // It's a vowel! - p.syllables++; - - if (phm.back() == '1') - { - p.stress.push_back('1'); - } else { - p.stress.push_back('0'); - } - } - } - - pronunciations[canonical].insert(p); - } - } - - // Images - std::cout << "Reading images..." << std::endl; - - std::ifstream imagefile(argv[5]); - if (!imagefile.is_open()) - { - std::cout << "Could not open ImageNet file: " << argv[5] << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(imagefile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - std::string wnid_s = line.substr(1, 8); - int wnid = stoi(wnid_s) + 100000000; - images[wnid]++; - } - - imagefile.close(); - - // Start writing output - std::cout << "Writing schema..." 
<< std::endl; - - sqlite3* ppdb; - if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) - { - std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; - print_usage(); - } - - std::ifstream schemafile("schema.sql"); - if (!schemafile.is_open()) - { - std::cout << "Could not find schema file" << std::endl; - print_usage(); - } - - std::stringstream schemabuilder; - for (;;) - { - std::string line; - if (!getline(schemafile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - schemabuilder << line << std::endl; - } - - std::string schema = schemabuilder.str(); - while (!schema.empty()) - { - std::string query; - int divider = schema.find(";"); - if (divider != std::string::npos) - { - query = schema.substr(0, divider+1); - schema = schema.substr(divider+2); - } else { - break; - } - - sqlite3_stmt* schmstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(schmstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(schmstmt); - } - - std::cout << "Writing prepositions..." 
<< std::endl; - std::ifstream prepfile("prepositions.txt"); - if (!prepfile.is_open()) - { - std::cout << "Could not find prepositions file" << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(prepfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - std::regex relation("^([^:]+): (.+)"); - std::smatch relation_data; - std::regex_search(line, relation_data, relation); - std::string prep = relation_data[1]; - std::list groups = verbly::split>(relation_data[2], ", "); - - std::string query("INSERT INTO prepositions (form) VALUES (?)"); - sqlite3_stmt* ppstmt; - - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(ppstmt) != SQLITE_ROW) - { - db_error(ppdb, query); - } - - int rowid = sqlite3_column_int(ppstmt, 0); - sqlite3_finalize(ppstmt); - - for (auto group : groups) - { - query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, rowid); - sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - - - { - progress ppgs("Writing verbs...", verbs.size()); - for (auto& mapping : verbs) - { - sqlite3_stmt* ppstmt; - std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); - if 
(sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - - std::string canonical(mapping.second.infinitive); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - if (pronunciations.count(canonical) == 1) - { - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(ppstmt) != SQLITE_ROW) - { - db_error(ppdb, query); - } - - int rowid = sqlite3_column_int(ppstmt, 0); - - sqlite3_finalize(ppstmt); - - mapping.second.id = rowid; - - for (auto pronunciation : pronunciations[canonical]) - { - if (!pronunciation.rhyme.empty()) - { - query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; - } - - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, rowid); - sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), 
SQLITE_TRANSIENT); - sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); - sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); - - if (!pronunciation.rhyme.empty()) - { - sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); - } - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - - ppgs.update(); - } - } - - { - progress ppgs("Writing verb frames...", groups.size()); - for (auto& mapping : groups) +#include +#include +#include +#include "enums.h" +#include "progress.h" +#include "selrestr.h" +#include "role.h" +#include "part.h" +#include "field.h" +#include "../lib/util.h" + +namespace verbly { + namespace generator { + + generator::generator( + std::string verbNetPath, + std::string agidPath, + std::string wordNetPath, + std::string cmudictPath, + std::string imageNetPath, + std::string outputPath) : + verbNetPath_(verbNetPath), + agidPath_(agidPath), + wordNetPath_(wordNetPath), + cmudictPath_(cmudictPath), + imageNetPath_(imageNetPath), + db_(outputPath) { - std::list roledatal; - std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair r) { - json role; - role["type"] = r.first; - role["selrestrs"] = export_selrestrs(r.second); - - return role; - }); - - json roledata(roledatal); - std::string rdm = roledata.dump(); - - sqlite3_stmt* ppstmt; - std::string query("INSERT INTO groups (data) VALUES (?)"); - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + // Ensure VerbNet directory exists + DIR* dir; + if ((dir = 
opendir(verbNetPath_.c_str())) == nullptr) { - db_error(ppdb, query); + throw std::invalid_argument("Invalid VerbNet data directory"); } - sqlite3_finalize(ppstmt); + closedir(dir); - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + // Ensure AGID infl.txt exists + if (!std::ifstream(agidPath_)) { - db_error(ppdb, query); + throw std::invalid_argument("AGID infl.txt file not found"); } - if (sqlite3_step(ppstmt) != SQLITE_ROW) + // Add directory separator to WordNet path + if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) { - db_error(ppdb, query); + wordNetPath_ += '/'; } - int gid = sqlite3_column_int(ppstmt, 0); - sqlite3_finalize(ppstmt); - - for (auto frame : mapping.second.frames) + // Ensure WordNet tables exist + for (std::string table : { + "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" + }) { - std::list fdatap; - std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) { - json part; - - switch (fp.type) - { - case framepart_t::type_t::np: - { - part["type"] = "np"; - part["role"] = fp.role; - part["selrestrs"] = export_selrestrs(fp.selrestrs); - part["synrestrs"] = fp.synrestrs; - - break; - } - - case framepart_t::type_t::pp: - { - part["type"] = "pp"; - part["values"] = fp.choices; - part["preprestrs"] = fp.preprestrs; - - break; - } - - case framepart_t::type_t::v: - { - part["type"] = "v"; - - break; - } - - case framepart_t::type_t::adj: - { - part["type"] = "adj"; - - break; - } - - case framepart_t::type_t::adv: - { - part["type"] = "adv"; - - break; - } - - case framepart_t::type_t::lex: - { - part["type"] = "lex"; - part["value"] = fp.lexval; - - break; - } - } - - return part; - }); - - json fdata(fdatap); - std::string marshall = fdata.dump(); - - query = "INSERT INTO frames (group_id, data) VALUES (?, ?)"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), 
query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, gid); - sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) { - db_error(ppdb, query); + throw std::invalid_argument("WordNet " + table + " table not found"); } - - sqlite3_finalize(ppstmt); } - for (auto member : mapping.second.members) + // Ensure CMUDICT file exists + if (!std::ifstream(cmudictPath_)) { - if (verbs.count(member) == 1) - { - auto& v = verbs[member]; - - query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, v.id); - sqlite3_bind_int(ppstmt, 2, gid); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } + throw std::invalid_argument("CMUDICT file not found"); } - ppgs.update(); - } - } - - // Get nouns/adjectives/adverbs from WordNet - // Useful relations: - // - s: master list - // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) - // - at: variation (e.g. a measurement can be standard or nonstandard) - // - der: derivation (e.g. happy/happily, happily/happy) - // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) - // - ins: instantiation (do we need this? let's see) - // - mm: member meronymy/holonymy (e.g. family/mother, family/child) - // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) - // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) - // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) - // mannernymy (e.g. something done quickly is done in a manner that is quick) - // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) - // - sim: synonymy (e.g. 
cheerful/happy, happy/cheerful) - // - syntax: positioning flags for some adjectives - std::string wnpref {argv[3]}; - if (wnpref.back() != '/') - { - wnpref += '/'; - } - - // s table - { - std::ifstream wnsfile(wnpref + "wn_s.pl"); - if (!wnsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnsfile, line)) + // Ensure ImageNet urls.txt exists + if (!std::ifstream(imageNetPath_)) { - break; + throw std::invalid_argument("ImageNet urls.txt file not found"); } + } - if (line.back() == '\r') - { - line.pop_back(); - } + void generator::run() + { + // Create notions, words, lemmas, and forms from WordNet synsets + readWordNetSynsets(); + + // Reads adjective positioning WordNet data + readAdjectivePositioning(); + + // Counts the number of URLs ImageNet has per notion + readImageNetUrls(); + + // Creates a word by WordNet sense key lookup table + readWordNetSenseKeys(); + + // Creates groups and frames from VerbNet data + readVerbNet(); + + // Creates forms and inflections from AGID. To reduce the amount of forms + // created, we do this after most lemmas that need inflecting have been + // created through other means, and then only generate forms for + // inflections of already-existing lemmas. The exception to this regards + // verb lemmas. If a verb lemma in AGID either does not exist yet, or does + // exist but is not related to any words that are related to verb notions, + // then a notion and a word is generated and the form generation proceeds + // as usual. + readAgidInflections(); + + // Reads in prepositions and the is_a relationship + readPrepositions(); + + // Creates pronunciations from CMUDICT. To reduce the amount of + // pronunciations created, we do this after all forms have been created, + // and then only generate pronunciations for already-existing forms. 
+ readCmudictPronunciations(); + + // Writes the database schema + writeSchema(); + + // Dumps data to the database + dumpObjects(); + + // Populates the antonymy relationship from WordNet + readWordNetAntonymy(); + + // Populates the variation relationship from WordNet + readWordNetVariation(); + + // Populates the usage, topicality, and regionality relationships from + // WordNet + readWordNetClasses(); + + // Populates the causality relationship from WordNet + readWordNetCausality(); + + // Populates the entailment relationship from WordNet + readWordNetEntailment(); + + // Populates the hypernymy relationship from WordNet + readWordNetHypernymy(); + + // Populates the instantiation relationship from WordNet + readWordNetInstantiation(); + + // Populates the member meronymy relationship from WordNet + readWordNetMemberMeronymy(); + + // Populates the part meronymy relationship from WordNet + readWordNetPartMeronymy(); + + // Populates the substance meronymy relationship from WordNet + readWordNetSubstanceMeronymy(); + + // Populates the pertainymy and mannernymy relationships from WordNet + readWordNetPertainymy(); + + // Populates the specification relationship from WordNet + readWordNetSpecification(); + + // Populates the adjective similarity relationship from WordNet + readWordNetSimilarity(); + + + + + + + - lines.push_back(line); } - progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); - for (auto line : lines) + void generator::readWordNetSynsets() { - ppgs.update(); + std::list lines(readFile(wordNetPath_ + "wn_s.pl")); + progress ppgs("Reading synsets from WordNet...", lines.size()); - std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$"); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id = stoi(relation_data[1]); - int wnum = stoi(relation_data[2]); - std::string word = relation_data[3]; - size_t word_it; - while ((word_it = 
word.find("''")) != std::string::npos) - { - word.erase(word_it, 1); - } - - std::string query; - switch (synset_id / 100000000) + for (std::string line : lines) { - case 1: // Noun - { - if (nouns.count(word) == 1) - { - query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)"; - } + ppgs.update(); - break; - } - - case 2: // Verb + std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - // Ignore - - break; + continue; } - - case 3: // Adjective + + int synset_id = std::stoi(relation_data[1]); + int wnum = std::stoi(relation_data[2]); + std::string text = relation_data[3]; + int tag_count = std::stoi(relation_data[4]); + size_t word_it; + while ((word_it = text.find("''")) != std::string::npos) { - if (adjectives.count(word) == 1) - { - query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; - } else { - query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)"; - } - - break; + text.erase(word_it, 1); } - - case 4: // Adverb - { - if (adjectives.count(word) == 1) - { - query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; - } else { - query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)"; - } - break; + // The WordNet data does contain duplicates, so we need to check that we + // haven't already created this word. 
+ std::pair lookup(synset_id, wnum); + if (!wordByWnidAndWnum_.count(lookup)) + { + notion& synset = lookupOrCreateNotion(synset_id); + lemma& lex = lookupOrCreateLemma(text); + word& entry = createWord(synset, lex, tag_count); + + wordByWnidAndWnum_[lookup] = &entry; } } + } - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); - switch (synset_id / 100000000) + void generator::readAdjectivePositioning() + { + std::list lines(readFile(wordNetPath_ + "wn_syntax.pl")); + progress ppgs("Reading adjective positionings from WordNet...", lines.size()); + + for (std::string line : lines) { - case 1: // Noun + ppgs.update(); + + std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { - return isupper(ch); - }) ? 
1 : 0)); - - sqlite3_bind_int(ppstmt, 3, verbly::split>(word, " ").size()); - sqlite3_bind_int(ppstmt, 4, images[synset_id]); - sqlite3_bind_int(ppstmt, 5, synset_id); - - if (nouns.count(word) == 1) - { - sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT); - } - - break; + continue; } + + int synset_id = stoi(relation_data[1]); + int wnum = stoi(relation_data[2]); + std::string adjpos_str = relation_data[3]; - case 3: // Adjective - case 4: // Adverb + std::pair lookup(synset_id, wnum); + if (wordByWnidAndWnum_.count(lookup)) { - sqlite3_bind_int(ppstmt, 2, verbly::split>(word, " ").size()); + word& adj = *wordByWnidAndWnum_.at(lookup); - if (adjectives.count(word) == 1) + if (adjpos_str == "p") + { + adj.setAdjectivePosition(positioning::predicate); + } else if (adjpos_str == "a") { - sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT); + adj.setAdjectivePosition(positioning::attributive); + } else if (adjpos_str == "i") + { + adj.setAdjectivePosition(positioning::postnominal); + } else { + // Can't happen because of how we specified the regex. + assert(false); } - - break; } } + } - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + void generator::readImageNetUrls() + { + // The ImageNet datafile is so large that it is unreasonable and + // unnecessary to read it into memory; instead, we will parse each line as + // we read it. This has the caveat that we cannot display a progress bar. + std::cout << "Reading image counts from ImageNet..." 
<< std::endl; + + std::ifstream file(imageNetPath_); + if (!file) { - db_error(ppdb, query); + throw std::invalid_argument("Could not find file " + imageNetPath_); } - - if (sqlite3_step(ppstmt) != SQLITE_ROW) + + std::string line; + while (std::getline(file, line)) { - db_error(ppdb, query); + if (line.back() == '\r') + { + line.pop_back(); + } + + std::string wnid_s = line.substr(1, 8); + int wnid = stoi(wnid_s) + 100000000; + if (notionByWnid_.count(wnid)) + { + // We know that this notion has a wnid and is a noun. + notionByWnid_.at(wnid)->incrementNumOfImages(); + } } + } - int rowid = sqlite3_column_int(ppstmt, 0); - wn[synset_id][wnum] = rowid; - - sqlite3_finalize(ppstmt); + void generator::readWordNetSenseKeys() + { + std::list lines(readFile(wordNetPath_ + "wn_sk.pl")); + progress ppgs("Reading sense keys from WordNet...", lines.size()); - std::string canonical(word); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - if (pronunciations.count(canonical) == 1) + for (std::string line : lines) { - for (auto pronunciation : pronunciations[canonical]) - { - switch (synset_id / 100000000) - { - case 1: // Noun - { - if (!pronunciation.rhyme.empty()) - { - query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; - } - - break; - } - - case 3: // Adjective - { - if (!pronunciation.rhyme.empty()) - { - query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; - } - - break; - } - - case 4: // Adverb - { - if (!pronunciation.rhyme.empty()) - { - query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, 
syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; - } - - break; - } - } - - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } + ppgs.update(); - sqlite3_bind_int(ppstmt, 1, rowid); - sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); - sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); - sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); - - if (!pronunciation.rhyme.empty()) - { - sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); - } + // We only actually need to lookup verbs by sense key so we'll just + // ignore everything that isn't a verb. + std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$"); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id = stoi(relation_data[1]); + int wnum = stoi(relation_data[2]); + std::string sense_key = relation_data[3]; - if (sqlite3_step(ppstmt) != SQLITE_DONE) + // We are treating this mapping as injective, which is not entirely + // accurate. First, the WordNet table contains duplicate rows, so those + // need to be ignored. More importantly, a small number of sense keys + // (one for each letter of the Latin alphabet, plus 9 other words) each + // map to two different words in the same synset which differ only by + // capitalization. Luckily, none of these exceptions are verbs, so we + // can pretend that the mapping is injective. 
+ if (!wnSenseKeys_.count(sense_key)) + { + std::pair lookup(synset_id, wnum); + if (wordByWnidAndWnum_.count(lookup)) { - db_error(ppdb, query); + wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup); } - - sqlite3_finalize(ppstmt); } } } - } - - // While we're working on s - { - progress ppgs("Writing word synonyms...", wn.size()); - for (auto sense : wn) + + void generator::readVerbNet() { - ppgs.update(); - - for (auto word1 : sense.second) + std::cout << "Reading frames from VerbNet..." << std::endl; + + DIR* dir; + if ((dir = opendir(verbNetPath_.c_str())) == nullptr) + { + throw std::invalid_argument("Invalid VerbNet data directory"); + } + + struct dirent* ent; + while ((ent = readdir(dir)) != nullptr) { - for (auto word2 : sense.second) + std::string filename(verbNetPath_); + + if (filename.back() != '/') { - if (word1 != word2) - { - std::string query; - switch (sense.first / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; - - break; - } - - case 2: // Verb - { - // Ignore - - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; - - break; - } - - case 4: // Adverb - { - query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; - - break; - } - } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } + filename += '/'; + } - sqlite3_bind_int(ppstmt, 1, word1.second); - sqlite3_bind_int(ppstmt, 2, word2.second); + filename += ent->d_name; - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } + if (filename.rfind(".xml") != filename.size() - 4) + { + continue; + } - sqlite3_finalize(ppstmt); - } + xmlDocPtr doc = xmlParseFile(filename.c_str()); + if (doc == nullptr) + { + throw std::logic_error("Error opening " + filename); } - } - } - } - - // ant table - { - std::ifstream 
wnantfile(wnpref + "wn_ant.pl"); - if (!wnantfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnantfile, line)) - { - break; - } - if (line.back() == '\r') - { - line.pop_back(); + xmlNodePtr top = xmlDocGetRootElement(doc); + if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast("VNCLASS")))) + { + throw std::logic_error("Bad VerbNet file format: " + filename); + } + + try + { + createGroup(top); + } catch (const std::exception& e) + { + std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); + } } - - lines.push_back(line); + + closedir(dir); } - progress ppgs("Writing antonyms...", lines.size()); - for (auto line : lines) + void generator::readAgidInflections() { - ppgs.update(); + std::list lines(readFile(agidPath_)); + progress ppgs("Reading inflections from AGID...", lines.size()); - std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + for (std::string line : lines) { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); + ppgs.update(); + + int divider = line.find_first_of(" "); + std::string infinitive = line.substr(0, divider); + line = line.substr(divider+1); + char type = line[0]; - std::string query; - switch (synset_id_1 / 100000000) - { - case 1: // Noun + if (line[1] == '?') { - query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; - - break; + line.erase(0, 4); + } else { + line.erase(0, 3); } - - case 2: // Verb + + if (!lemmaByBaseForm_.count(infinitive) && (type != 'V')) { - // Ignore + continue; + } - break; + lemma& curLemma = lookupOrCreateLemma(infinitive); + + auto forms = split>(line, " | "); + for (std::string& inflForm 
: forms) + { + int sympos = inflForm.find_first_of(",?"); + if (sympos != std::string::npos) + { + inflForm = inflForm.substr(0, sympos); + } } - - case 3: // Adjective + + switch (type) { - query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; + case 'V': + { + if (forms.size() == 4) + { + curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1])); + curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2])); + curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3])); + } else if (forms.size() == 3) + { + curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1])); + curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2])); + } else if (forms.size() == 8) + { + // As of AGID 2014.08.11, this is only "to be" + curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2])); + curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3])); + curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4])); + } else { + // Words that don't fit the cases above as of AGID 2014.08.11: + // - may and shall do not conjugate the way we want them to + // - methinks only has a past tense and is an outlier + // - wit has five forms, and is archaic/obscure enough that we can ignore it for now + std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; + } + + // For verbs in particular, we sometimes create a notion and a word + // from inflection data. Specifically, if there are not yet any + // verbs existing that have the same infinitive form. 
"Yet" means + // that this verb appears in the AGID data but not in either WordNet + // or VerbNet. + if (!wordsByBaseForm_.count(infinitive) + || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) { + return w->getNotion().getPartOfSpeech() == part_of_speech::verb; + })) + { + notion& n = createNotion(part_of_speech::verb); + createWord(n, curLemma); + } - break; - } + break; + } - case 4: // Adverb - { - query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; + case 'A': + { + if (forms.size() == 2) + { + curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1])); + } else { + // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" + std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; + } + + break; + } + + case 'N': + { + if (forms.size() == 1) + { + curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0])); + } else { + // As of AGID 2014.08.11, this is non-existent. + std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; + } - break; + break; + } } } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); } - } - - // at table - { - std::ifstream wnatfile(wnpref + "wn_at.pl"); - if (!wnatfile.is_open()) - { - std::cout << "Invalid WordNet data directory." 
<< std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnatfile, line)) - { - break; - } - if (line.back() == '\r') + void generator::readPrepositions() + { + std::list lines(readFile("prepositions.txt")); + progress ppgs("Reading prepositions...", lines.size()); + + for (std::string line : lines) { - line.pop_back(); + ppgs.update(); + + std::regex relation("^([^:]+): (.+)"); + std::smatch relation_data; + std::regex_search(line, relation_data, relation); + std::string prep = relation_data[1]; + auto groups = split>(relation_data[2], ", "); + + notion& n = createNotion(part_of_speech::preposition); + lemma& l = lookupOrCreateLemma(prep); + word& w = createWord(n, l); + + n.setPrepositionGroups(groups); } - - lines.push_back(line); } - progress ppgs("Writing variations...", lines.size()); - for (auto line : lines) + void generator::readCmudictPronunciations() { - ppgs.update(); - - std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); + std::list lines(readFile(cmudictPath_)); + progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); - for (auto mapping1 : wn[synset_id_1]) + for (std::string line : lines) { - for (auto mapping2 : wn[synset_id_2]) + ppgs.update(); + + std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? 
([A-Z 0-9]+)"); + std::smatch phoneme_data; + if (std::regex_search(line, phoneme_data, phoneme)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + std::string canonical(phoneme_data[1]); + std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - if (sqlite3_step(ppstmt) != SQLITE_DONE) + if (!formByText_.count(canonical)) { - db_error(ppdb, query); + continue; } - sqlite3_finalize(ppstmt); + std::string phonemes = phoneme_data[2]; + pronunciations_.emplace_back(phonemes); + pronunciation& p = pronunciations_.back(); + formByText_.at(canonical)->addPronunciation(p); } } } - } - - // der table - { - std::ifstream wnderfile(wnpref + "wn_der.pl"); - if (!wnderfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - std::list lines; - for (;;) + void generator::writeSchema() { - std::string line; - if (!getline(wnderfile, line)) + std::ifstream file("schema.sql"); + if (!file) { - break; + throw std::invalid_argument("Could not find database schema"); } - - if (line.back() == '\r') + + std::ostringstream schemaBuilder; + std::string line; + while (std::getline(file, line)) { - line.pop_back(); + if (line.back() == '\r') + { + line.pop_back(); + } + + schemaBuilder << line; } - lines.push_back(line); + std::string schema = schemaBuilder.str(); + auto queries = split>(schema, ";"); + progress ppgs("Writing database schema...", queries.size()); + for (std::string query : queries) + { + if (!queries.empty()) + { + db_.runQuery(query); + } + + ppgs.update(); + } } - progress ppgs("Writing morphological derivation...", lines.size()); - for (auto line : lines) + void generator::dumpObjects() { - ppgs.update(); - - std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); - 
std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) { - continue; + progress ppgs("Writing notions...", notions_.size()); + + for (notion& n : notions_) + { + db_ << n; + + ppgs.update(); + } } - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - std::string query; - switch (synset_id_1 / 100000000) { - case 1: // Noun + progress ppgs("Writing words...", words_.size()); + + for (word& w : words_) { - switch (synset_id_2 / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)"; - break; - } - - case 3: // Adjective - { - query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)"; - break; - } - - case 4: // Adverb - { - query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)"; - break; - } - } + db_ << w; - break; + ppgs.update(); } + } + + { + progress ppgs("Writing lemmas...", lemmas_.size()); - case 3: // Adjective + for (lemma& l : lemmas_) { - switch (synset_id_2 / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)"; - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)"; - break; - } - - case 4: // Adverb - { - query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)"; - break; - } - } + db_ << l; - break; + ppgs.update(); } + } + + { + progress ppgs("Writing forms...", forms_.size()); - case 4: // Adverb + for (form& f : forms_) { - switch (synset_id_2 / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)"; - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)"; - break; - } 
- - case 4: // Adverb - { - query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)"; - break; - } - } + db_ << f; - break; + ppgs.update(); } } - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) { - db_error(ppdb, query); + progress ppgs("Writing pronunciations...", pronunciations_.size()); + + for (pronunciation& p : pronunciations_) + { + db_ << p; + + ppgs.update(); + } } - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) { - db_error(ppdb, query); + progress ppgs("Writing verb groups...", groups_.size()); + + for (group& g : groups_) + { + db_ << g; + + ppgs.update(); + } } - sqlite3_finalize(ppstmt); - } - } - - // hyp table - { - std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); - if (!wnhypfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnhypfile, line)) { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); + progress ppgs("Writing verb frames...", frames_.size()); + + for (frame& f : frames_) + { + db_ << f; + + ppgs.update(); + } } - - lines.push_back(line); } - progress ppgs("Writing hypernyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetAntonymy() { - ppgs.update(); - - std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_ant.pl")); + progress ppgs("Writing antonyms...", lines.size()); + for (auto line : lines) { - continue; - } + ppgs.update(); - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { 
- for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + continue; + } + + std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); + std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); + + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) + { + word& word1 = *wordByWnidAndWnum_.at(lookup1); + word& word2 = *wordByWnidAndWnum_.at(lookup2); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } + std::list fields; + fields.emplace_back("antonym_1_id", word1.getId()); + fields.emplace_back("antonym_2_id", word2.getId()); - sqlite3_finalize(ppstmt); + db_.insertIntoTable("antonymy", std::move(fields)); } } } - } - - // ins table - { - std::ifstream wninsfile(wnpref + "wn_ins.pl"); - if (!wninsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." 
<< std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wninsfile, line)) - { - break; - } - if (line.back() == '\r') + void generator::readWordNetVariation() + { + std::list lines(readFile(wordNetPath_ + "wn_at.pl")); + progress ppgs("Writing variation...", lines.size()); + for (auto line : lines) { - line.pop_back(); - } + ppgs.update(); - lines.push_back(line); + std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("noun_id", notion1.getId()); + fields.emplace_back("adjective_id", notion2.getId()); + + db_.insertIntoTable("variation", std::move(fields)); + } + } } - progress ppgs("Writing instantiations...", lines.size()); - for (auto line : lines) + void generator::readWordNetClasses() { - ppgs.update(); - - std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_cls.pl")); + progress ppgs("Writing usage, topicality, and regionality...", lines.size()); + for (auto line : lines) { - continue; - } + ppgs.update(); - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + std::pair 
lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); + std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); + std::string class_type = relation_data[5]; + + std::string table_name; + if (class_type == "t") + { + table_name += "topicality"; + } else if (class_type == "u") + { + table_name += "usage"; + } else if (class_type == "r") + { + table_name += "regionality"; + } + + std::list leftJoin; + std::list rightJoin; + + if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) + { + std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { + return w->getId(); + }); + } else if (wordByWnidAndWnum_.count(lookup1)) { + leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); + } + + if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) + { + std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { + return w->getId(); + }); + } else if (wordByWnidAndWnum_.count(lookup2)) { + rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); + } + + for (int word1 : leftJoin) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + for (int word2 : rightJoin) { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + std::list fields; + fields.emplace_back("term_id", word1); + fields.emplace_back("domain_id", word2); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); + db_.insertIntoTable(table_name, std::move(fields)); } - - sqlite3_finalize(ppstmt); } } } - } - - // mm table - { - std::ifstream wnmmfile(wnpref + "wn_mm.pl"); - if (!wnmmfile.is_open()) - { - std::cout << "Invalid WordNet data directory." 
<< std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnmmfile, line)) - { - break; - } - if (line.back() == '\r') + void generator::readWordNetCausality() + { + std::list lines(readFile(wordNetPath_ + "wn_cs.pl")); + progress ppgs("Writing causality...", lines.size()); + for (auto line : lines) { - line.pop_back(); - } + ppgs.update(); - lines.push_back(line); + std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("effect_id", notion1.getId()); + fields.emplace_back("cause_id", notion2.getId()); + + db_.insertIntoTable("causality", std::move(fields)); + } + } } - progress ppgs("Writing member meronyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetEntailment() { - ppgs.update(); - - std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_ent.pl")); + progress ppgs("Writing entailment...", lines.size()); + for (auto line : lines) { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + ppgs.update(); - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), 
&ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } + std::list fields; + fields.emplace_back("given_id", notion1.getId()); + fields.emplace_back("entailment_id", notion2.getId()); - sqlite3_finalize(ppstmt); + db_.insertIntoTable("entailment", std::move(fields)); } } } - } - - // ms table - { - std::ifstream wnmsfile(wnpref + "wn_ms.pl"); - if (!wnmsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) + + void generator::readWordNetHypernymy() { - std::string line; - if (!getline(wnmsfile, line)) + std::list lines(readFile(wordNetPath_ + "wn_hyp.pl")); + progress ppgs("Writing hypernymy...", lines.size()); + for (auto line : lines) { - break; + ppgs.update(); + + std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("hyponym_id", notion1.getId()); + fields.emplace_back("hypernym_id", notion2.getId()); + + db_.insertIntoTable("hypernymy", std::move(fields)); + } } + } - if (line.back() == '\r') + void generator::readWordNetInstantiation() + { + std::list lines(readFile(wordNetPath_ + "wn_ins.pl")); + progress 
ppgs("Writing instantiation...", lines.size()); + for (auto line : lines) { - line.pop_back(); + ppgs.update(); + + std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("instance_id", notion1.getId()); + fields.emplace_back("class_id", notion2.getId()); + + db_.insertIntoTable("instantiation", std::move(fields)); + } } - - lines.push_back(line); } - progress ppgs("Writing substance meronyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetMemberMeronymy() { - ppgs.update(); - - std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_mm.pl")); + progress ppgs("Writing member meronymy...", lines.size()); + for (auto line : lines) { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + ppgs.update(); - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = 
std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } + std::list fields; + fields.emplace_back("holonym_id", notion1.getId()); + fields.emplace_back("meronym_id", notion2.getId()); - sqlite3_finalize(ppstmt); + db_.insertIntoTable("member_meronymy", std::move(fields)); } } } - } - - // mm table - { - std::ifstream wnmpfile(wnpref + "wn_mp.pl"); - if (!wnmpfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) + + void generator::readWordNetPartMeronymy() { - std::string line; - if (!getline(wnmpfile, line)) + std::list lines(readFile(wordNetPath_ + "wn_mp.pl")); + progress ppgs("Writing part meronymy...", lines.size()); + for (auto line : lines) { - break; + ppgs.update(); + + std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("holonym_id", notion1.getId()); + fields.emplace_back("meronym_id", notion2.getId()); + + db_.insertIntoTable("part_meronymy", std::move(fields)); + } } + } - if (line.back() == '\r') + void generator::readWordNetSubstanceMeronymy() + { + std::list lines(readFile(wordNetPath_ + "wn_ms.pl")); + progress ppgs("Writing substance meronymy...", lines.size()); + for (auto line : lines) { - line.pop_back(); - } + ppgs.update(); - lines.push_back(line); + std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if 
(!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("holonym_id", notion1.getId()); + fields.emplace_back("meronym_id", notion2.getId()); + + db_.insertIntoTable("substance_meronymy", std::move(fields)); + } + } } - progress ppgs("Writing part meronyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetPertainymy() { - ppgs.update(); - - std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_per.pl")); + progress ppgs("Writing pertainymy and mannernymy...", lines.size()); + for (auto line : lines) { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + ppgs.update(); - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } + continue; + } + + std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); + std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); + + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) + { + word& word1 = *wordByWnidAndWnum_.at(lookup1); + word& word2 = *wordByWnidAndWnum_.at(lookup2); - sqlite3_bind_int(ppstmt, 1, 
mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) + { + std::list fields; + fields.emplace_back("pertainym_id", word1.getId()); + fields.emplace_back("noun_id", word2.getId()); - if (sqlite3_step(ppstmt) != SQLITE_DONE) + db_.insertIntoTable("pertainymy", std::move(fields)); + } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) { - db_error(ppdb, query); - } + std::list fields; + fields.emplace_back("mannernym_id", word1.getId()); + fields.emplace_back("adjective_id", word2.getId()); - sqlite3_finalize(ppstmt); + db_.insertIntoTable("mannernymy", std::move(fields)); + } } } } - } - - // per table - { - std::ifstream wnperfile(wnpref + "wn_per.pl"); - if (!wnperfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnperfile, line)) - { - break; - } - if (line.back() == '\r') + void generator::readWordNetSpecification() + { + std::list lines(readFile(wordNetPath_ + "wn_sa.pl")); + progress ppgs("Writing specifications...", lines.size()); + for (auto line : lines) { - line.pop_back(); + ppgs.update(); + + std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); + std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); + + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) + { + word& word1 = *wordByWnidAndWnum_.at(lookup1); + word& word2 = *wordByWnidAndWnum_.at(lookup2); + + std::list fields; + fields.emplace_back("general_id", word1.getId()); + fields.emplace_back("specific_id", word2.getId()); + + db_.insertIntoTable("specification", std::move(fields)); + } } - - lines.push_back(line); } - 
progress ppgs("Writing pertainyms and mannernyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetSimilarity() { - ppgs.update(); - - std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_sim.pl")); + progress ppgs("Writing adjective similarity...", lines.size()); + for (auto line : lines) { - continue; - } + ppgs.update(); - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - std::string query; - switch (synset_id_1 / 100000000) - { - case 3: // Adjective + std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - // This is a pertainym, the second word should be a noun - // Technically it can be an adjective but we're ignoring that - if (synset_id_2 / 100000000 != 1) - { - continue; - } - - query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; - - break; + continue; } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); - case 4: // Adverb + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { - // This is a mannernym, the second word should be an adjective - if (synset_id_2 / 100000000 != 3) - { - continue; - } + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); - query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; + std::list fields; + fields.emplace_back("adjective_1_id", notion1.getId()); + fields.emplace_back("adjective_2_id", notion2.getId()); - break; + db_.insertIntoTable("similarity", std::move(fields)); } } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - 
db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); + } - if (sqlite3_step(ppstmt) != SQLITE_DONE) + std::list generator::readFile(std::string path) + { + std::ifstream file(path); + if (!file) { - db_error(ppdb, query); + throw std::invalid_argument("Could not find file " + path); } - - sqlite3_finalize(ppstmt); - } - } - // sa table - { - std::ifstream wnsafile(wnpref + "wn_sa.pl"); - if (!wnsafile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { + std::list lines; std::string line; - if (!getline(wnsafile, line)) - { - break; - } - - if (line.back() == '\r') + while (std::getline(file, line)) { - line.pop_back(); + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); } - lines.push_back(line); + return lines; } - progress ppgs("Writing specifications...", lines.size()); - for (auto line : lines) + part_of_speech generator::partOfSpeechByWnid(int wnid) { - ppgs.update(); - - std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + switch (wnid / 100000000) { - db_error(ppdb, query); + case 1: return part_of_speech::noun; + case 2: return part_of_speech::verb; + case 3: return part_of_speech::adjective; + case 4: return part_of_speech::adverb; + default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); } + } - sqlite3_bind_int(ppstmt, 1, 
wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); + notion& generator::createNotion(part_of_speech partOfSpeech) + { + notions_.emplace_back(partOfSpeech); + + return notions_.back(); + } - if (sqlite3_step(ppstmt) != SQLITE_DONE) + notion& generator::lookupOrCreateNotion(int wnid) + { + if (!notionByWnid_.count(wnid)) { - db_error(ppdb, query); + notions_.emplace_back(partOfSpeechByWnid(wnid), wnid); + notionByWnid_[wnid] = ¬ions_.back(); } - - sqlite3_finalize(ppstmt); - } - } - - // sim table - { - std::ifstream wnsimfile(wnpref + "wn_sim.pl"); - if (!wnsimfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); + + return *notionByWnid_.at(wnid); } - - std::list lines; - for (;;) + + lemma& generator::lookupOrCreateLemma(std::string base_form) { - std::string line; - if (!getline(wnsimfile, line)) + if (!lemmaByBaseForm_.count(base_form)) { - break; + lemmas_.emplace_back(lookupOrCreateForm(base_form)); + lemmaByBaseForm_[base_form] = &lemmas_.back(); } + + return *lemmaByBaseForm_.at(base_form); + } - if (line.back() == '\r') + form& generator::lookupOrCreateForm(std::string text) + { + if (!formByText_.count(text)) { - line.pop_back(); + forms_.emplace_back(text); + formByText_[text] = &forms_.back(); } - lines.push_back(line); + return *formByText_[text]; } - progress ppgs("Writing sense synonyms...", lines.size()); - for (auto line : lines) + template word& generator::createWord(Args&&... 
args) { - ppgs.update(); + words_.emplace_back(std::forward(args)...); + word& w = words_.back(); - std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w); + + if (w.getNotion().hasWnid()) { - continue; + wordsByWnid_[w.getNotion().getWnid()].insert(&w); } - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); + return w; + } + + group& generator::createGroup(xmlNodePtr top) + { + groups_.emplace_back(); + group& grp = groups_.back(); - for (auto mapping1 : wn[synset_id_1]) + xmlChar* key; + + for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) { - for (auto mapping2 : wn[synset_id_2]) + if (!xmlStrcmp(node->name, reinterpret_cast("SUBCLASSES"))) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) { - db_error(ppdb, query); + if (!xmlStrcmp(subclass->name, reinterpret_cast("VNSUBCLASS"))) + { + try + { + group& subgrp = createGroup(subclass); + subgrp.setParent(grp); + } catch (const std::exception& e) + { + key = xmlGetProp(subclass, reinterpret_cast("ID")); + + if (key == nullptr) + { + std::throw_with_nested(std::logic_error("Error parsing IDless subgroup")); + } else { + std::string subgroupId(reinterpret_cast(key)); + xmlFree(key); + + std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId)); + } + } + } } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + } else if (!xmlStrcmp(node->name, reinterpret_cast("MEMBERS"))) + { + for (xmlNodePtr member = 
node->xmlChildrenNode; member != nullptr; member = member->next) { - db_error(ppdb, query); + if (!xmlStrcmp(member->name, reinterpret_cast("MEMBER"))) + { + key = xmlGetProp(member, reinterpret_cast("wn")); + std::string wnSenses(reinterpret_cast(key)); + xmlFree(key); + + auto wnSenseKeys = split>(wnSenses, " "); + if (!wnSenseKeys.empty()) + { + std::list tempKeys; + + std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { + return sense + "::"; + }); + + std::list filteredKeys; + + std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { + return !wnSenseKeys_.count(sense); + }); + + wnSenseKeys = std::move(filteredKeys); + } + + if (!wnSenseKeys.empty()) + { + for (std::string sense : wnSenseKeys) + { + word& wordSense = *wnSenseKeys_[sense]; + wordSense.setVerbGroup(grp); + } + } else { + key = xmlGetProp(member, reinterpret_cast("name")); + std::string memberName(reinterpret_cast(key)); + xmlFree(key); + + notion& n = createNotion(part_of_speech::verb); + lemma& l = lookupOrCreateLemma(memberName); + word& w = createWord(n, l); + + w.setVerbGroup(grp); + } + } } - - sqlite3_reset(ppstmt); - sqlite3_clear_bindings(ppstmt); - - sqlite3_bind_int(ppstmt, 1, mapping2.second); - sqlite3_bind_int(ppstmt, 2, mapping1.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + } else if (!xmlStrcmp(node->name, reinterpret_cast("THEMROLES"))) + { + for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next) { - db_error(ppdb, query); + if (!xmlStrcmp(roletopnode->name, reinterpret_cast("THEMROLE"))) + { + role r; + + key = xmlGetProp(roletopnode, reinterpret_cast("type")); + std::string roleName = reinterpret_cast(key); + xmlFree(key); + + for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) + { + if (!xmlStrcmp(rolenode->name, 
reinterpret_cast("SELRESTRS"))) + { + r.setSelrestrs(parseSelrestr(rolenode)); + } + } + + grp.addRole(roleName, std::move(r)); + } } + } else if (!xmlStrcmp(node->name, reinterpret_cast("FRAMES"))) + { + for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next) + { + if (!xmlStrcmp(frametopnode->name, reinterpret_cast("FRAME"))) + { + frames_.emplace_back(); + frame& fr = frames_.back(); - sqlite3_finalize(ppstmt); + for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) + { + if (!xmlStrcmp(framenode->name, reinterpret_cast("SYNTAX"))) + { + for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) + { + if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("NP"))) + { + key = xmlGetProp(syntaxnode, reinterpret_cast("value")); + std::string partRole = reinterpret_cast(key); + xmlFree(key); + + selrestr partSelrestrs; + std::set partSynrestrs; + + for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) + { + if (!xmlStrcmp(npnode->name, reinterpret_cast("SYNRESTRS"))) + { + for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) + { + if (!xmlStrcmp(synrestr->name, reinterpret_cast("SYNRESTR"))) + { + key = xmlGetProp(synrestr, reinterpret_cast("type")); + partSynrestrs.insert(reinterpret_cast(key)); + xmlFree(key); + } + } + } + + if (!xmlStrcmp(npnode->name, reinterpret_cast("SELRESTRS"))) + { + partSelrestrs = parseSelrestr(npnode); + } + } + + fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs))); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("VERB"))) + { + fr.push_back(part::createVerb()); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("PREP"))) + { + std::set partChoices; + bool partLiteral; + + if (xmlHasProp(syntaxnode, 
reinterpret_cast("value"))) + { + partLiteral = true; + + key = xmlGetProp(syntaxnode, reinterpret_cast("value")); + std::string choicesStr = reinterpret_cast(key); + xmlFree(key); + + split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices))); + } else { + partLiteral = false; + + for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) + { + if (!xmlStrcmp(npnode->name, reinterpret_cast("SELRESTRS"))) + { + for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) + { + if (!xmlStrcmp(synrestr->name, reinterpret_cast("SELRESTR"))) + { + key = xmlGetProp(synrestr, reinterpret_cast("type")); + partChoices.insert(reinterpret_cast(key)); + xmlFree(key); + } + } + } + } + } + + fr.push_back(part::createPreposition(std::move(partChoices), partLiteral)); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("ADJ"))) + { + fr.push_back(part::createAdjective()); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("ADV"))) + { + fr.push_back(part::createAdverb()); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("LEX"))) + { + key = xmlGetProp(syntaxnode, reinterpret_cast("value")); + std::string literalValue = reinterpret_cast(key); + xmlFree(key); + + fr.push_back(part::createLiteral(literalValue)); + } else { + continue; + } + } + + grp.addFrame(fr); + } + } + } + } } } - } - } - - // syntax table - { - std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); - if (!wnsyntaxfile.is_open()) - { - std::cout << "Invalid WordNet data directory." 
<< std::endl; - print_usage(); - } - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnsyntaxfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); + return grp; } - progress ppgs("Writing adjective syntax markers...", lines.size()); - for (auto line : lines) + selrestr generator::parseSelrestr(xmlNodePtr top) { - ppgs.update(); - - std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id = stoi(relation_data[1]); - int wnum = stoi(relation_data[2]); - std::string syn = relation_data[3]; - std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + xmlChar* key; + + if (!xmlStrcmp(top->name, reinterpret_cast("SELRESTRS"))) { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); - sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + if (xmlChildElementCount(top) == 0) + { + return {}; + } else if (xmlChildElementCount(top) == 1) + { + return parseSelrestr(xmlFirstElementChild(top)); + } else { + bool orlogic = false; + if (xmlHasProp(top, reinterpret_cast("logic"))) + { + key = xmlGetProp(top, reinterpret_cast("logic")); + if (!xmlStrcmp(key, reinterpret_cast("or"))) + { + orlogic = true; + } + + xmlFree(key); + } + + std::list children; + for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) + { + if (!xmlStrcmp(selrestr->name, reinterpret_cast("SELRESTRS")) + || !xmlStrcmp(selrestr->name, reinterpret_cast("SELRESTR"))) + { + children.push_back(parseSelrestr(selrestr)); + } + } + + return selrestr(children, orlogic); + } + } else if (!xmlStrcmp(top->name, reinterpret_cast("SELRESTR"))) { - 
db_error(ppdb, query); + key = xmlGetProp(top, reinterpret_cast("Value")); + bool selPos = (std::string(reinterpret_cast(key)) == "+"); + xmlFree(key); + + key = xmlGetProp(top, reinterpret_cast("type")); + std::string selRestriction = reinterpret_cast(key); + xmlFree(key); + + return selrestr(selRestriction, selPos); + } else { + throw std::logic_error("Badly formatted selrestr"); } - - sqlite3_finalize(ppstmt); } - } - - sqlite3_close_v2(ppdb); - - std::cout << "Done." << std::endl; -} + + }; +}; -- cgit 1.4.1