From 2c5ed155e8951f7f28b82ed7570295d5629f4770 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Sun, 22 Jan 2017 18:43:25 -0500 Subject: Fixed generator ignoring multiple inflection variants Previously, the generator would recognize at most one form per inflection per lemma; now, the generator adds all variants in AGID to the database. --- generator/generator.cpp | 525 ++++++++++++++++++++++++------------------------ 1 file changed, 268 insertions(+), 257 deletions(-) diff --git a/generator/generator.cpp b/generator/generator.cpp index 3dd2ce7..610a602 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -15,7 +15,7 @@ namespace verbly { namespace generator { - + generator::generator( std::string verbNetPath, std::string agidPath, @@ -36,21 +36,21 @@ namespace verbly { { throw std::invalid_argument("Invalid VerbNet data directory"); } - + closedir(dir); - + // Ensure AGID infl.txt exists if (!std::ifstream(agidPath_)) { throw std::invalid_argument("AGID infl.txt file not found"); } - + // Add directory separator to WordNet path if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) { wordNetPath_ += '/'; } - + // Ensure WordNet tables exist for (std::string table : { "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" @@ -61,37 +61,37 @@ namespace verbly { throw std::invalid_argument("WordNet " + table + " table not found"); } } - + // Ensure CMUDICT file exists if (!std::ifstream(cmudictPath_)) { throw std::invalid_argument("CMUDICT file not found"); } - + // Ensure ImageNet urls.txt exists if (!std::ifstream(imageNetPath_)) { throw std::invalid_argument("ImageNet urls.txt file not found"); } } - + void generator::run() { // Create notions, words, lemmas, and forms from WordNet synsets readWordNetSynsets(); - + // Reads adjective positioning WordNet data readAdjectivePositioning(); - - // Counts the number of URLs ImageNet has per notion + + // Counts the number of URLs ImageNet has per notion readImageNetUrls(); - + // Creates a word by WordNet sense key lookup table readWordNetSenseKeys(); - + // Creates groups and frames from VerbNet data readVerbNet(); - + // Creates forms and inflections from AGID. To reduce the amount of forms // created, we do this after most lemmas that need inflecting have been // created through other means, and then only generate forms for @@ -101,86 +101,78 @@ namespace verbly { // then a notion and a word is generated and the form generation proceeds // as usual. readAgidInflections(); - + // Reads in prepositions and the is_a relationship readPrepositions(); - + // Creates pronunciations from CMUDICT. To reduce the amount of // pronunciations created, we do this after all forms have been created, // and then only generate pronunciations for already-exisiting forms. readCmudictPronunciations(); - + // Writes the database schema writeSchema(); - + // Dumps data to the database dumpObjects(); - + // Populates the antonymy relationship from WordNet readWordNetAntonymy(); - + // Populates the variation relationship from WordNet readWordNetVariation(); - + // Populates the usage, topicality, and regionality relationships from // WordNet readWordNetClasses(); - + // Populates the causality relationship from WordNet readWordNetCausality(); - + // Populates the entailment relationship from WordNet readWordNetEntailment(); - + // Populates the hypernymy relationship from WordNet readWordNetHypernymy(); - + // Populates the instantiation relationship from WordNet readWordNetInstantiation(); - + // Populates the member meronymy relationship from WordNet readWordNetMemberMeronymy(); - + // Populates the part meronymy relationship from WordNet readWordNetPartMeronymy(); - + // Populates the substance meronymy relationship from WordNet readWordNetSubstanceMeronymy(); - + // Populates the pertainymy and mannernymy relationships from WordNet readWordNetPertainymy(); - + // Populates the specification relationship from WordNet readWordNetSpecification(); - + // Populates the adjective similarity relationship from WordNet readWordNetSimilarity(); - - - - - - - - } - + void generator::readWordNetSynsets() { std::list lines(readFile(wordNetPath_ + "wn_s.pl")); progress ppgs("Reading synsets from WordNet...", lines.size()); - + for (std::string line : lines) { ppgs.update(); - + std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) { continue; } - + int synset_id = std::stoi(relation_data[1]); int wnum = std::stoi(relation_data[2]); std::string text = relation_data[3]; @@ -190,7 +182,7 @@ namespace verbly { { text.erase(word_it, 1); } - + // The WordNet data does contain duplicates, so we need to check that we // haven't already created this word. std::pair lookup(synset_id, wnum); @@ -204,32 +196,32 @@ namespace verbly { } } } - + void generator::readAdjectivePositioning() { std::list lines(readFile(wordNetPath_ + "wn_syntax.pl")); progress ppgs("Reading adjective positionings from WordNet...", lines.size()); - + for (std::string line : lines) { ppgs.update(); - + std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) { continue; } - + int synset_id = stoi(relation_data[1]); int wnum = stoi(relation_data[2]); std::string adjpos_str = relation_data[3]; - + std::pair lookup(synset_id, wnum); if (wordByWnidAndWnum_.count(lookup)) { word& adj = *wordByWnidAndWnum_.at(lookup); - + if (adjpos_str == "p") { adj.setAdjectivePosition(positioning::predicate); @@ -246,20 +238,20 @@ namespace verbly { } } } - + void generator::readImageNetUrls() { // The ImageNet datafile is so large that it is unreasonable and // unnecessary to read it into memory; instead, we will parse each line as // we read it. This has the caveat that we cannot display a progress bar. std::cout << "Reading image counts from ImageNet..." << std::endl; - + std::ifstream file(imageNetPath_); if (!file) { throw std::invalid_argument("Could not find file " + imageNetPath_); } - + std::string line; while (std::getline(file, line)) { @@ -267,7 +259,7 @@ namespace verbly { { line.pop_back(); } - + std::string wnid_s = line.substr(1, 8); int wnid = stoi(wnid_s) + 100000000; if (notionByWnid_.count(wnid)) @@ -277,16 +269,16 @@ namespace verbly { } } } - + void generator::readWordNetSenseKeys() { std::list lines(readFile(wordNetPath_ + "wn_sk.pl")); progress ppgs("Reading sense keys from WordNet...", lines.size()); - + for (std::string line : lines) { ppgs.update(); - + // We only actually need to lookup verbs by sense key so we'll just // ignore everything that isn't a verb. std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$"); @@ -295,11 +287,11 @@ namespace verbly { { continue; } - + int synset_id = stoi(relation_data[1]); int wnum = stoi(relation_data[2]); std::string sense_key = relation_data[3]; - + // We are treating this mapping as injective, which is not entirely // accurate. First, the WordNet table contains duplicate rows, so those // need to be ignored. More importantly, a small number of sense keys @@ -317,17 +309,17 @@ namespace verbly { } } } - + void generator::readVerbNet() { std::cout << "Reading frames from VerbNet..." << std::endl; - + DIR* dir; if ((dir = opendir(verbNetPath_.c_str())) == nullptr) { throw std::invalid_argument("Invalid VerbNet data directory"); } - + struct dirent* ent; while ((ent = readdir(dir)) != nullptr) { @@ -337,20 +329,20 @@ namespace verbly { { filename += '/'; } - + filename += ent->d_name; - + if (filename.rfind(".xml") != filename.size() - 4) { continue; } - + xmlDocPtr doc = xmlParseFile(filename.c_str()); if (doc == nullptr) { throw std::logic_error("Error opening " + filename); } - + xmlNodePtr top = xmlDocGetRootElement(doc); if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast("VNCLASS")))) { @@ -365,71 +357,81 @@ namespace verbly { std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); } } - + closedir(dir); } - + void generator::readAgidInflections() { std::list lines(readFile(agidPath_)); progress ppgs("Reading inflections from AGID...", lines.size()); - + for (std::string line : lines) { ppgs.update(); - + int divider = line.find_first_of(" "); std::string infinitive = line.substr(0, divider); line = line.substr(divider+1); char type = line[0]; - + if (line[1] == '?') { line.erase(0, 4); } else { line.erase(0, 3); } - + if (!lemmaByBaseForm_.count(infinitive) && (type != 'V')) { continue; } - + lemma& curLemma = lookupOrCreateLemma(infinitive); - auto forms = split>(line, " | "); - for (std::string& inflForm : forms) + std::vector> agidForms; + for (std::string inflForms : split>(line, " | ")) { - int sympos = inflForm.find_first_of(",?"); - if (sympos != std::string::npos) + std::list forms; + + for (std::string inflForm : split>(std::move(inflForms), ", ")) { - inflForm = inflForm.substr(0, sympos); + int sympos = inflForm.find_first_of("~> mappedForms; switch (type) { case 'V': { - if (forms.size() == 4) + if (agidForms.size() == 4) { - curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); - curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1])); - curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2])); - curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3])); - } else if (forms.size() == 3) + mappedForms[inflection::past_tense] = agidForms[0]; + mappedForms[inflection::past_participle] = agidForms[1]; + mappedForms[inflection::ing_form] = agidForms[2]; + mappedForms[inflection::s_form] = agidForms[3]; + } else if (agidForms.size() == 3) { - curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); - curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0])); - curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1])); - curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2])); - } else if (forms.size() == 8) + mappedForms[inflection::past_tense] = agidForms[0]; + mappedForms[inflection::past_participle] = agidForms[0]; + mappedForms[inflection::ing_form] = agidForms[1]; + mappedForms[inflection::s_form] = agidForms[2]; + } else if (agidForms.size() == 8) { // As of AGID 2014.08.11, this is only "to be" - curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); - curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2])); - curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3])); - curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4])); + mappedForms[inflection::past_tense] = agidForms[0]; + mappedForms[inflection::past_participle] = agidForms[2]; + mappedForms[inflection::ing_form] = agidForms[3]; + mappedForms[inflection::s_form] = agidForms[4]; } else { // Words that don't fit the cases above as of AGID 2014.08.11: // - may and shall do not conjugate the way we want them to @@ -437,7 +439,7 @@ namespace verbly { // - wit has five forms, and is archaic/obscure enough that we can ignore it for now std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; } - + // For verbs in particular, we sometimes create a notion and a word // from inflection data. Specifically, if there are not yet any // verbs existing that have the same infinitive form. "Yet" means @@ -451,84 +453,93 @@ namespace verbly { notion& n = createNotion(part_of_speech::verb); createWord(n, curLemma); } - + break; } - + case 'A': { - if (forms.size() == 2) + if (agidForms.size() == 2) { - curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0])); - curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1])); + mappedForms[inflection::comparative] = agidForms[0]; + mappedForms[inflection::superlative] = agidForms[1]; } else { // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; } - + break; } - + case 'N': { - if (forms.size() == 1) + if (agidForms.size() == 1) { - curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0])); + mappedForms[inflection::plural] = agidForms[0]; } else { // As of AGID 2014.08.11, this is non-existent. std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; } - + break; } } + + // Compile the forms we have mapped. + for (auto mapping : std::move(mappedForms)) + { + for (std::string infl : std::move(mapping.second)) + { + curLemma.addInflection(mapping.first, lookupOrCreateForm(std::move(infl))); + } + } } } - + void generator::readPrepositions() { std::list lines(readFile("prepositions.txt")); progress ppgs("Reading prepositions...", lines.size()); - + for (std::string line : lines) { ppgs.update(); - + std::regex relation("^([^:]+): (.+)"); std::smatch relation_data; std::regex_search(line, relation_data, relation); std::string prep = relation_data[1]; auto groups = split>(relation_data[2], ", "); - + notion& n = createNotion(part_of_speech::preposition); lemma& l = lookupOrCreateLemma(prep); word& w = createWord(n, l); - + n.setPrepositionGroups(groups); } } - + void generator::readCmudictPronunciations() { std::list lines(readFile(cmudictPath_)); progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); - + for (std::string line : lines) { ppgs.update(); - + std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); std::smatch phoneme_data; if (std::regex_search(line, phoneme_data, phoneme)) { std::string canonical(phoneme_data[1]); std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - + if (!formByText_.count(canonical)) { continue; } - + std::string phonemes = phoneme_data[2]; pronunciations_.emplace_back(phonemes); pronunciation& p = pronunciations_.back(); @@ -536,7 +547,7 @@ namespace verbly { } } } - + void generator::writeSchema() { std::ifstream file("schema.sql"); @@ -544,7 +555,7 @@ namespace verbly { { throw std::invalid_argument("Could not find database schema"); } - + std::ostringstream schemaBuilder; std::string line; while (std::getline(file, line)) @@ -553,10 +564,10 @@ namespace verbly { { line.pop_back(); } - + schemaBuilder << line; } - + std::string schema = schemaBuilder.str(); auto queries = split>(schema, ";"); progress ppgs("Writing database schema...", queries.size()); @@ -566,91 +577,91 @@ namespace verbly { { db_.runQuery(query); } - + ppgs.update(); } } - + void generator::dumpObjects() { { progress ppgs("Writing notions...", notions_.size()); - + for (notion& n : notions_) { db_ << n; - + ppgs.update(); } } - + { progress ppgs("Writing words...", words_.size()); - + for (word& w : words_) { db_ << w; - + ppgs.update(); } } - + { progress ppgs("Writing lemmas...", lemmas_.size()); - + for (lemma& l : lemmas_) { db_ << l; - + ppgs.update(); } } - + { progress ppgs("Writing forms...", forms_.size()); - + for (form& f : forms_) { db_ << f; - + ppgs.update(); } } - + { progress ppgs("Writing pronunciations...", pronunciations_.size()); - + for (pronunciation& p : pronunciations_) { db_ << p; - + ppgs.update(); } } - + { progress ppgs("Writing verb groups...", groups_.size()); - + for (group& g : groups_) { db_ << g; - + ppgs.update(); } } - + { progress ppgs("Writing verb frames...", frames_.size()); - + for (frame& f : frames_) { db_ << f; - + ppgs.update(); } } } - + void generator::readWordNetAntonymy() { std::list lines(readFile(wordNetPath_ + "wn_ant.pl")); @@ -658,7 +669,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -668,21 +679,21 @@ namespace verbly { std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); - + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) { word& word1 = *wordByWnidAndWnum_.at(lookup1); word& word2 = *wordByWnidAndWnum_.at(lookup2); - + std::list fields; fields.emplace_back("antonym_1_id", word1.getId()); fields.emplace_back("antonym_2_id", word2.getId()); - + db_.insertIntoTable("antonymy", std::move(fields)); } } } - + void generator::readWordNetVariation() { std::list lines(readFile(wordNetPath_ + "wn_at.pl")); @@ -690,7 +701,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -700,21 +711,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("noun_id", notion1.getId()); fields.emplace_back("adjective_id", notion2.getId()); - + db_.insertIntoTable("variation", std::move(fields)); } } } - + void generator::readWordNetClasses() { std::list lines(readFile(wordNetPath_ + "wn_cls.pl")); @@ -722,7 +733,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -733,7 +744,7 @@ namespace verbly { std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); std::string class_type = relation_data[5]; - + std::string table_name; if (class_type == "t") { @@ -745,10 +756,10 @@ namespace verbly { { table_name += "regionality"; } - + std::list leftJoin; std::list rightJoin; - + if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) { std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { @@ -757,7 +768,7 @@ namespace verbly { } else if (wordByWnidAndWnum_.count(lookup1)) { leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); } - + if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) { std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { @@ -766,7 +777,7 @@ namespace verbly { } else if (wordByWnidAndWnum_.count(lookup2)) { rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); } - + for (int word1 : leftJoin) { for (int word2 : rightJoin) @@ -774,13 +785,13 @@ namespace verbly { std::list fields; fields.emplace_back("term_id", word1); fields.emplace_back("domain_id", word2); - + db_.insertIntoTable(table_name, std::move(fields)); } } } } - + void generator::readWordNetCausality() { std::list lines(readFile(wordNetPath_ + "wn_cs.pl")); @@ -788,7 +799,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -798,21 +809,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("effect_id", notion1.getId()); fields.emplace_back("cause_id", notion2.getId()); - + db_.insertIntoTable("causality", std::move(fields)); } } } - + void generator::readWordNetEntailment() { std::list lines(readFile(wordNetPath_ + "wn_ent.pl")); @@ -820,7 +831,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -830,21 +841,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("given_id", notion1.getId()); fields.emplace_back("entailment_id", notion2.getId()); - + db_.insertIntoTable("entailment", std::move(fields)); } } } - + void generator::readWordNetHypernymy() { std::list lines(readFile(wordNetPath_ + "wn_hyp.pl")); @@ -852,7 +863,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -862,21 +873,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("hyponym_id", notion1.getId()); fields.emplace_back("hypernym_id", notion2.getId()); - + db_.insertIntoTable("hypernymy", std::move(fields)); } } } - + void generator::readWordNetInstantiation() { std::list lines(readFile(wordNetPath_ + "wn_ins.pl")); @@ -884,7 +895,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -894,21 +905,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("instance_id", notion1.getId()); fields.emplace_back("class_id", notion2.getId()); - + db_.insertIntoTable("instantiation", std::move(fields)); } } } - + void generator::readWordNetMemberMeronymy() { std::list lines(readFile(wordNetPath_ + "wn_mm.pl")); @@ -916,7 +927,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -926,21 +937,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("holonym_id", notion1.getId()); fields.emplace_back("meronym_id", notion2.getId()); - + db_.insertIntoTable("member_meronymy", std::move(fields)); } } } - + void generator::readWordNetPartMeronymy() { std::list lines(readFile(wordNetPath_ + "wn_mp.pl")); @@ -948,7 +959,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -958,21 +969,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("holonym_id", notion1.getId()); fields.emplace_back("meronym_id", notion2.getId()); - + db_.insertIntoTable("part_meronymy", std::move(fields)); } } } - + void generator::readWordNetSubstanceMeronymy() { std::list lines(readFile(wordNetPath_ + "wn_ms.pl")); @@ -980,7 +991,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -990,21 +1001,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("holonym_id", notion1.getId()); fields.emplace_back("meronym_id", notion2.getId()); - + db_.insertIntoTable("substance_meronymy", std::move(fields)); } } } - + void generator::readWordNetPertainymy() { std::list lines(readFile(wordNetPath_ + "wn_per.pl")); @@ -1012,7 +1023,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -1022,31 +1033,31 @@ namespace verbly { std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); - + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) { word& word1 = *wordByWnidAndWnum_.at(lookup1); word& word2 = *wordByWnidAndWnum_.at(lookup2); - + if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) { std::list fields; fields.emplace_back("pertainym_id", word1.getId()); fields.emplace_back("noun_id", word2.getId()); - + db_.insertIntoTable("pertainymy", std::move(fields)); } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) { std::list fields; fields.emplace_back("mannernym_id", word1.getId()); fields.emplace_back("adjective_id", word2.getId()); - + db_.insertIntoTable("mannernymy", std::move(fields)); } } } } - + void generator::readWordNetSpecification() { std::list lines(readFile(wordNetPath_ + "wn_sa.pl")); @@ -1064,21 +1075,21 @@ namespace verbly { std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); - + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) { word& word1 = *wordByWnidAndWnum_.at(lookup1); word& word2 = *wordByWnidAndWnum_.at(lookup2); - + std::list fields; fields.emplace_back("general_id", word1.getId()); fields.emplace_back("specific_id", word2.getId()); - + db_.insertIntoTable("specification", std::move(fields)); } } } - + void generator::readWordNetSimilarity() { std::list lines(readFile(wordNetPath_ + "wn_sim.pl")); @@ -1086,7 +1097,7 @@ namespace verbly { for (auto line : lines) { ppgs.update(); - + std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) @@ -1096,21 +1107,21 @@ namespace verbly { int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); - + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { notion& notion1 = *notionByWnid_.at(lookup1); notion& notion2 = *notionByWnid_.at(lookup2); - + std::list fields; fields.emplace_back("adjective_1_id", notion1.getId()); fields.emplace_back("adjective_2_id", notion2.getId()); - + db_.insertIntoTable("similarity", std::move(fields)); } } } - + std::list generator::readFile(std::string path) { std::ifstream file(path); @@ -1118,7 +1129,7 @@ namespace verbly { { throw std::invalid_argument("Could not find file " + path); } - + std::list lines; std::string line; while (std::getline(file, line)) @@ -1127,13 +1138,13 @@ namespace verbly { { line.pop_back(); } - + lines.push_back(line); } - + return lines; } - + part_of_speech generator::partOfSpeechByWnid(int wnid) { switch (wnid / 100000000) @@ -1145,14 +1156,14 @@ namespace verbly { default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); } } - + notion& generator::createNotion(part_of_speech partOfSpeech) { notions_.emplace_back(partOfSpeech); - + return notions_.back(); } - + notion& generator::lookupOrCreateNotion(int wnid) { if (!notionByWnid_.count(wnid)) @@ -1160,10 +1171,10 @@ namespace verbly { notions_.emplace_back(partOfSpeechByWnid(wnid), wnid); notionByWnid_[wnid] = ¬ions_.back(); } - + return *notionByWnid_.at(wnid); } - + lemma& generator::lookupOrCreateLemma(std::string base_form) { if (!lemmaByBaseForm_.count(base_form)) @@ -1171,10 +1182,10 @@ namespace verbly { lemmas_.emplace_back(lookupOrCreateForm(base_form)); lemmaByBaseForm_[base_form] = &lemmas_.back(); } - + return *lemmaByBaseForm_.at(base_form); } - + form& generator::lookupOrCreateForm(std::string text) { if (!formByText_.count(text)) @@ -1182,32 +1193,32 @@ namespace verbly { forms_.emplace_back(text); formByText_[text] = &forms_.back(); } - + return *formByText_[text]; } - + template word& generator::createWord(Args&&... args) { words_.emplace_back(std::forward(args)...); word& w = words_.back(); - + wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w); - + if (w.getNotion().hasWnid()) { wordsByWnid_[w.getNotion().getWnid()].insert(&w); } - + return w; } - + group& generator::createGroup(xmlNodePtr top) { groups_.emplace_back(); group& grp = groups_.back(); - + xmlChar* key; - + for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) { if (!xmlStrcmp(node->name, reinterpret_cast("SUBCLASSES"))) @@ -1223,14 +1234,14 @@ namespace verbly { } catch (const std::exception& e) { key = xmlGetProp(subclass, reinterpret_cast("ID")); - + if (key == nullptr) { std::throw_with_nested(std::logic_error("Error parsing IDless subgroup")); } else { std::string subgroupId(reinterpret_cast(key)); xmlFree(key); - + std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId)); } } @@ -1245,25 +1256,25 @@ namespace verbly { key = xmlGetProp(member, reinterpret_cast("wn")); std::string wnSenses(reinterpret_cast(key)); xmlFree(key); - + auto wnSenseKeys = split>(wnSenses, " "); if (!wnSenseKeys.empty()) { std::list tempKeys; - + std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { return sense + "::"; }); - + std::list filteredKeys; - + std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { return !wnSenseKeys_.count(sense); }); - + wnSenseKeys = std::move(filteredKeys); } - + if (!wnSenseKeys.empty()) { for (std::string sense : wnSenseKeys) @@ -1275,11 +1286,11 @@ namespace verbly { key = xmlGetProp(member, reinterpret_cast("name")); std::string memberName(reinterpret_cast(key)); xmlFree(key); - + notion& n = createNotion(part_of_speech::verb); lemma& l = lookupOrCreateLemma(memberName); word& w = createWord(n, l); - + w.setVerbGroup(grp); } } @@ -1293,7 +1304,7 @@ namespace verbly { key = xmlGetProp(roletopnode, reinterpret_cast("type")); std::string roleName = reinterpret_cast(key); xmlFree(key); - + selrestr roleSelrestrs; for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) { @@ -1314,19 +1325,19 @@ namespace verbly { { frames_.emplace_back(); frame& fr = frames_.back(); - + for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) { if (!xmlStrcmp(framenode->name, reinterpret_cast("SYNTAX"))) { for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) - { + { if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("NP"))) { key = xmlGetProp(syntaxnode, reinterpret_cast("value")); std::string partRole = reinterpret_cast(key); xmlFree(key); - + selrestr partSelrestrs; std::set partSynrestrs; @@ -1344,13 +1355,13 @@ namespace verbly { } } } - + if (!xmlStrcmp(npnode->name, reinterpret_cast("SELRESTRS"))) { partSelrestrs = parseSelrestr(npnode); } } - + fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs))); } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("VERB"))) { @@ -1359,11 +1370,11 @@ namespace verbly { { std::set partChoices; bool partLiteral; - + if (xmlHasProp(syntaxnode, reinterpret_cast("value"))) { partLiteral = true; - + key = xmlGetProp(syntaxnode, reinterpret_cast("value")); std::string choicesStr = reinterpret_cast(key); xmlFree(key); @@ -1380,7 +1391,7 @@ namespace verbly { } } else { partLiteral = false; - + for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) { if (!xmlStrcmp(npnode->name, reinterpret_cast("SELRESTRS"))) @@ -1397,7 +1408,7 @@ namespace verbly { } } } - + fr.push_back(part::createPreposition(std::move(partChoices), partLiteral)); } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("ADJ"))) { @@ -1410,7 +1421,7 @@ namespace verbly { key = xmlGetProp(syntaxnode, reinterpret_cast("value")); std::string literalValue = reinterpret_cast(key); xmlFree(key); - + fr.push_back(part::createLiteral(literalValue)); } else { continue; @@ -1427,11 +1438,11 @@ namespace verbly { return grp; } - + selrestr generator::parseSelrestr(xmlNodePtr top) { xmlChar* key; - + if (!xmlStrcmp(top->name, reinterpret_cast("SELRESTRS"))) { if (xmlChildElementCount(top) == 0) @@ -1449,10 +1460,10 @@ namespace verbly { { orlogic = true; } - + xmlFree(key); } - + std::list children; for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) { @@ -1462,7 +1473,7 @@ namespace verbly { children.push_back(parseSelrestr(selrestr)); } } - + return selrestr(children, orlogic); } } else if (!xmlStrcmp(top->name, reinterpret_cast("SELRESTR"))) @@ -1474,12 +1485,12 @@ namespace verbly { key = xmlGetProp(top, reinterpret_cast("type")); std::string selRestriction = reinterpret_cast(key); xmlFree(key); - + return selrestr(selRestriction, selPos); } else { throw std::logic_error("Badly formatted selrestr"); } } - + }; }; -- cgit 1.4.1