From 8b1333d0e6e2b9a5014bdbff2987d899f5413fee Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Wed, 16 Mar 2016 21:35:35 -0400 Subject: Added word derivational relationships (kind of eh at the moment) and moved verbly into its own directory --- generator.cpp | 1520 --------------------------------------------------------- 1 file changed, 1520 deletions(-) delete mode 100644 generator.cpp (limited to 'generator.cpp') diff --git a/generator.cpp b/generator.cpp deleted file mode 100644 index 305d121..0000000 --- a/generator.cpp +++ /dev/null @@ -1,1520 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "progress.h" - -struct verb { - std::string infinitive; - std::string past_tense; - std::string past_participle; - std::string ing_form; - std::string s_form; -}; - -struct adjective { - std::string base; - std::string comparative; - std::string superlative; -}; - -struct noun { - std::string singular; - std::string plural; -}; - -struct group { - std::string id; - std::set members; -}; - -std::map groups; -std::map verbs; -std::map adjectives; -std::map nouns; -std::map> wn; -std::map> pronunciations; - -void print_usage() -{ - std::cout << "Verbly Datafile Generator" << std::endl; - std::cout << "-------------------------" << std::endl; - std::cout << "Requires exactly six arguments." << std::endl; - std::cout << "1. The path to a VerbNet data directory." << std::endl; - std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; - std::cout << "3. The path to an AGID infl.txt file." << std::endl; - std::cout << "4. The path to a WordNet prolog data directory." << std::endl; - std::cout << "5. The path to a CMUDICT pronunciation file." << std::endl; - std::cout << "6. Datafile output path." << std::endl; - - exit(1); -} - -void db_error(sqlite3* ppdb, std::string) -{ - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); -} - -/* -void parse_group(xmlNodePtr top, std::string filename) -{ - xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); - if (key == 0) - { - std::cout << "Bad VerbNet file format: " << filename << std::endl; - print_usage(); - } - std::string vnid = key; - vnid = vnid.substr(vnid.find_first_of("-")+1); - xmlFree(key); - - group g; - g.id = vnid; - - for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) - { - if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) - { - for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) - { - if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) - { - key = xmlGetProp(member, (xmlChar*) "name"); - g.members.insert(key); - xmlFree(key); - } - } - } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) - { - for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) - { - if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) - { - for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) - { - - } - } - } - } - } -}*/ - -int main(int argc, char** argv) -{ - if (argc != 7) - { - print_usage(); - } - - /*DIR* dir; - if ((dir = opendir(argv[1])) == nullptr) - { - std::cout << "Invalid VerbNet data directory." << std::endl; - - print_usage(); - } - - struct dirent* ent; - while ((ent = readdir(dir)) != nullptr) - { - std::string filename(argv[1]); - if (filename.back() != '/') - { - filename += '/'; - } - - filename += ent->d_name; - //std::cout << ent->d_name << std::endl; - - if (filename.rfind(".xml") != filename.size() - 4) - { - continue; - } - - xmlDocPtr doc = xmlParseFile(filename.c_str()); - if (doc == nullptr) - { - std::cout << "Error opening " << filename << std::endl; - print_usage(); - } - - xmlNodePtr top = xmlDocGetRootElement(doc); - if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) - { - std::cout << "Bad VerbNet file format: " << filename << std::endl; - print_usage(); - } - - parse_group(top, filename); - } - - closedir(dir);*/ - - // Get verbs from AGID - std::cout << "Reading inflections..." << std::endl; - - std::ifstream agidfile(argv[3]); - if (!agidfile.is_open()) - { - std::cout << "Could not open AGID file: " << argv[3] << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(agidfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - int divider = line.find_first_of(" "); - std::string word = line.substr(0, divider); - line = line.substr(divider+1); - char type = line[0]; - - if (line[1] == '?') - { - line.erase(0, 4); - } else { - line.erase(0, 3); - } - - std::vector forms; - while (!line.empty()) - { - std::string inflection; - if ((divider = line.find(" | ")) != std::string::npos) - { - inflection = line.substr(0, divider); - line = line.substr(divider + 3); - } else { - inflection = line; - line = ""; - } - - if ((divider = inflection.find_first_of(",?")) != std::string::npos) - { - inflection = inflection.substr(0, divider); - } - - forms.push_back(inflection); - } - - switch (type) - { - case 'V': - { - verb v; - v.infinitive = word; - if (forms.size() == 4) - { - v.past_tense = forms[0]; - v.past_participle = forms[1]; - v.ing_form = forms[2]; - v.s_form = forms[3]; - } else if (forms.size() == 3) - { - v.past_tense = forms[0]; - v.past_participle = forms[0]; - v.ing_form = forms[1]; - v.s_form = forms[2]; - } else if (forms.size() == 8) - { - // As of AGID 2014.08.11, this is only "to be" - v.past_tense = forms[0]; - v.past_participle = forms[2]; - v.ing_form = forms[3]; - v.s_form = forms[4]; - } else { - // Words that don't fit the cases above as of AGID 2014.08.11: - // - may and shall do not conjugate the way we want them to - // - methinks only has a past tense and is an outlier - // - wit has five forms, and is archaic/obscure enough that we can ignore it for now - std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - verbs[word] = v; - - break; - } - - case 'A': - { - adjective adj; - adj.base = word; - if (forms.size() == 2) - { - adj.comparative = forms[0]; - adj.superlative = forms[1]; - } else { - // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" - std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - adjectives[word] = adj; - - break; - } - - case 'N': - { - noun n; - n.singular = word; - if (forms.size() == 1) - { - n.plural = forms[0]; - } else { - // As of AGID 2014.08.11, this is non-existent. - std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - nouns[word] = n; - - break; - } - } - } - - // Pronounciations - std::cout << "Reading pronunciations..." << std::endl; - - std::ifstream pronfile(argv[5]); - if (!pronfile.is_open()) - { - std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(pronfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); - std::smatch phoneme_data; - if (std::regex_search(line, phoneme_data, phoneme)) - { - std::string canonical(phoneme_data[1]); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - - pronunciations[canonical].insert(phoneme_data[2]); - } - } - - // Start writing output - std::cout << "Writing schema..." << std::endl; - - sqlite3* ppdb; - if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) - { - std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; - print_usage(); - } - - std::ifstream schemafile("schema.sql"); - if (!schemafile.is_open()) - { - std::cout << "Could not find schema file" << std::endl; - print_usage(); - } - - std::stringstream schemabuilder; - for (;;) - { - std::string line; - if (!getline(schemafile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - schemabuilder << line << std::endl; - } - - std::string schema = schemabuilder.str(); - while (!schema.empty()) - { - std::string query; - int divider = schema.find(";"); - if (divider != std::string::npos) - { - query = schema.substr(0, divider+1); - schema = schema.substr(divider+2); - } else { - break; - } - - sqlite3_stmt* schmstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(schmstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(schmstmt); - } - - { - progress ppgs("Writing verbs...", verbs.size()); - for (auto& mapping : verbs) - { - sqlite3_stmt* ppstmt; - std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - - std::string canonical(mapping.second.infinitive); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - if (pronunciations.count(canonical) == 1) - { - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(ppstmt) != SQLITE_ROW) - { - db_error(ppdb, query); - } - - int rowid = sqlite3_column_int(ppstmt, 0); - - sqlite3_finalize(ppstmt); - - for (auto pronunciation : pronunciations[canonical]) - { - query = "INSERT INTO verb_pronunciations (verb_id, pronunciation) VALUES (?, ?)"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, rowid); - sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - - ppgs.update(); - } - } - - // Get nouns/adjectives/adverbs from WordNet - // Useful relations: - // - s: master list - // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) - // - at: variation (e.g. a measurement can be standard or nonstandard) - // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) - // - ins: instantiation (do we need this? let's see) - // - mm: member meronymy/holonymy (e.g. family/mother, family/child) - // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) - // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) - // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) - // mannernymy (e.g. something done quickly is done in a manner that is quick) - // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) - // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) - // - syntax: positioning flags for some adjectives - std::string wnpref {argv[4]}; - if (wnpref.back() != '/') - { - wnpref += '/'; - } - - // s table - { - std::ifstream wnsfile(wnpref + "wn_s.pl"); - if (!wnsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnsfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',"); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id = stoi(relation_data[1]); - int wnum = stoi(relation_data[2]); - std::string word = relation_data[3]; - - std::string query; - switch (synset_id / 100000000) - { - case 1: // Noun - { - if (nouns.count(word) == 1) - { - query = "INSERT INTO nouns (singular, plural) VALUES (?, ?)"; - } else { - query = "INSERT INTO nouns (singular) VALUES (?)"; - } - - break; - } - - case 2: // Verb - { - // Ignore - - break; - } - - case 3: // Adjective - { - if (adjectives.count(word) == 1) - { - query = "INSERT INTO adjectives (base_form, comparative, superlative) VALUES (?, ?, ?)"; - } else { - query = "INSERT INTO adjectives (base_form) VALUES (?)"; - } - - break; - } - - case 4: // Adverb - { - if (adjectives.count(word) == 1) - { - query = "INSERT INTO adverbs (base_form, comparative, superlative) VALUES (?, ?, ?)"; - } else { - query = "INSERT INTO adverbs (base_form) VALUES (?)"; - } - - break; - } - } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC); - switch (synset_id / 100000000) - { - case 1: // Noun - { - if (nouns.count(word) == 1) - { - sqlite3_bind_text(ppstmt, 2, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); - } - - break; - } - - case 3: // Adjective - case 4: // Adverb - { - if (adjectives.count(word) == 1) - { - sqlite3_bind_text(ppstmt, 2, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 3, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_STATIC); - } - - break; - } - } - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(ppstmt) != SQLITE_ROW) - { - db_error(ppdb, query); - } - - int rowid = sqlite3_column_int(ppstmt, 0); - wn[synset_id][wnum] = rowid; - - sqlite3_finalize(ppstmt); - - std::string canonical(word); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - if (pronunciations.count(canonical) == 1) - { - for (auto pronunciation : pronunciations[canonical]) - { - switch (synset_id / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_pronunciations (noun_id, pronunciation) VALUES (?, ?)"; - - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation) VALUES (?, ?)"; - - break; - } - - case 4: // Adverb - { - query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation) VALUES (?, ?)"; - - break; - } - } - - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, rowid); - sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - - // While we're working on s - { - progress ppgs("Writing word synonyms...", wn.size()); - for (auto sense : wn) - { - ppgs.update(); - - for (auto word1 : sense.second) - { - for (auto word2 : sense.second) - { - if (word1 != word2) - { - std::string query; - switch (sense.first / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; - - break; - } - - case 2: // Verb - { - // Ignore - - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; - - break; - } - - case 4: // Adverb - { - query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; - - break; - } - } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, word1.second); - sqlite3_bind_int(ppstmt, 2, word2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - } - - // ant table - { - std::ifstream wnantfile(wnpref + "wn_ant.pl"); - if (!wnantfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnantfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing antonyms...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - - std::string query; - switch (synset_id_1 / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; - - break; - } - - case 2: // Verb - { - // Ignore - - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; - - break; - } - - case 4: // Adverb - { - query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; - - break; - } - } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - - // at table - { - std::ifstream wnatfile(wnpref + "wn_at.pl"); - if (!wnatfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnatfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing variations...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) - { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - - // hyp table - { - std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); - if (!wnhypfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnhypfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing hypernyms...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) - { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - - // ins table - { - std::ifstream wninsfile(wnpref + "wn_ins.pl"); - if (!wninsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wninsfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing instantiations...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) - { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - - // mm table - { - std::ifstream wnmmfile(wnpref + "wn_mm.pl"); - if (!wnmmfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnmmfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing member meronyms...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) - { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - - // ms table - { - std::ifstream wnmsfile(wnpref + "wn_ms.pl"); - if (!wnmsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnmsfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing substance meronyms...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) - { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - - // mm table - { - std::ifstream wnmpfile(wnpref + "wn_mp.pl"); - if (!wnmpfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnmpfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing part meronyms...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) - { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - - // per table - { - std::ifstream wnperfile(wnpref + "wn_per.pl"); - if (!wnperfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnperfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing pertainyms and mannernyms...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - std::string query; - switch (synset_id_1 / 100000000) - { - case 3: // Adjective - { - // This is a pertainym, the second word should be a noun - // Technically it can be an adjective but we're ignoring that - if (synset_id_2 / 100000000 != 1) - { - continue; - } - - query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; - - break; - } - - case 4: // Adverb - { - // This is a mannernym, the second word should be an adjective - if (synset_id_2 / 100000000 != 3) - { - continue; - } - - query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; - - break; - } - } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - - // sa table - { - std::ifstream wnsafile(wnpref + "wn_sa.pl"); - if (!wnsafile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnsafile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing specifications...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - /* - // sim table - { - std::ifstream wnsimfile(wnpref + "wn_sim.pl"); - if (!wnsimfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnsimfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing sense synonyms...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) - { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_reset(ppstmt); - sqlite3_clear_bindings(ppstmt); - - sqlite3_bind_int(ppstmt, 1, mapping2.second); - sqlite3_bind_int(ppstmt, 2, mapping1.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - } - } - */ - // syntax table - { - std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); - if (!wnsyntaxfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnsyntaxfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); - } - - progress ppgs("Writing adjective syntax markers...", lines.size()); - for (auto line : lines) - { - ppgs.update(); - - std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id = stoi(relation_data[1]); - int wnum = stoi(relation_data[2]); - std::string syn = relation_data[3]; - std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_STATIC); - sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - - sqlite3_close_v2(ppdb); - - std::cout << "Done." << std::endl; -} -- cgit 1.4.1