From 3aceae8ab1eb5992110ea57a9479bbc3177feb21 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Wed, 16 Mar 2016 11:27:16 -0400 Subject: Added more inflections, word relationships, and pronunciations Nouns, adjectives, and adverbs now have inflected forms. A large number of WordNet word relationships (all noun-noun relationships, plus synonymy and antonymy for all word types except verbs) have been added. Additionally, CMUDICT is now being used to store word pronunciations for rhyming purposes. Verbly is now also a compiled library rather than being header-only due to the complexity of the query interface. --- generator.cpp | 1341 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 1205 insertions(+), 136 deletions(-) (limited to 'generator.cpp') diff --git a/generator.cpp b/generator.cpp index c389963..305d121 100644 --- a/generator.cpp +++ b/generator.cpp @@ -9,6 +9,8 @@ #include #include #include +#include +#include "progress.h" struct verb { std::string infinitive; @@ -18,6 +20,17 @@ struct verb { std::string s_form; }; +struct adjective { + std::string base; + std::string comparative; + std::string superlative; +}; + +struct noun { + std::string singular; + std::string plural; +}; + struct group { std::string id; std::set members; @@ -25,21 +38,33 @@ struct group { std::map groups; std::map verbs; +std::map adjectives; +std::map nouns; std::map> wn; +std::map> pronunciations; void print_usage() { std::cout << "Verbly Datafile Generator" << std::endl; std::cout << "-------------------------" << std::endl; - std::cout << "Requires exactly four arguments." << std::endl; + std::cout << "Requires exactly six arguments." << std::endl; std::cout << "1. The path to a VerbNet data directory." << std::endl; std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; std::cout << "3. The path to an AGID infl.txt file." << std::endl; std::cout << "4. The path to a WordNet prolog data directory." << std::endl; - std::cout << "5. Datafile output path." << std::endl; + std::cout << "5. The path to a CMUDICT pronunciation file." << std::endl; + std::cout << "6. Datafile output path." << std::endl; exit(1); } + +void db_error(sqlite3* ppdb, std::string) +{ + std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; + sqlite3_close_v2(ppdb); + print_usage(); +} + /* void parse_group(xmlNodePtr top, std::string filename) { @@ -87,7 +112,7 @@ void parse_group(xmlNodePtr top, std::string filename) int main(int argc, char** argv) { - if (argc != 6) + if (argc != 7) { print_usage(); } @@ -137,7 +162,7 @@ int main(int argc, char** argv) closedir(dir);*/ // Get verbs from AGID - std::cout << "Reading verb inflection..." << std::endl; + std::cout << "Reading inflections..." << std::endl; std::ifstream agidfile(argv[3]); if (!agidfile.is_open()) @@ -162,11 +187,7 @@ int main(int argc, char** argv) int divider = line.find_first_of(" "); std::string word = line.substr(0, divider); line = line.substr(divider+1); - - if (line[0] != 'V') - { - continue; - } + char type = line[0]; if (line[1] == '?') { @@ -174,7 +195,7 @@ int main(int argc, char** argv) } else { line.erase(0, 3); } - + std::vector forms; while (!line.empty()) { @@ -187,52 +208,129 @@ int main(int argc, char** argv) inflection = line; line = ""; } - + if ((divider = inflection.find_first_of(",?")) != std::string::npos) { inflection = inflection.substr(0, divider); } - + forms.push_back(inflection); } - verb v; - v.infinitive = word; - if (forms.size() == 4) + switch (type) { - v.past_tense = forms[0]; - v.past_participle = forms[1]; - v.ing_form = forms[2]; - v.s_form = forms[3]; - } else if (forms.size() == 3) + case 'V': + { + verb v; + v.infinitive = word; + if (forms.size() == 4) + { + v.past_tense = forms[0]; + v.past_participle = forms[1]; + v.ing_form = forms[2]; + v.s_form = forms[3]; + } else if (forms.size() == 3) + { + v.past_tense = forms[0]; + v.past_participle = forms[0]; + v.ing_form = forms[1]; + v.s_form = forms[2]; + } else if (forms.size() == 8) + { + // As of AGID 2014.08.11, this is only "to be" + v.past_tense = forms[0]; + v.past_participle = forms[2]; + v.ing_form = forms[3]; + v.s_form = forms[4]; + } else { + // Words that don't fit the cases above as of AGID 2014.08.11: + // - may and shall do not conjugate the way we want them to + // - methinks only has a past tense and is an outlier + // - wit has five forms, and is archaic/obscure enough that we can ignore it for now + std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; + } + + verbs[word] = v; + + break; + } + + case 'A': + { + adjective adj; + adj.base = word; + if (forms.size() == 2) + { + adj.comparative = forms[0]; + adj.superlative = forms[1]; + } else { + // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" + std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; + } + + adjectives[word] = adj; + + break; + } + + case 'N': + { + noun n; + n.singular = word; + if (forms.size() == 1) + { + n.plural = forms[0]; + } else { + // As of AGID 2014.08.11, this is non-existent. + std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; + } + + nouns[word] = n; + + break; + } + } + } + + // Pronounciations + std::cout << "Reading pronunciations..." << std::endl; + + std::ifstream pronfile(argv[5]); + if (!pronfile.is_open()) + { + std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl; + print_usage(); + } + + for (;;) + { + std::string line; + if (!getline(pronfile, line)) { - v.past_tense = forms[0]; - v.past_participle = forms[0]; - v.ing_form = forms[1]; - v.s_form = forms[2]; - } else if (forms.size() == 8) + break; + } + + if (line.back() == '\r') { - // As of AGID 2014.08.11, this is only "to be" - v.past_tense = forms[0]; - v.past_participle = forms[2]; - v.ing_form = forms[3]; - v.s_form = forms[4]; - } else { - // Words that don't fit the cases above as of AGID 2014.08.11: - // - may and shall do not conjugate the way we want them to - // - methinks only has a past tense and is an outlier - // - wit has five forms, and is archaic/obscure enough that we can ignore it for now - std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; + line.pop_back(); } - verbs[word] = v; + std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); + std::smatch phoneme_data; + if (std::regex_search(line, phoneme_data, phoneme)) + { + std::string canonical(phoneme_data[1]); + std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); + + pronunciations[canonical].insert(phoneme_data[2]); + } } // Start writing output - std::cout << "Writing output..." << std::endl; + std::cout << "Writing schema..." << std::endl; sqlite3* ppdb; - if (sqlite3_open_v2(argv[5], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) + if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) { std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; print_usage(); @@ -278,47 +376,82 @@ int main(int argc, char** argv) sqlite3_stmt* schmstmt; if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) { - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); + db_error(ppdb, query); } if (sqlite3_step(schmstmt) != SQLITE_DONE) { - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); + db_error(ppdb, query); } sqlite3_finalize(schmstmt); } - std::cout << "Writing verbs..." << std::endl; - for (auto& mapping : verbs) { - sqlite3_stmt* ppstmt; - std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + progress ppgs("Writing verbs...", verbs.size()); + for (auto& mapping : verbs) { - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); - } + sqlite3_stmt* ppstmt; + std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } - sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC); - sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); - } + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + + std::string canonical(mapping.second.infinitive); + std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); + if (pronunciations.count(canonical) == 1) + { + query = "SELECT last_insert_rowid()"; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } - sqlite3_finalize(ppstmt); + if (sqlite3_step(ppstmt) != SQLITE_ROW) + { + db_error(ppdb, query); + } + + int rowid = sqlite3_column_int(ppstmt, 0); + + sqlite3_finalize(ppstmt); + + for (auto pronunciation : pronunciations[canonical]) + { + query = "INSERT INTO verb_pronunciations (verb_id, pronunciation) VALUES (?, ?)"; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, rowid); + sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + + ppgs.update(); + } } // Get nouns/adjectives/adverbs from WordNet @@ -342,110 +475,1046 @@ int main(int argc, char** argv) wnpref += '/'; } - std::cout << "Reading words from WordNet..." << std::endl; - std::ifstream wnsfile(wnpref + "wn_s.pl"); - if (!wnsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - for (;;) + // s table { - std::string line; - if (!getline(wnsfile, line)) + std::ifstream wnsfile(wnpref + "wn_s.pl"); + if (!wnsfile.is_open()) { - break; + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); } - - if (line.back() == '\r') + + std::list lines; + for (;;) { - line.pop_back(); + std::string line; + if (!getline(wnsfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); } - std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',"); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); + for (auto line : lines) { - continue; - } + ppgs.update(); + + std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',"); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } - int synset_id = stoi(relation_data[1]); - int wnum = stoi(relation_data[2]); - std::string word = relation_data[3]; + int synset_id = stoi(relation_data[1]); + int wnum = stoi(relation_data[2]); + std::string word = relation_data[3]; - std::string query; - switch (synset_id / 100000000) - { - case 1: // Noun + std::string query; + switch (synset_id / 100000000) { - query = "INSERT INTO nouns (form) VALUES (?)"; + case 1: // Noun + { + if (nouns.count(word) == 1) + { + query = "INSERT INTO nouns (singular, plural) VALUES (?, ?)"; + } else { + query = "INSERT INTO nouns (singular) VALUES (?)"; + } - break; - } + break; + } + + case 2: // Verb + { + // Ignore + + break; + } + + case 3: // Adjective + { + if (adjectives.count(word) == 1) + { + query = "INSERT INTO adjectives (base_form, comparative, superlative) VALUES (?, ?, ?)"; + } else { + query = "INSERT INTO adjectives (base_form) VALUES (?)"; + } + + break; + } - case 2: // Verb + case 4: // Adverb + { + if (adjectives.count(word) == 1) + { + query = "INSERT INTO adverbs (base_form, comparative, superlative) VALUES (?, ?, ?)"; + } else { + query = "INSERT INTO adverbs (base_form) VALUES (?)"; + } + + break; + } + } + + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC); + switch (synset_id / 100000000) { - // Ignore + case 1: // Noun + { + if (nouns.count(word) == 1) + { + sqlite3_bind_text(ppstmt, 2, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); + } + + break; + } - break; + case 3: // Adjective + case 4: // Adverb + { + if (adjectives.count(word) == 1) + { + sqlite3_bind_text(ppstmt, 2, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_STATIC); + sqlite3_bind_text(ppstmt, 3, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_STATIC); + } + + break; + } + } + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + + query = "SELECT last_insert_rowid()"; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + if (sqlite3_step(ppstmt) != SQLITE_ROW) + { + db_error(ppdb, query); } + + int rowid = sqlite3_column_int(ppstmt, 0); + wn[synset_id][wnum] = rowid; + + sqlite3_finalize(ppstmt); - case 3: // Adjective + std::string canonical(word); + std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); + if (pronunciations.count(canonical) == 1) { - query = "INSERT INTO adjectives (form) VALUES (?)"; + for (auto pronunciation : pronunciations[canonical]) + { + switch (synset_id / 100000000) + { + case 1: // Noun + { + query = "INSERT INTO noun_pronunciations (noun_id, pronunciation) VALUES (?, ?)"; + + break; + } + + case 3: // Adjective + { + query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation) VALUES (?, ?)"; + + break; + } + + case 4: // Adverb + { + query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation) VALUES (?, ?)"; + + break; + } + } + + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } - break; + sqlite3_bind_int(ppstmt, 1, rowid); + sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } } + } + } + + // While we're working on s + { + progress ppgs("Writing word synonyms...", wn.size()); + for (auto sense : wn) + { + ppgs.update(); - case 4: // Adverb + for (auto word1 : sense.second) { - query = "INSERT INTO adverbs (form) VALUES (?)"; + for (auto word2 : sense.second) + { + if (word1 != word2) + { + std::string query; + switch (sense.first / 100000000) + { + case 1: // Noun + { + query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; - break; + break; + } + + case 2: // Verb + { + // Ignore + + break; + } + + case 3: // Adjective + { + query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; + + break; + } + + case 4: // Adverb + { + query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; + + break; + } + } + + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, word1.second); + sqlite3_bind_int(ppstmt, 2, word2.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } } } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + } + + // ant table + { + std::ifstream wnantfile(wnpref + "wn_ant.pl"); + if (!wnantfile.is_open()) { - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); + std::cout << "Invalid WordNet data directory." << std::endl; print_usage(); } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnantfile, line)) + { + break; + } - sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC); + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } - if (sqlite3_step(ppstmt) != SQLITE_DONE) + progress ppgs("Writing antonyms...", lines.size()); + for (auto line : lines) { - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); - } + ppgs.update(); + + std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } - sqlite3_finalize(ppstmt); + int synset_id_1 = stoi(relation_data[1]); + int wnum_1 = stoi(relation_data[2]); + int synset_id_2 = stoi(relation_data[3]); + int wnum_2 = stoi(relation_data[4]); - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); - } + std::string query; + switch (synset_id_1 / 100000000) + { + case 1: // Noun + { + query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; + + break; + } + + case 2: // Verb + { + // Ignore + + break; + } + + case 3: // Adjective + { + query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; + + break; + } + + case 4: // Adverb + { + query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; + + break; + } + } - if (sqlite3_step(ppstmt) != SQLITE_ROW) - { - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); - } + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } - wn[synset_id][wnum] = sqlite3_column_int(ppstmt, 0); + sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); + sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); - sqlite3_finalize(ppstmt); + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } } - sqlite3_close_v2(ppdb); - - std::cout << "Done." << std::endl; -} \ No newline at end of file + // at table + { + std::ifstream wnatfile(wnpref + "wn_at.pl"); + if (!wnatfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnatfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing variations...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int synset_id_2 = stoi(relation_data[2]); + std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); + + for (auto mapping1 : wn[synset_id_1]) + { + for (auto mapping2 : wn[synset_id_2]) + { + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, mapping1.second); + sqlite3_bind_int(ppstmt, 2, mapping2.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + } + } + + // hyp table + { + std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); + if (!wnhypfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnhypfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing hypernyms...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int synset_id_2 = stoi(relation_data[2]); + std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); + + for (auto mapping1 : wn[synset_id_1]) + { + for (auto mapping2 : wn[synset_id_2]) + { + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, mapping1.second); + sqlite3_bind_int(ppstmt, 2, mapping2.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + } + } + + // ins table + { + std::ifstream wninsfile(wnpref + "wn_ins.pl"); + if (!wninsfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wninsfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing instantiations...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int synset_id_2 = stoi(relation_data[2]); + std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); + + for (auto mapping1 : wn[synset_id_1]) + { + for (auto mapping2 : wn[synset_id_2]) + { + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, mapping1.second); + sqlite3_bind_int(ppstmt, 2, mapping2.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + } + } + + // mm table + { + std::ifstream wnmmfile(wnpref + "wn_mm.pl"); + if (!wnmmfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnmmfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing member meronyms...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int synset_id_2 = stoi(relation_data[2]); + std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + + for (auto mapping1 : wn[synset_id_1]) + { + for (auto mapping2 : wn[synset_id_2]) + { + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, mapping1.second); + sqlite3_bind_int(ppstmt, 2, mapping2.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + } + } + + // ms table + { + std::ifstream wnmsfile(wnpref + "wn_ms.pl"); + if (!wnmsfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnmsfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing substance meronyms...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int synset_id_2 = stoi(relation_data[2]); + std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + + for (auto mapping1 : wn[synset_id_1]) + { + for (auto mapping2 : wn[synset_id_2]) + { + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, mapping1.second); + sqlite3_bind_int(ppstmt, 2, mapping2.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + } + } + + // mm table + { + std::ifstream wnmpfile(wnpref + "wn_mp.pl"); + if (!wnmpfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnmpfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing part meronyms...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int synset_id_2 = stoi(relation_data[2]); + std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + + for (auto mapping1 : wn[synset_id_1]) + { + for (auto mapping2 : wn[synset_id_2]) + { + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, mapping1.second); + sqlite3_bind_int(ppstmt, 2, mapping2.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + } + } + + // per table + { + std::ifstream wnperfile(wnpref + "wn_per.pl"); + if (!wnperfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnperfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing pertainyms and mannernyms...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int wnum_1 = stoi(relation_data[2]); + int synset_id_2 = stoi(relation_data[3]); + int wnum_2 = stoi(relation_data[4]); + std::string query; + switch (synset_id_1 / 100000000) + { + case 3: // Adjective + { + // This is a pertainym, the second word should be a noun + // Technically it can be an adjective but we're ignoring that + if (synset_id_2 / 100000000 != 1) + { + continue; + } + + query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; + + break; + } + + case 4: // Adverb + { + // This is a mannernym, the second word should be an adjective + if (synset_id_2 / 100000000 != 3) + { + continue; + } + + query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; + + break; + } + } + + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); + sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + + // sa table + { + std::ifstream wnsafile(wnpref + "wn_sa.pl"); + if (!wnsafile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnsafile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing specifications...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int wnum_1 = stoi(relation_data[2]); + int synset_id_2 = stoi(relation_data[3]); + int wnum_2 = stoi(relation_data[4]); + std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); + + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); + sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + /* + // sim table + { + std::ifstream wnsimfile(wnpref + "wn_sim.pl"); + if (!wnsimfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnsimfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing sense synonyms...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id_1 = stoi(relation_data[1]); + int synset_id_2 = stoi(relation_data[2]); + std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); + + for (auto mapping1 : wn[synset_id_1]) + { + for (auto mapping2 : wn[synset_id_2]) + { + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_int(ppstmt, 1, mapping1.second); + sqlite3_bind_int(ppstmt, 2, mapping2.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_reset(ppstmt); + sqlite3_clear_bindings(ppstmt); + + sqlite3_bind_int(ppstmt, 1, mapping2.second); + sqlite3_bind_int(ppstmt, 2, mapping1.second); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + } + } + */ + // syntax table + { + std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); + if (!wnsyntaxfile.is_open()) + { + std::cout << "Invalid WordNet data directory." << std::endl; + print_usage(); + } + + std::list lines; + for (;;) + { + std::string line; + if (!getline(wnsyntaxfile, line)) + { + break; + } + + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); + } + + progress ppgs("Writing adjective syntax markers...", lines.size()); + for (auto line : lines) + { + ppgs.update(); + + std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id = stoi(relation_data[1]); + int wnum = stoi(relation_data[2]); + std::string syn = relation_data[3]; + std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); + + sqlite3_stmt* ppstmt; + if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + { + db_error(ppdb, query); + } + + sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_STATIC); + sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); + + if (sqlite3_step(ppstmt) != SQLITE_DONE) + { + db_error(ppdb, query); + } + + sqlite3_finalize(ppstmt); + } + } + + sqlite3_close_v2(ppdb); + + std::cout << "Done." << std::endl; +} -- cgit 1.4.1