diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-03-16 11:27:16 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-03-16 11:27:16 -0400 |
commit | 3aceae8ab1eb5992110ea57a9479bbc3177feb21 (patch) | |
tree | 13167a266805344efb7bb1d900486f782c23285e /generator.cpp | |
parent | e1be2716746e75cf6ed37e86461a7f580a964564 (diff) | |
download | furries-3aceae8ab1eb5992110ea57a9479bbc3177feb21.tar.gz furries-3aceae8ab1eb5992110ea57a9479bbc3177feb21.tar.bz2 furries-3aceae8ab1eb5992110ea57a9479bbc3177feb21.zip |
Added more inflections, word relationships, and pronunciations
Nouns, adjectives, and adverbs now have inflected forms. A large number of WordNet word relationships (all noun-noun relationships, plus synonymy and antonymy for all word types except verbs) have been added. Additionally, CMUDICT is now being used to store word pronunciations for rhyming purposes. Verbly is now also a compiled library rather than being header-only due to the complexity of the query interface.
Diffstat (limited to 'generator.cpp')
-rw-r--r-- | generator.cpp | 1303 |
1 files changed, 1186 insertions, 117 deletions
diff --git a/generator.cpp b/generator.cpp index c389963..305d121 100644 --- a/generator.cpp +++ b/generator.cpp | |||
@@ -9,6 +9,8 @@ | |||
9 | #include <sqlite3.h> | 9 | #include <sqlite3.h> |
10 | #include <sstream> | 10 | #include <sstream> |
11 | #include <regex> | 11 | #include <regex> |
12 | #include <list> | ||
13 | #include "progress.h" | ||
12 | 14 | ||
13 | struct verb { | 15 | struct verb { |
14 | std::string infinitive; | 16 | std::string infinitive; |
@@ -18,6 +20,17 @@ struct verb { | |||
18 | std::string s_form; | 20 | std::string s_form; |
19 | }; | 21 | }; |
20 | 22 | ||
23 | struct adjective { | ||
24 | std::string base; | ||
25 | std::string comparative; | ||
26 | std::string superlative; | ||
27 | }; | ||
28 | |||
29 | struct noun { | ||
30 | std::string singular; | ||
31 | std::string plural; | ||
32 | }; | ||
33 | |||
21 | struct group { | 34 | struct group { |
22 | std::string id; | 35 | std::string id; |
23 | std::set<std::string> members; | 36 | std::set<std::string> members; |
@@ -25,21 +38,33 @@ struct group { | |||
25 | 38 | ||
26 | std::map<std::string, group> groups; | 39 | std::map<std::string, group> groups; |
27 | std::map<std::string, verb> verbs; | 40 | std::map<std::string, verb> verbs; |
41 | std::map<std::string, adjective> adjectives; | ||
42 | std::map<std::string, noun> nouns; | ||
28 | std::map<int, std::map<int, int>> wn; | 43 | std::map<int, std::map<int, int>> wn; |
44 | std::map<std::string, std::set<std::string>> pronunciations; | ||
29 | 45 | ||
30 | void print_usage() | 46 | void print_usage() |
31 | { | 47 | { |
32 | std::cout << "Verbly Datafile Generator" << std::endl; | 48 | std::cout << "Verbly Datafile Generator" << std::endl; |
33 | std::cout << "-------------------------" << std::endl; | 49 | std::cout << "-------------------------" << std::endl; |
34 | std::cout << "Requires exactly four arguments." << std::endl; | 50 | std::cout << "Requires exactly six arguments." << std::endl; |
35 | std::cout << "1. The path to a VerbNet data directory." << std::endl; | 51 | std::cout << "1. The path to a VerbNet data directory." << std::endl; |
36 | std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; | 52 | std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl; |
37 | std::cout << "3. The path to an AGID infl.txt file." << std::endl; | 53 | std::cout << "3. The path to an AGID infl.txt file." << std::endl; |
38 | std::cout << "4. The path to a WordNet prolog data directory." << std::endl; | 54 | std::cout << "4. The path to a WordNet prolog data directory." << std::endl; |
39 | std::cout << "5. Datafile output path." << std::endl; | 55 | std::cout << "5. The path to a CMUDICT pronunciation file." << std::endl; |
56 | std::cout << "6. Datafile output path." << std::endl; | ||
40 | 57 | ||
41 | exit(1); | 58 | exit(1); |
42 | } | 59 | } |
60 | |||
61 | void db_error(sqlite3* ppdb, std::string) | ||
62 | { | ||
63 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
64 | sqlite3_close_v2(ppdb); | ||
65 | print_usage(); | ||
66 | } | ||
67 | |||
43 | /* | 68 | /* |
44 | void parse_group(xmlNodePtr top, std::string filename) | 69 | void parse_group(xmlNodePtr top, std::string filename) |
45 | { | 70 | { |
@@ -87,7 +112,7 @@ void parse_group(xmlNodePtr top, std::string filename) | |||
87 | 112 | ||
88 | int main(int argc, char** argv) | 113 | int main(int argc, char** argv) |
89 | { | 114 | { |
90 | if (argc != 6) | 115 | if (argc != 7) |
91 | { | 116 | { |
92 | print_usage(); | 117 | print_usage(); |
93 | } | 118 | } |
@@ -137,7 +162,7 @@ int main(int argc, char** argv) | |||
137 | closedir(dir);*/ | 162 | closedir(dir);*/ |
138 | 163 | ||
139 | // Get verbs from AGID | 164 | // Get verbs from AGID |
140 | std::cout << "Reading verb inflection..." << std::endl; | 165 | std::cout << "Reading inflections..." << std::endl; |
141 | 166 | ||
142 | std::ifstream agidfile(argv[3]); | 167 | std::ifstream agidfile(argv[3]); |
143 | if (!agidfile.is_open()) | 168 | if (!agidfile.is_open()) |
@@ -162,11 +187,7 @@ int main(int argc, char** argv) | |||
162 | int divider = line.find_first_of(" "); | 187 | int divider = line.find_first_of(" "); |
163 | std::string word = line.substr(0, divider); | 188 | std::string word = line.substr(0, divider); |
164 | line = line.substr(divider+1); | 189 | line = line.substr(divider+1); |
165 | 190 | char type = line[0]; | |
166 | if (line[0] != 'V') | ||
167 | { | ||
168 | continue; | ||
169 | } | ||
170 | 191 | ||
171 | if (line[1] == '?') | 192 | if (line[1] == '?') |
172 | { | 193 | { |
@@ -174,7 +195,7 @@ int main(int argc, char** argv) | |||
174 | } else { | 195 | } else { |
175 | line.erase(0, 3); | 196 | line.erase(0, 3); |
176 | } | 197 | } |
177 | 198 | ||
178 | std::vector<std::string> forms; | 199 | std::vector<std::string> forms; |
179 | while (!line.empty()) | 200 | while (!line.empty()) |
180 | { | 201 | { |
@@ -187,52 +208,129 @@ int main(int argc, char** argv) | |||
187 | inflection = line; | 208 | inflection = line; |
188 | line = ""; | 209 | line = ""; |
189 | } | 210 | } |
190 | 211 | ||
191 | if ((divider = inflection.find_first_of(",?")) != std::string::npos) | 212 | if ((divider = inflection.find_first_of(",?")) != std::string::npos) |
192 | { | 213 | { |
193 | inflection = inflection.substr(0, divider); | 214 | inflection = inflection.substr(0, divider); |
194 | } | 215 | } |
195 | 216 | ||
196 | forms.push_back(inflection); | 217 | forms.push_back(inflection); |
197 | } | 218 | } |
198 | 219 | ||
199 | verb v; | 220 | switch (type) |
200 | v.infinitive = word; | ||
201 | if (forms.size() == 4) | ||
202 | { | 221 | { |
203 | v.past_tense = forms[0]; | 222 | case 'V': |
204 | v.past_participle = forms[1]; | 223 | { |
205 | v.ing_form = forms[2]; | 224 | verb v; |
206 | v.s_form = forms[3]; | 225 | v.infinitive = word; |
207 | } else if (forms.size() == 3) | 226 | if (forms.size() == 4) |
227 | { | ||
228 | v.past_tense = forms[0]; | ||
229 | v.past_participle = forms[1]; | ||
230 | v.ing_form = forms[2]; | ||
231 | v.s_form = forms[3]; | ||
232 | } else if (forms.size() == 3) | ||
233 | { | ||
234 | v.past_tense = forms[0]; | ||
235 | v.past_participle = forms[0]; | ||
236 | v.ing_form = forms[1]; | ||
237 | v.s_form = forms[2]; | ||
238 | } else if (forms.size() == 8) | ||
239 | { | ||
240 | // As of AGID 2014.08.11, this is only "to be" | ||
241 | v.past_tense = forms[0]; | ||
242 | v.past_participle = forms[2]; | ||
243 | v.ing_form = forms[3]; | ||
244 | v.s_form = forms[4]; | ||
245 | } else { | ||
246 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
247 | // - may and shall do not conjugate the way we want them to | ||
248 | // - methinks only has a past tense and is an outlier | ||
249 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
250 | std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
251 | } | ||
252 | |||
253 | verbs[word] = v; | ||
254 | |||
255 | break; | ||
256 | } | ||
257 | |||
258 | case 'A': | ||
259 | { | ||
260 | adjective adj; | ||
261 | adj.base = word; | ||
262 | if (forms.size() == 2) | ||
263 | { | ||
264 | adj.comparative = forms[0]; | ||
265 | adj.superlative = forms[1]; | ||
266 | } else { | ||
267 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | ||
268 | std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
269 | } | ||
270 | |||
271 | adjectives[word] = adj; | ||
272 | |||
273 | break; | ||
274 | } | ||
275 | |||
276 | case 'N': | ||
277 | { | ||
278 | noun n; | ||
279 | n.singular = word; | ||
280 | if (forms.size() == 1) | ||
281 | { | ||
282 | n.plural = forms[0]; | ||
283 | } else { | ||
284 | // As of AGID 2014.08.11, this is non-existent. | ||
285 | std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
286 | } | ||
287 | |||
288 | nouns[word] = n; | ||
289 | |||
290 | break; | ||
291 | } | ||
292 | } | ||
293 | } | ||
294 | |||
295 | // Pronunciations | ||
296 | std::cout << "Reading pronunciations..." << std::endl; | ||
297 | |||
298 | std::ifstream pronfile(argv[5]); | ||
299 | if (!pronfile.is_open()) | ||
300 | { | ||
301 | std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl; | ||
302 | print_usage(); | ||
303 | } | ||
304 | |||
305 | for (;;) | ||
306 | { | ||
307 | std::string line; | ||
308 | if (!getline(pronfile, line)) | ||
208 | { | 309 | { |
209 | v.past_tense = forms[0]; | 310 | break; |
210 | v.past_participle = forms[0]; | 311 | } |
211 | v.ing_form = forms[1]; | 312 | |
212 | v.s_form = forms[2]; | 313 | if (line.back() == '\r') |
213 | } else if (forms.size() == 8) | ||
214 | { | 314 | { |
215 | // As of AGID 2014.08.11, this is only "to be" | 315 | line.pop_back(); |
216 | v.past_tense = forms[0]; | ||
217 | v.past_participle = forms[2]; | ||
218 | v.ing_form = forms[3]; | ||
219 | v.s_form = forms[4]; | ||
220 | } else { | ||
221 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
222 | // - may and shall do not conjugate the way we want them to | ||
223 | // - methinks only has a past tense and is an outlier | ||
224 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
225 | std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
226 | } | 316 | } |
227 | 317 | ||
228 | verbs[word] = v; | 318 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); |
319 | std::smatch phoneme_data; | ||
320 | if (std::regex_search(line, phoneme_data, phoneme)) | ||
321 | { | ||
322 | std::string canonical(phoneme_data[1]); | ||
323 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
324 | |||
325 | pronunciations[canonical].insert(phoneme_data[2]); | ||
326 | } | ||
229 | } | 327 | } |
230 | 328 | ||
231 | // Start writing output | 329 | // Start writing output |
232 | std::cout << "Writing output..." << std::endl; | 330 | std::cout << "Writing schema..." << std::endl; |
233 | 331 | ||
234 | sqlite3* ppdb; | 332 | sqlite3* ppdb; |
235 | if (sqlite3_open_v2(argv[5], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | 333 | if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) |
236 | { | 334 | { |
237 | std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; | 335 | std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; |
238 | print_usage(); | 336 | print_usage(); |
@@ -278,47 +376,82 @@ int main(int argc, char** argv) | |||
278 | sqlite3_stmt* schmstmt; | 376 | sqlite3_stmt* schmstmt; |
279 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) | 377 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) |
280 | { | 378 | { |
281 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 379 | db_error(ppdb, query); |
282 | sqlite3_close_v2(ppdb); | ||
283 | print_usage(); | ||
284 | } | 380 | } |
285 | 381 | ||
286 | if (sqlite3_step(schmstmt) != SQLITE_DONE) | 382 | if (sqlite3_step(schmstmt) != SQLITE_DONE) |
287 | { | 383 | { |
288 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 384 | db_error(ppdb, query); |
289 | sqlite3_close_v2(ppdb); | ||
290 | print_usage(); | ||
291 | } | 385 | } |
292 | 386 | ||
293 | sqlite3_finalize(schmstmt); | 387 | sqlite3_finalize(schmstmt); |
294 | } | 388 | } |
295 | 389 | ||
296 | std::cout << "Writing verbs..." << std::endl; | ||
297 | for (auto& mapping : verbs) | ||
298 | { | 390 | { |
299 | sqlite3_stmt* ppstmt; | 391 | progress ppgs("Writing verbs...", verbs.size()); |
300 | std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); | 392 | for (auto& mapping : verbs) |
301 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
302 | { | 393 | { |
303 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 394 | sqlite3_stmt* ppstmt; |
304 | sqlite3_close_v2(ppdb); | 395 | std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); |
305 | print_usage(); | 396 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) |
306 | } | 397 | { |
398 | db_error(ppdb, query); | ||
399 | } | ||
307 | 400 | ||
308 | sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC); | 401 | sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC); |
309 | sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC); | 402 | sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC); |
310 | sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC); | 403 | sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC); |
311 | sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC); | 404 | sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC); |
312 | sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC); | 405 | sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC); |
313 | 406 | ||
314 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 407 | if (sqlite3_step(ppstmt) != SQLITE_DONE) |
315 | { | 408 | { |
316 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 409 | db_error(ppdb, query); |
317 | sqlite3_close_v2(ppdb); | 410 | } |
318 | print_usage(); | ||
319 | } | ||
320 | 411 | ||
321 | sqlite3_finalize(ppstmt); | 412 | sqlite3_finalize(ppstmt); |
413 | |||
414 | std::string canonical(mapping.second.infinitive); | ||
415 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
416 | if (pronunciations.count(canonical) == 1) | ||
417 | { | ||
418 | query = "SELECT last_insert_rowid()"; | ||
419 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
420 | { | ||
421 | db_error(ppdb, query); | ||
422 | } | ||
423 | |||
424 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
425 | { | ||
426 | db_error(ppdb, query); | ||
427 | } | ||
428 | |||
429 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
430 | |||
431 | sqlite3_finalize(ppstmt); | ||
432 | |||
433 | for (auto pronunciation : pronunciations[canonical]) | ||
434 | { | ||
435 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation) VALUES (?, ?)"; | ||
436 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
437 | { | ||
438 | db_error(ppdb, query); | ||
439 | } | ||
440 | |||
441 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
442 | sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC); | ||
443 | |||
444 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
445 | { | ||
446 | db_error(ppdb, query); | ||
447 | } | ||
448 | |||
449 | sqlite3_finalize(ppstmt); | ||
450 | } | ||
451 | } | ||
452 | |||
453 | ppgs.update(); | ||
454 | } | ||
322 | } | 455 | } |
323 | 456 | ||
324 | // Get nouns/adjectives/adverbs from WordNet | 457 | // Get nouns/adjectives/adverbs from WordNet |
@@ -342,110 +475,1046 @@ int main(int argc, char** argv) | |||
342 | wnpref += '/'; | 475 | wnpref += '/'; |
343 | } | 476 | } |
344 | 477 | ||
345 | std::cout << "Reading words from WordNet..." << std::endl; | 478 | // s table |
346 | std::ifstream wnsfile(wnpref + "wn_s.pl"); | ||
347 | if (!wnsfile.is_open()) | ||
348 | { | 479 | { |
349 | std::cout << "Invalid WordNet data directory." << std::endl; | 480 | std::ifstream wnsfile(wnpref + "wn_s.pl"); |
350 | print_usage(); | 481 | if (!wnsfile.is_open()) |
482 | { | ||
483 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
484 | print_usage(); | ||
485 | } | ||
486 | |||
487 | std::list<std::string> lines; | ||
488 | for (;;) | ||
489 | { | ||
490 | std::string line; | ||
491 | if (!getline(wnsfile, line)) | ||
492 | { | ||
493 | break; | ||
494 | } | ||
495 | |||
496 | if (line.back() == '\r') | ||
497 | { | ||
498 | line.pop_back(); | ||
499 | } | ||
500 | |||
501 | lines.push_back(line); | ||
502 | } | ||
503 | |||
504 | progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); | ||
505 | for (auto line : lines) | ||
506 | { | ||
507 | ppgs.update(); | ||
508 | |||
509 | std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',"); | ||
510 | std::smatch relation_data; | ||
511 | if (!std::regex_search(line, relation_data, relation)) | ||
512 | { | ||
513 | continue; | ||
514 | } | ||
515 | |||
516 | int synset_id = stoi(relation_data[1]); | ||
517 | int wnum = stoi(relation_data[2]); | ||
518 | std::string word = relation_data[3]; | ||
519 | |||
520 | std::string query; | ||
521 | switch (synset_id / 100000000) | ||
522 | { | ||
523 | case 1: // Noun | ||
524 | { | ||
525 | if (nouns.count(word) == 1) | ||
526 | { | ||
527 | query = "INSERT INTO nouns (singular, plural) VALUES (?, ?)"; | ||
528 | } else { | ||
529 | query = "INSERT INTO nouns (singular) VALUES (?)"; | ||
530 | } | ||
531 | |||
532 | break; | ||
533 | } | ||
534 | |||
535 | case 2: // Verb | ||
536 | { | ||
537 | // Ignore | ||
538 | |||
539 | break; | ||
540 | } | ||
541 | |||
542 | case 3: // Adjective | ||
543 | { | ||
544 | if (adjectives.count(word) == 1) | ||
545 | { | ||
546 | query = "INSERT INTO adjectives (base_form, comparative, superlative) VALUES (?, ?, ?)"; | ||
547 | } else { | ||
548 | query = "INSERT INTO adjectives (base_form) VALUES (?)"; | ||
549 | } | ||
550 | |||
551 | break; | ||
552 | } | ||
553 | |||
554 | case 4: // Adverb | ||
555 | { | ||
556 | if (adjectives.count(word) == 1) | ||
557 | { | ||
558 | query = "INSERT INTO adverbs (base_form, comparative, superlative) VALUES (?, ?, ?)"; | ||
559 | } else { | ||
560 | query = "INSERT INTO adverbs (base_form) VALUES (?)"; | ||
561 | } | ||
562 | |||
563 | break; | ||
564 | } | ||
565 | } | ||
566 | |||
567 | sqlite3_stmt* ppstmt; | ||
568 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
569 | { | ||
570 | db_error(ppdb, query); | ||
571 | } | ||
572 | |||
573 | sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC); | ||
574 | switch (synset_id / 100000000) | ||
575 | { | ||
576 | case 1: // Noun | ||
577 | { | ||
578 | if (nouns.count(word) == 1) | ||
579 | { | ||
580 | sqlite3_bind_text(ppstmt, 2, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC); | ||
581 | } | ||
582 | |||
583 | break; | ||
584 | } | ||
585 | |||
586 | case 3: // Adjective | ||
587 | case 4: // Adverb | ||
588 | { | ||
589 | if (adjectives.count(word) == 1) | ||
590 | { | ||
591 | sqlite3_bind_text(ppstmt, 2, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_STATIC); | ||
592 | sqlite3_bind_text(ppstmt, 3, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_STATIC); | ||
593 | } | ||
594 | |||
595 | break; | ||
596 | } | ||
597 | } | ||
598 | |||
599 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
600 | { | ||
601 | db_error(ppdb, query); | ||
602 | } | ||
603 | |||
604 | sqlite3_finalize(ppstmt); | ||
605 | |||
606 | query = "SELECT last_insert_rowid()"; | ||
607 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
608 | { | ||
609 | db_error(ppdb, query); | ||
610 | } | ||
611 | |||
612 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
613 | { | ||
614 | db_error(ppdb, query); | ||
615 | } | ||
616 | |||
617 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
618 | wn[synset_id][wnum] = rowid; | ||
619 | |||
620 | sqlite3_finalize(ppstmt); | ||
621 | |||
622 | std::string canonical(word); | ||
623 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
624 | if (pronunciations.count(canonical) == 1) | ||
625 | { | ||
626 | for (auto pronunciation : pronunciations[canonical]) | ||
627 | { | ||
628 | switch (synset_id / 100000000) | ||
629 | { | ||
630 | case 1: // Noun | ||
631 | { | ||
632 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation) VALUES (?, ?)"; | ||
633 | |||
634 | break; | ||
635 | } | ||
636 | |||
637 | case 3: // Adjective | ||
638 | { | ||
639 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation) VALUES (?, ?)"; | ||
640 | |||
641 | break; | ||
642 | } | ||
643 | |||
644 | case 4: // Adverb | ||
645 | { | ||
646 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation) VALUES (?, ?)"; | ||
647 | |||
648 | break; | ||
649 | } | ||
650 | } | ||
651 | |||
652 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
653 | { | ||
654 | db_error(ppdb, query); | ||
655 | } | ||
656 | |||
657 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
658 | sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC); | ||
659 | |||
660 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
661 | { | ||
662 | db_error(ppdb, query); | ||
663 | } | ||
664 | |||
665 | sqlite3_finalize(ppstmt); | ||
666 | } | ||
667 | } | ||
668 | } | ||
351 | } | 669 | } |
352 | 670 | ||
353 | for (;;) | 671 | // While we're working on s |
354 | { | 672 | { |
355 | std::string line; | 673 | progress ppgs("Writing word synonyms...", wn.size()); |
356 | if (!getline(wnsfile, line)) | 674 | for (auto sense : wn) |
357 | { | 675 | { |
358 | break; | 676 | ppgs.update(); |
677 | |||
678 | for (auto word1 : sense.second) | ||
679 | { | ||
680 | for (auto word2 : sense.second) | ||
681 | { | ||
682 | if (word1 != word2) | ||
683 | { | ||
684 | std::string query; | ||
685 | switch (sense.first / 100000000) | ||
686 | { | ||
687 | case 1: // Noun | ||
688 | { | ||
689 | query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
690 | |||
691 | break; | ||
692 | } | ||
693 | |||
694 | case 2: // Verb | ||
695 | { | ||
696 | // Ignore | ||
697 | |||
698 | break; | ||
699 | } | ||
700 | |||
701 | case 3: // Adjective | ||
702 | { | ||
703 | query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
704 | |||
705 | break; | ||
706 | } | ||
707 | |||
708 | case 4: // Adverb | ||
709 | { | ||
710 | query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
711 | |||
712 | break; | ||
713 | } | ||
714 | } | ||
715 | |||
716 | sqlite3_stmt* ppstmt; | ||
717 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
718 | { | ||
719 | db_error(ppdb, query); | ||
720 | } | ||
721 | |||
722 | sqlite3_bind_int(ppstmt, 1, word1.second); | ||
723 | sqlite3_bind_int(ppstmt, 2, word2.second); | ||
724 | |||
725 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
726 | { | ||
727 | db_error(ppdb, query); | ||
728 | } | ||
729 | |||
730 | sqlite3_finalize(ppstmt); | ||
731 | } | ||
732 | } | ||
733 | } | ||
734 | } | ||
735 | } | ||
736 | |||
737 | // ant table | ||
738 | { | ||
739 | std::ifstream wnantfile(wnpref + "wn_ant.pl"); | ||
740 | if (!wnantfile.is_open()) | ||
741 | { | ||
742 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
743 | print_usage(); | ||
359 | } | 744 | } |
745 | |||
746 | std::list<std::string> lines; | ||
747 | for (;;) | ||
748 | { | ||
749 | std::string line; | ||
750 | if (!getline(wnantfile, line)) | ||
751 | { | ||
752 | break; | ||
753 | } | ||
360 | 754 | ||
361 | if (line.back() == '\r') | 755 | if (line.back() == '\r') |
756 | { | ||
757 | line.pop_back(); | ||
758 | } | ||
759 | |||
760 | lines.push_back(line); | ||
761 | } | ||
762 | |||
763 | progress ppgs("Writing antonyms...", lines.size()); | ||
764 | for (auto line : lines) | ||
362 | { | 765 | { |
363 | line.pop_back(); | 766 | ppgs.update(); |
767 | |||
768 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | ||
769 | std::smatch relation_data; | ||
770 | if (!std::regex_search(line, relation_data, relation)) | ||
771 | { | ||
772 | continue; | ||
773 | } | ||
774 | |||
775 | int synset_id_1 = stoi(relation_data[1]); | ||
776 | int wnum_1 = stoi(relation_data[2]); | ||
777 | int synset_id_2 = stoi(relation_data[3]); | ||
778 | int wnum_2 = stoi(relation_data[4]); | ||
779 | |||
780 | std::string query; | ||
781 | switch (synset_id_1 / 100000000) | ||
782 | { | ||
783 | case 1: // Noun | ||
784 | { | ||
785 | query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
786 | |||
787 | break; | ||
788 | } | ||
789 | |||
790 | case 2: // Verb | ||
791 | { | ||
792 | // Ignore | ||
793 | |||
794 | break; | ||
795 | } | ||
796 | |||
797 | case 3: // Adjective | ||
798 | { | ||
799 | query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
800 | |||
801 | break; | ||
802 | } | ||
803 | |||
804 | case 4: // Adverb | ||
805 | { | ||
806 | query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
807 | |||
808 | break; | ||
809 | } | ||
810 | } | ||
811 | |||
812 | sqlite3_stmt* ppstmt; | ||
813 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
814 | { | ||
815 | db_error(ppdb, query); | ||
816 | } | ||
817 | |||
818 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
819 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
820 | |||
821 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
822 | { | ||
823 | db_error(ppdb, query); | ||
824 | } | ||
825 | |||
826 | sqlite3_finalize(ppstmt); | ||
827 | } | ||
828 | } | ||
829 | |||
830 | // at table | ||
831 | { | ||
832 | std::ifstream wnatfile(wnpref + "wn_at.pl"); | ||
833 | if (!wnatfile.is_open()) | ||
834 | { | ||
835 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
836 | print_usage(); | ||
364 | } | 837 | } |
838 | |||
839 | std::list<std::string> lines; | ||
840 | for (;;) | ||
841 | { | ||
842 | std::string line; | ||
843 | if (!getline(wnatfile, line)) | ||
844 | { | ||
845 | break; | ||
846 | } | ||
365 | 847 | ||
366 | std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',"); | 848 | if (line.back() == '\r') |
367 | std::smatch relation_data; | 849 | { |
368 | if (!std::regex_search(line, relation_data, relation)) | 850 | line.pop_back(); |
851 | } | ||
852 | |||
853 | lines.push_back(line); | ||
854 | } | ||
855 | |||
856 | progress ppgs("Writing variations...", lines.size()); | ||
857 | for (auto line : lines) | ||
369 | { | 858 | { |
370 | continue; | 859 | ppgs.update(); |
860 | |||
861 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); | ||
862 | std::smatch relation_data; | ||
863 | if (!std::regex_search(line, relation_data, relation)) | ||
864 | { | ||
865 | continue; | ||
866 | } | ||
867 | |||
868 | int synset_id_1 = stoi(relation_data[1]); | ||
869 | int synset_id_2 = stoi(relation_data[2]); | ||
870 | std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); | ||
871 | |||
872 | for (auto mapping1 : wn[synset_id_1]) | ||
873 | { | ||
874 | for (auto mapping2 : wn[synset_id_2]) | ||
875 | { | ||
876 | sqlite3_stmt* ppstmt; | ||
877 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
878 | { | ||
879 | db_error(ppdb, query); | ||
880 | } | ||
881 | |||
882 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
883 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
884 | |||
885 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
886 | { | ||
887 | db_error(ppdb, query); | ||
888 | } | ||
889 | |||
890 | sqlite3_finalize(ppstmt); | ||
891 | } | ||
892 | } | ||
371 | } | 893 | } |
894 | } | ||
895 | |||
896 | // hyp table | ||
897 | { | ||
898 | std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); | ||
899 | if (!wnhypfile.is_open()) | ||
900 | { | ||
901 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
902 | print_usage(); | ||
903 | } | ||
904 | |||
905 | std::list<std::string> lines; | ||
906 | for (;;) | ||
907 | { | ||
908 | std::string line; | ||
909 | if (!getline(wnhypfile, line)) | ||
910 | { | ||
911 | break; | ||
912 | } | ||
372 | 913 | ||
373 | int synset_id = stoi(relation_data[1]); | 914 | if (line.back() == '\r') |
374 | int wnum = stoi(relation_data[2]); | 915 | { |
375 | std::string word = relation_data[3]; | 916 | line.pop_back(); |
917 | } | ||
918 | |||
919 | lines.push_back(line); | ||
920 | } | ||
376 | 921 | ||
377 | std::string query; | 922 | progress ppgs("Writing hypernyms...", lines.size()); |
378 | switch (synset_id / 100000000) | 923 | for (auto line : lines) |
379 | { | 924 | { |
380 | case 1: // Noun | 925 | ppgs.update(); |
926 | |||
927 | std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); | ||
928 | std::smatch relation_data; | ||
929 | if (!std::regex_search(line, relation_data, relation)) | ||
930 | { | ||
931 | continue; | ||
932 | } | ||
933 | |||
934 | int synset_id_1 = stoi(relation_data[1]); | ||
935 | int synset_id_2 = stoi(relation_data[2]); | ||
936 | std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); | ||
937 | |||
938 | for (auto mapping1 : wn[synset_id_1]) | ||
939 | { | ||
940 | for (auto mapping2 : wn[synset_id_2]) | ||
941 | { | ||
942 | sqlite3_stmt* ppstmt; | ||
943 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
944 | { | ||
945 | db_error(ppdb, query); | ||
946 | } | ||
947 | |||
948 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
949 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
950 | |||
951 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
952 | { | ||
953 | db_error(ppdb, query); | ||
954 | } | ||
955 | |||
956 | sqlite3_finalize(ppstmt); | ||
957 | } | ||
958 | } | ||
959 | } | ||
960 | } | ||
961 | |||
962 | // ins table | ||
963 | { | ||
964 | std::ifstream wninsfile(wnpref + "wn_ins.pl"); | ||
965 | if (!wninsfile.is_open()) | ||
966 | { | ||
967 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
968 | print_usage(); | ||
969 | } | ||
970 | |||
971 | std::list<std::string> lines; | ||
972 | for (;;) | ||
973 | { | ||
974 | std::string line; | ||
975 | if (!getline(wninsfile, line)) | ||
381 | { | 976 | { |
382 | query = "INSERT INTO nouns (form) VALUES (?)"; | ||
383 | |||
384 | break; | 977 | break; |
385 | } | 978 | } |
979 | |||
980 | if (line.back() == '\r') | ||
981 | { | ||
982 | line.pop_back(); | ||
983 | } | ||
386 | 984 | ||
387 | case 2: // Verb | 985 | lines.push_back(line); |
986 | } | ||
987 | |||
988 | progress ppgs("Writing instantiations...", lines.size()); | ||
989 | for (auto line : lines) | ||
990 | { | ||
991 | ppgs.update(); | ||
992 | |||
993 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); | ||
994 | std::smatch relation_data; | ||
995 | if (!std::regex_search(line, relation_data, relation)) | ||
996 | { | ||
997 | continue; | ||
998 | } | ||
999 | |||
1000 | int synset_id_1 = stoi(relation_data[1]); | ||
1001 | int synset_id_2 = stoi(relation_data[2]); | ||
1002 | std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); | ||
1003 | |||
1004 | for (auto mapping1 : wn[synset_id_1]) | ||
1005 | { | ||
1006 | for (auto mapping2 : wn[synset_id_2]) | ||
1007 | { | ||
1008 | sqlite3_stmt* ppstmt; | ||
1009 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1010 | { | ||
1011 | db_error(ppdb, query); | ||
1012 | } | ||
1013 | |||
1014 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
1015 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1016 | |||
1017 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1018 | { | ||
1019 | db_error(ppdb, query); | ||
1020 | } | ||
1021 | |||
1022 | sqlite3_finalize(ppstmt); | ||
1023 | } | ||
1024 | } | ||
1025 | } | ||
1026 | } | ||
1027 | |||
1028 | // mm table | ||
1029 | { | ||
1030 | std::ifstream wnmmfile(wnpref + "wn_mm.pl"); | ||
1031 | if (!wnmmfile.is_open()) | ||
1032 | { | ||
1033 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1034 | print_usage(); | ||
1035 | } | ||
1036 | |||
1037 | std::list<std::string> lines; | ||
1038 | for (;;) | ||
1039 | { | ||
1040 | std::string line; | ||
1041 | if (!getline(wnmmfile, line)) | ||
388 | { | 1042 | { |
389 | // Ignore | ||
390 | |||
391 | break; | 1043 | break; |
392 | } | 1044 | } |
1045 | |||
1046 | if (line.back() == '\r') | ||
1047 | { | ||
1048 | line.pop_back(); | ||
1049 | } | ||
1050 | |||
1051 | lines.push_back(line); | ||
1052 | } | ||
1053 | |||
1054 | progress ppgs("Writing member meronyms...", lines.size()); | ||
1055 | for (auto line : lines) | ||
1056 | { | ||
1057 | ppgs.update(); | ||
393 | 1058 | ||
394 | case 3: // Adjective | 1059 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); |
1060 | std::smatch relation_data; | ||
1061 | if (!std::regex_search(line, relation_data, relation)) | ||
1062 | { | ||
1063 | continue; | ||
1064 | } | ||
1065 | |||
1066 | int synset_id_1 = stoi(relation_data[1]); | ||
1067 | int synset_id_2 = stoi(relation_data[2]); | ||
1068 | std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
1069 | |||
1070 | for (auto mapping1 : wn[synset_id_1]) | ||
1071 | { | ||
1072 | for (auto mapping2 : wn[synset_id_2]) | ||
1073 | { | ||
1074 | sqlite3_stmt* ppstmt; | ||
1075 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1076 | { | ||
1077 | db_error(ppdb, query); | ||
1078 | } | ||
1079 | |||
1080 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
1081 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1082 | |||
1083 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1084 | { | ||
1085 | db_error(ppdb, query); | ||
1086 | } | ||
1087 | |||
1088 | sqlite3_finalize(ppstmt); | ||
1089 | } | ||
1090 | } | ||
1091 | } | ||
1092 | } | ||
1093 | |||
1094 | // ms table | ||
1095 | { | ||
1096 | std::ifstream wnmsfile(wnpref + "wn_ms.pl"); | ||
1097 | if (!wnmsfile.is_open()) | ||
1098 | { | ||
1099 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1100 | print_usage(); | ||
1101 | } | ||
1102 | |||
1103 | std::list<std::string> lines; | ||
1104 | for (;;) | ||
1105 | { | ||
1106 | std::string line; | ||
1107 | if (!getline(wnmsfile, line)) | ||
395 | { | 1108 | { |
396 | query = "INSERT INTO adjectives (form) VALUES (?)"; | ||
397 | |||
398 | break; | 1109 | break; |
399 | } | 1110 | } |
1111 | |||
1112 | if (line.back() == '\r') | ||
1113 | { | ||
1114 | line.pop_back(); | ||
1115 | } | ||
400 | 1116 | ||
401 | case 4: // Adverb | 1117 | lines.push_back(line); |
1118 | } | ||
1119 | |||
1120 | progress ppgs("Writing substance meronyms...", lines.size()); | ||
1121 | for (auto line : lines) | ||
1122 | { | ||
1123 | ppgs.update(); | ||
1124 | |||
1125 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); | ||
1126 | std::smatch relation_data; | ||
1127 | if (!std::regex_search(line, relation_data, relation)) | ||
1128 | { | ||
1129 | continue; | ||
1130 | } | ||
1131 | |||
1132 | int synset_id_1 = stoi(relation_data[1]); | ||
1133 | int synset_id_2 = stoi(relation_data[2]); | ||
1134 | std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
1135 | |||
1136 | for (auto mapping1 : wn[synset_id_1]) | ||
1137 | { | ||
1138 | for (auto mapping2 : wn[synset_id_2]) | ||
1139 | { | ||
1140 | sqlite3_stmt* ppstmt; | ||
1141 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1142 | { | ||
1143 | db_error(ppdb, query); | ||
1144 | } | ||
1145 | |||
1146 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
1147 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1148 | |||
1149 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1150 | { | ||
1151 | db_error(ppdb, query); | ||
1152 | } | ||
1153 | |||
1154 | sqlite3_finalize(ppstmt); | ||
1155 | } | ||
1156 | } | ||
1157 | } | ||
1158 | } | ||
1159 | |||
1160 | // mp table | ||
1161 | { | ||
1162 | std::ifstream wnmpfile(wnpref + "wn_mp.pl"); | ||
1163 | if (!wnmpfile.is_open()) | ||
1164 | { | ||
1165 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1166 | print_usage(); | ||
1167 | } | ||
1168 | |||
1169 | std::list<std::string> lines; | ||
1170 | for (;;) | ||
1171 | { | ||
1172 | std::string line; | ||
1173 | if (!getline(wnmpfile, line)) | ||
402 | { | 1174 | { |
403 | query = "INSERT INTO adverbs (form) VALUES (?)"; | ||
404 | |||
405 | break; | 1175 | break; |
406 | } | 1176 | } |
1177 | |||
1178 | if (line.back() == '\r') | ||
1179 | { | ||
1180 | line.pop_back(); | ||
1181 | } | ||
1182 | |||
1183 | lines.push_back(line); | ||
407 | } | 1184 | } |
408 | 1185 | ||
409 | sqlite3_stmt* ppstmt; | 1186 | progress ppgs("Writing part meronyms...", lines.size()); |
410 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | 1187 | for (auto line : lines) |
1188 | { | ||
1189 | ppgs.update(); | ||
1190 | |||
1191 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | ||
1192 | std::smatch relation_data; | ||
1193 | if (!std::regex_search(line, relation_data, relation)) | ||
1194 | { | ||
1195 | continue; | ||
1196 | } | ||
1197 | |||
1198 | int synset_id_1 = stoi(relation_data[1]); | ||
1199 | int synset_id_2 = stoi(relation_data[2]); | ||
1200 | std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
1201 | |||
1202 | for (auto mapping1 : wn[synset_id_1]) | ||
1203 | { | ||
1204 | for (auto mapping2 : wn[synset_id_2]) | ||
1205 | { | ||
1206 | sqlite3_stmt* ppstmt; | ||
1207 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1208 | { | ||
1209 | db_error(ppdb, query); | ||
1210 | } | ||
1211 | |||
1212 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
1213 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1214 | |||
1215 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1216 | { | ||
1217 | db_error(ppdb, query); | ||
1218 | } | ||
1219 | |||
1220 | sqlite3_finalize(ppstmt); | ||
1221 | } | ||
1222 | } | ||
1223 | } | ||
1224 | } | ||
1225 | |||
1226 | // per table | ||
1227 | { | ||
1228 | std::ifstream wnperfile(wnpref + "wn_per.pl"); | ||
1229 | if (!wnperfile.is_open()) | ||
411 | { | 1230 | { |
412 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 1231 | std::cout << "Invalid WordNet data directory." << std::endl; |
413 | sqlite3_close_v2(ppdb); | ||
414 | print_usage(); | 1232 | print_usage(); |
415 | } | 1233 | } |
1234 | |||
1235 | std::list<std::string> lines; | ||
1236 | for (;;) | ||
1237 | { | ||
1238 | std::string line; | ||
1239 | if (!getline(wnperfile, line)) | ||
1240 | { | ||
1241 | break; | ||
1242 | } | ||
1243 | |||
1244 | if (line.back() == '\r') | ||
1245 | { | ||
1246 | line.pop_back(); | ||
1247 | } | ||
1248 | |||
1249 | lines.push_back(line); | ||
1250 | } | ||
1251 | |||
1252 | progress ppgs("Writing pertainyms and mannernyms...", lines.size()); | ||
1253 | for (auto line : lines) | ||
1254 | { | ||
1255 | ppgs.update(); | ||
1256 | |||
1257 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); | ||
1258 | std::smatch relation_data; | ||
1259 | if (!std::regex_search(line, relation_data, relation)) | ||
1260 | { | ||
1261 | continue; | ||
1262 | } | ||
1263 | |||
1264 | int synset_id_1 = stoi(relation_data[1]); | ||
1265 | int wnum_1 = stoi(relation_data[2]); | ||
1266 | int synset_id_2 = stoi(relation_data[3]); | ||
1267 | int wnum_2 = stoi(relation_data[4]); | ||
1268 | std::string query; | ||
1269 | switch (synset_id_1 / 100000000) | ||
1270 | { | ||
1271 | case 3: // Adjective | ||
1272 | { | ||
1273 | // This is a pertainym, the second word should be a noun | ||
1274 | // Technically it can be an adjective but we're ignoring that | ||
1275 | if (synset_id_2 / 100000000 != 1) | ||
1276 | { | ||
1277 | continue; | ||
1278 | } | ||
1279 | |||
1280 | query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; | ||
1281 | |||
1282 | break; | ||
1283 | } | ||
1284 | |||
1285 | case 4: // Adverb | ||
1286 | { | ||
1287 | // This is a mannernym, the second word should be an adjective | ||
1288 | if (synset_id_2 / 100000000 != 3) | ||
1289 | { | ||
1290 | continue; | ||
1291 | } | ||
1292 | |||
1293 | query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; | ||
1294 | |||
1295 | break; | ||
1296 | } | ||
1297 | } | ||
1298 | |||
1299 | sqlite3_stmt* ppstmt; | ||
1300 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1301 | { | ||
1302 | db_error(ppdb, query); | ||
1303 | } | ||
416 | 1304 | ||
417 | sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC); | 1305 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); |
1306 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
418 | 1307 | ||
419 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1308 | if (sqlite3_step(ppstmt) != SQLITE_DONE) |
1309 | { | ||
1310 | db_error(ppdb, query); | ||
1311 | } | ||
1312 | |||
1313 | sqlite3_finalize(ppstmt); | ||
1314 | } | ||
1315 | } | ||
1316 | |||
1317 | // sa table | ||
1318 | { | ||
1319 | std::ifstream wnsafile(wnpref + "wn_sa.pl"); | ||
1320 | if (!wnsafile.is_open()) | ||
420 | { | 1321 | { |
421 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 1322 | std::cout << "Invalid WordNet data directory." << std::endl; |
422 | sqlite3_close_v2(ppdb); | ||
423 | print_usage(); | 1323 | print_usage(); |
424 | } | 1324 | } |
1325 | |||
1326 | std::list<std::string> lines; | ||
1327 | for (;;) | ||
1328 | { | ||
1329 | std::string line; | ||
1330 | if (!getline(wnsafile, line)) | ||
1331 | { | ||
1332 | break; | ||
1333 | } | ||
1334 | |||
1335 | if (line.back() == '\r') | ||
1336 | { | ||
1337 | line.pop_back(); | ||
1338 | } | ||
1339 | |||
1340 | lines.push_back(line); | ||
1341 | } | ||
425 | 1342 | ||
426 | sqlite3_finalize(ppstmt); | 1343 | progress ppgs("Writing specifications...", lines.size()); |
1344 | for (auto line : lines) | ||
1345 | { | ||
1346 | ppgs.update(); | ||
1347 | |||
1348 | std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); | ||
1349 | std::smatch relation_data; | ||
1350 | if (!std::regex_search(line, relation_data, relation)) | ||
1351 | { | ||
1352 | continue; | ||
1353 | } | ||
1354 | |||
1355 | int synset_id_1 = stoi(relation_data[1]); | ||
1356 | int wnum_1 = stoi(relation_data[2]); | ||
1357 | int synset_id_2 = stoi(relation_data[3]); | ||
1358 | int wnum_2 = stoi(relation_data[4]); | ||
1359 | std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); | ||
1360 | |||
1361 | sqlite3_stmt* ppstmt; | ||
1362 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1363 | { | ||
1364 | db_error(ppdb, query); | ||
1365 | } | ||
1366 | |||
1367 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
1368 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
427 | 1369 | ||
428 | query = "SELECT last_insert_rowid()"; | 1370 | if (sqlite3_step(ppstmt) != SQLITE_DONE) |
429 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | 1371 | { |
1372 | db_error(ppdb, query); | ||
1373 | } | ||
1374 | |||
1375 | sqlite3_finalize(ppstmt); | ||
1376 | } | ||
1377 | } | ||
1378 | /* | ||
1379 | // sim table | ||
1380 | { | ||
1381 | std::ifstream wnsimfile(wnpref + "wn_sim.pl"); | ||
1382 | if (!wnsimfile.is_open()) | ||
430 | { | 1383 | { |
431 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 1384 | std::cout << "Invalid WordNet data directory." << std::endl; |
432 | sqlite3_close_v2(ppdb); | ||
433 | print_usage(); | 1385 | print_usage(); |
434 | } | 1386 | } |
1387 | |||
1388 | std::list<std::string> lines; | ||
1389 | for (;;) | ||
1390 | { | ||
1391 | std::string line; | ||
1392 | if (!getline(wnsimfile, line)) | ||
1393 | { | ||
1394 | break; | ||
1395 | } | ||
1396 | |||
1397 | if (line.back() == '\r') | ||
1398 | { | ||
1399 | line.pop_back(); | ||
1400 | } | ||
1401 | |||
1402 | lines.push_back(line); | ||
1403 | } | ||
435 | 1404 | ||
436 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | 1405 | progress ppgs("Writing sense synonyms...", lines.size()); |
1406 | for (auto line : lines) | ||
1407 | { | ||
1408 | ppgs.update(); | ||
1409 | |||
1410 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); | ||
1411 | std::smatch relation_data; | ||
1412 | if (!std::regex_search(line, relation_data, relation)) | ||
1413 | { | ||
1414 | continue; | ||
1415 | } | ||
1416 | |||
1417 | int synset_id_1 = stoi(relation_data[1]); | ||
1418 | int synset_id_2 = stoi(relation_data[2]); | ||
1419 | std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); | ||
1420 | |||
1421 | for (auto mapping1 : wn[synset_id_1]) | ||
1422 | { | ||
1423 | for (auto mapping2 : wn[synset_id_2]) | ||
1424 | { | ||
1425 | sqlite3_stmt* ppstmt; | ||
1426 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1427 | { | ||
1428 | db_error(ppdb, query); | ||
1429 | } | ||
1430 | |||
1431 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
1432 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1433 | |||
1434 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1435 | { | ||
1436 | db_error(ppdb, query); | ||
1437 | } | ||
1438 | |||
1439 | sqlite3_reset(ppstmt); | ||
1440 | sqlite3_clear_bindings(ppstmt); | ||
1441 | |||
1442 | sqlite3_bind_int(ppstmt, 1, mapping2.second); | ||
1443 | sqlite3_bind_int(ppstmt, 2, mapping1.second); | ||
1444 | |||
1445 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1446 | { | ||
1447 | db_error(ppdb, query); | ||
1448 | } | ||
1449 | |||
1450 | sqlite3_finalize(ppstmt); | ||
1451 | } | ||
1452 | } | ||
1453 | } | ||
1454 | } | ||
1455 | */ | ||
1456 | // syntax table | ||
1457 | { | ||
1458 | std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); | ||
1459 | if (!wnsyntaxfile.is_open()) | ||
437 | { | 1460 | { |
438 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 1461 | std::cout << "Invalid WordNet data directory." << std::endl; |
439 | sqlite3_close_v2(ppdb); | ||
440 | print_usage(); | 1462 | print_usage(); |
441 | } | 1463 | } |
1464 | |||
1465 | std::list<std::string> lines; | ||
1466 | for (;;) | ||
1467 | { | ||
1468 | std::string line; | ||
1469 | if (!getline(wnsyntaxfile, line)) | ||
1470 | { | ||
1471 | break; | ||
1472 | } | ||
442 | 1473 | ||
443 | wn[synset_id][wnum] = sqlite3_column_int(ppstmt, 0); | 1474 | if (line.back() == '\r') |
1475 | { | ||
1476 | line.pop_back(); | ||
1477 | } | ||
1478 | |||
1479 | lines.push_back(line); | ||
1480 | } | ||
444 | 1481 | ||
445 | sqlite3_finalize(ppstmt); | 1482 | progress ppgs("Writing adjective syntax markers...", lines.size()); |
1483 | for (auto line : lines) | ||
1484 | { | ||
1485 | ppgs.update(); | ||
1486 | |||
1487 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); | ||
1488 | std::smatch relation_data; | ||
1489 | if (!std::regex_search(line, relation_data, relation)) | ||
1490 | { | ||
1491 | continue; | ||
1492 | } | ||
1493 | |||
1494 | int synset_id = stoi(relation_data[1]); | ||
1495 | int wnum = stoi(relation_data[2]); | ||
1496 | std::string syn = relation_data[3]; | ||
1497 | std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); | ||
1498 | |||
1499 | sqlite3_stmt* ppstmt; | ||
1500 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1501 | { | ||
1502 | db_error(ppdb, query); | ||
1503 | } | ||
1504 | |||
1505 | sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_STATIC); | ||
1506 | sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); | ||
1507 | |||
1508 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1509 | { | ||
1510 | db_error(ppdb, query); | ||
1511 | } | ||
1512 | |||
1513 | sqlite3_finalize(ppstmt); | ||
1514 | } | ||
446 | } | 1515 | } |
447 | 1516 | ||
448 | sqlite3_close_v2(ppdb); | 1517 | sqlite3_close_v2(ppdb); |
449 | 1518 | ||
450 | std::cout << "Done." << std::endl; | 1519 | std::cout << "Done." << std::endl; |
451 | } \ No newline at end of file | 1520 | } |