about summary refs log tree commit diff stats
path: root/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator.cpp')
-rw-r--r--generator.cpp1520
1 files changed, 0 insertions, 1520 deletions
diff --git a/generator.cpp b/generator.cpp deleted file mode 100644 index 305d121..0000000 --- a/generator.cpp +++ /dev/null
@@ -1,1520 +0,0 @@
1#include <libxml/parser.h>
2#include <iostream>
3#include <dirent.h>
4#include <set>
5#include <map>
6#include <string>
7#include <vector>
8#include <fstream>
9#include <sqlite3.h>
10#include <sstream>
11#include <regex>
12#include <list>
13#include "progress.h"
14
// One verb together with its inflected forms, as read from the AGID
// inflection list. Entries are stored in the global `verbs` map under the
// infinitive. Forms that could not be determined are left as empty strings.
struct verb
{
  std::string infinitive;       // base form, e.g. the headword of the AGID line
  std::string past_tense;       // simple past
  std::string past_participle;  // same as past_tense when AGID lists three forms
  std::string ing_form;         // present participle / gerund
  std::string s_form;           // third-person singular present
};
22
// Comparison forms of an adjective. The same record type is also consulted
// for adverbs when WordNet entries are written out. Forms that were not
// available stay empty.
struct adjective
{
  std::string base;         // positive (dictionary) form
  std::string comparative;  // e.g. "bigger"
  std::string superlative;  // e.g. "biggest"
};
28
// Singular/plural pair for a noun taken from the inflection list. The plural
// is left empty when the source line did not supply exactly one form.
struct noun
{
  std::string singular;  // headword
  std::string plural;    // inflected plural, possibly empty
};
33
// A named collection of member words.
// NOTE(review): in this file the type is only referenced from the disabled
// VerbNet parsing code, where `id` is the class identifier and `members` the
// MEMBER names; confirm before relying on that meaning.
struct group
{
  std::string id;                 // identifier of the group
  std::set<std::string> members;  // unique member names
};
38
39std::map<std::string, group> groups;
40std::map<std::string, verb> verbs;
41std::map<std::string, adjective> adjectives;
42std::map<std::string, noun> nouns;
43std::map<int, std::map<int, int>> wn;
44std::map<std::string, std::set<std::string>> pronunciations;
45
/**
 * Writes the command-line usage banner to standard output, one line at a
 * time, and then terminates the process with exit status 1.
 *
 * This function does not return to its caller.
 */
void print_usage()
{
  static const char* const usage_lines[] = {
    "Verbly Datafile Generator",
    "-------------------------",
    "Requires exactly six arguments.",
    "1. The path to a VerbNet data directory.",
    "2. The path to a SemLink vnpbMappings file.",
    "3. The path to an AGID infl.txt file.",
    "4. The path to a WordNet prolog data directory.",
    "5. The path to a CMUDICT pronunciation file.",
    "6. Datafile output path."
  };

  for (const char* text : usage_lines)
  {
    std::cout << text << std::endl;
  }

  exit(1);
}
60
61void db_error(sqlite3* ppdb, std::string)
62{
63 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
64 sqlite3_close_v2(ppdb);
65 print_usage();
66}
67
68/*
69void parse_group(xmlNodePtr top, std::string filename)
70{
71 xmlChar* key = xmlGetProp(top, (xmlChar*) "ID");
72 if (key == 0)
73 {
74 std::cout << "Bad VerbNet file format: " << filename << std::endl;
75 print_usage();
76 }
77 std::string vnid = key;
78 vnid = vnid.substr(vnid.find_first_of("-")+1);
79 xmlFree(key);
80
81 group g;
82 g.id = vnid;
83
84 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
85 {
86 if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS"))
87 {
88 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next)
89 {
90 if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER"))
91 {
92 key = xmlGetProp(member, (xmlChar*) "name");
93 g.members.insert(key);
94 xmlFree(key);
95 }
96 }
97 } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES"))
98 {
99 for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next)
100 {
101 if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME"))
102 {
103 for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
104 {
105
106 }
107 }
108 }
109 }
110 }
111}*/
112
113int main(int argc, char** argv)
114{
115 if (argc != 7)
116 {
117 print_usage();
118 }
119
120 /*DIR* dir;
121 if ((dir = opendir(argv[1])) == nullptr)
122 {
123 std::cout << "Invalid VerbNet data directory." << std::endl;
124
125 print_usage();
126 }
127
128 struct dirent* ent;
129 while ((ent = readdir(dir)) != nullptr)
130 {
131 std::string filename(argv[1]);
132 if (filename.back() != '/')
133 {
134 filename += '/';
135 }
136
137 filename += ent->d_name;
138 //std::cout << ent->d_name << std::endl;
139
140 if (filename.rfind(".xml") != filename.size() - 4)
141 {
142 continue;
143 }
144
145 xmlDocPtr doc = xmlParseFile(filename.c_str());
146 if (doc == nullptr)
147 {
148 std::cout << "Error opening " << filename << std::endl;
149 print_usage();
150 }
151
152 xmlNodePtr top = xmlDocGetRootElement(doc);
153 if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS")))
154 {
155 std::cout << "Bad VerbNet file format: " << filename << std::endl;
156 print_usage();
157 }
158
159 parse_group(top, filename);
160 }
161
162 closedir(dir);*/
163
164 // Get verbs from AGID
165 std::cout << "Reading inflections..." << std::endl;
166
167 std::ifstream agidfile(argv[3]);
168 if (!agidfile.is_open())
169 {
170 std::cout << "Could not open AGID file: " << argv[3] << std::endl;
171 print_usage();
172 }
173
174 for (;;)
175 {
176 std::string line;
177 if (!getline(agidfile, line))
178 {
179 break;
180 }
181
182 if (line.back() == '\r')
183 {
184 line.pop_back();
185 }
186
187 int divider = line.find_first_of(" ");
188 std::string word = line.substr(0, divider);
189 line = line.substr(divider+1);
190 char type = line[0];
191
192 if (line[1] == '?')
193 {
194 line.erase(0, 4);
195 } else {
196 line.erase(0, 3);
197 }
198
199 std::vector<std::string> forms;
200 while (!line.empty())
201 {
202 std::string inflection;
203 if ((divider = line.find(" | ")) != std::string::npos)
204 {
205 inflection = line.substr(0, divider);
206 line = line.substr(divider + 3);
207 } else {
208 inflection = line;
209 line = "";
210 }
211
212 if ((divider = inflection.find_first_of(",?")) != std::string::npos)
213 {
214 inflection = inflection.substr(0, divider);
215 }
216
217 forms.push_back(inflection);
218 }
219
220 switch (type)
221 {
222 case 'V':
223 {
224 verb v;
225 v.infinitive = word;
226 if (forms.size() == 4)
227 {
228 v.past_tense = forms[0];
229 v.past_participle = forms[1];
230 v.ing_form = forms[2];
231 v.s_form = forms[3];
232 } else if (forms.size() == 3)
233 {
234 v.past_tense = forms[0];
235 v.past_participle = forms[0];
236 v.ing_form = forms[1];
237 v.s_form = forms[2];
238 } else if (forms.size() == 8)
239 {
240 // As of AGID 2014.08.11, this is only "to be"
241 v.past_tense = forms[0];
242 v.past_participle = forms[2];
243 v.ing_form = forms[3];
244 v.s_form = forms[4];
245 } else {
246 // Words that don't fit the cases above as of AGID 2014.08.11:
247 // - may and shall do not conjugate the way we want them to
248 // - methinks only has a past tense and is an outlier
249 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
250 std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl;
251 }
252
253 verbs[word] = v;
254
255 break;
256 }
257
258 case 'A':
259 {
260 adjective adj;
261 adj.base = word;
262 if (forms.size() == 2)
263 {
264 adj.comparative = forms[0];
265 adj.superlative = forms[1];
266 } else {
267 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
268 std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl;
269 }
270
271 adjectives[word] = adj;
272
273 break;
274 }
275
276 case 'N':
277 {
278 noun n;
279 n.singular = word;
280 if (forms.size() == 1)
281 {
282 n.plural = forms[0];
283 } else {
284 // As of AGID 2014.08.11, this is non-existent.
285 std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl;
286 }
287
288 nouns[word] = n;
289
290 break;
291 }
292 }
293 }
294
295 // Pronunciations
296 std::cout << "Reading pronunciations..." << std::endl;
297
298 std::ifstream pronfile(argv[5]);
299 if (!pronfile.is_open())
300 {
301 std::cout << "Could not open CMUDICT file: " << argv[5] << std::endl;
302 print_usage();
303 }
304
305 for (;;)
306 {
307 std::string line;
308 if (!getline(pronfile, line))
309 {
310 break;
311 }
312
313 if (line.back() == '\r')
314 {
315 line.pop_back();
316 }
317
318 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)");
319 std::smatch phoneme_data;
320 if (std::regex_search(line, phoneme_data, phoneme))
321 {
322 std::string canonical(phoneme_data[1]);
323 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
324
325 pronunciations[canonical].insert(phoneme_data[2]);
326 }
327 }
328
329 // Start writing output
330 std::cout << "Writing schema..." << std::endl;
331
332 sqlite3* ppdb;
333 if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK)
334 {
335 std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl;
336 print_usage();
337 }
338
339 std::ifstream schemafile("schema.sql");
340 if (!schemafile.is_open())
341 {
342 std::cout << "Could not find schema file" << std::endl;
343 print_usage();
344 }
345
346 std::stringstream schemabuilder;
347 for (;;)
348 {
349 std::string line;
350 if (!getline(schemafile, line))
351 {
352 break;
353 }
354
355 if (line.back() == '\r')
356 {
357 line.pop_back();
358 }
359
360 schemabuilder << line << std::endl;
361 }
362
363 std::string schema = schemabuilder.str();
364 while (!schema.empty())
365 {
366 std::string query;
367 int divider = schema.find(";");
368 if (divider != std::string::npos)
369 {
370 query = schema.substr(0, divider+1);
371 schema = schema.substr(divider+2);
372 } else {
373 break;
374 }
375
376 sqlite3_stmt* schmstmt;
377 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK)
378 {
379 db_error(ppdb, query);
380 }
381
382 if (sqlite3_step(schmstmt) != SQLITE_DONE)
383 {
384 db_error(ppdb, query);
385 }
386
387 sqlite3_finalize(schmstmt);
388 }
389
390 {
391 progress ppgs("Writing verbs...", verbs.size());
392 for (auto& mapping : verbs)
393 {
394 sqlite3_stmt* ppstmt;
395 std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)");
396 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
397 {
398 db_error(ppdb, query);
399 }
400
401 sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC);
402 sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC);
403 sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC);
404 sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC);
405 sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC);
406
407 if (sqlite3_step(ppstmt) != SQLITE_DONE)
408 {
409 db_error(ppdb, query);
410 }
411
412 sqlite3_finalize(ppstmt);
413
414 std::string canonical(mapping.second.infinitive);
415 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
416 if (pronunciations.count(canonical) == 1)
417 {
418 query = "SELECT last_insert_rowid()";
419 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
420 {
421 db_error(ppdb, query);
422 }
423
424 if (sqlite3_step(ppstmt) != SQLITE_ROW)
425 {
426 db_error(ppdb, query);
427 }
428
429 int rowid = sqlite3_column_int(ppstmt, 0);
430
431 sqlite3_finalize(ppstmt);
432
433 for (auto pronunciation : pronunciations[canonical])
434 {
435 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation) VALUES (?, ?)";
436 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
437 {
438 db_error(ppdb, query);
439 }
440
441 sqlite3_bind_int(ppstmt, 1, rowid);
442 sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC);
443
444 if (sqlite3_step(ppstmt) != SQLITE_DONE)
445 {
446 db_error(ppdb, query);
447 }
448
449 sqlite3_finalize(ppstmt);
450 }
451 }
452
453 ppgs.update();
454 }
455 }
456
457 // Get nouns/adjectives/adverbs from WordNet
458 // Useful relations:
459 // - s: master list
460 // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness)
461 // - at: variation (e.g. a measurement can be standard or nonstandard)
462 // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue)
463 // - ins: instantiation (do we need this? let's see)
464 // - mm: member meronymy/holonymy (e.g. family/mother, family/child)
465 // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire)
466 // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber)
467 // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska)
468 // mannernymy (e.g. something done quickly is done in a manner that is quick)
469 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
470 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
471 // - syntax: positioning flags for some adjectives
472 std::string wnpref {argv[4]};
473 if (wnpref.back() != '/')
474 {
475 wnpref += '/';
476 }
477
478 // s table
479 {
480 std::ifstream wnsfile(wnpref + "wn_s.pl");
481 if (!wnsfile.is_open())
482 {
483 std::cout << "Invalid WordNet data directory." << std::endl;
484 print_usage();
485 }
486
487 std::list<std::string> lines;
488 for (;;)
489 {
490 std::string line;
491 if (!getline(wnsfile, line))
492 {
493 break;
494 }
495
496 if (line.back() == '\r')
497 {
498 line.pop_back();
499 }
500
501 lines.push_back(line);
502 }
503
504 progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size());
505 for (auto line : lines)
506 {
507 ppgs.update();
508
509 std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',");
510 std::smatch relation_data;
511 if (!std::regex_search(line, relation_data, relation))
512 {
513 continue;
514 }
515
516 int synset_id = stoi(relation_data[1]);
517 int wnum = stoi(relation_data[2]);
518 std::string word = relation_data[3];
519
520 std::string query;
521 switch (synset_id / 100000000)
522 {
523 case 1: // Noun
524 {
525 if (nouns.count(word) == 1)
526 {
527 query = "INSERT INTO nouns (singular, plural) VALUES (?, ?)";
528 } else {
529 query = "INSERT INTO nouns (singular) VALUES (?)";
530 }
531
532 break;
533 }
534
535 case 2: // Verb
536 {
537 // Ignore
538
539 break;
540 }
541
542 case 3: // Adjective
543 {
544 if (adjectives.count(word) == 1)
545 {
546 query = "INSERT INTO adjectives (base_form, comparative, superlative) VALUES (?, ?, ?)";
547 } else {
548 query = "INSERT INTO adjectives (base_form) VALUES (?)";
549 }
550
551 break;
552 }
553
554 case 4: // Adverb
555 {
556 if (adjectives.count(word) == 1)
557 {
558 query = "INSERT INTO adverbs (base_form, comparative, superlative) VALUES (?, ?, ?)";
559 } else {
560 query = "INSERT INTO adverbs (base_form) VALUES (?)";
561 }
562
563 break;
564 }
565 }
566
567 sqlite3_stmt* ppstmt;
568 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
569 {
570 db_error(ppdb, query);
571 }
572
573 sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC);
574 switch (synset_id / 100000000)
575 {
576 case 1: // Noun
577 {
578 if (nouns.count(word) == 1)
579 {
580 sqlite3_bind_text(ppstmt, 2, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_STATIC);
581 }
582
583 break;
584 }
585
586 case 3: // Adjective
587 case 4: // Adverb
588 {
589 if (adjectives.count(word) == 1)
590 {
591 sqlite3_bind_text(ppstmt, 2, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_STATIC);
592 sqlite3_bind_text(ppstmt, 3, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_STATIC);
593 }
594
595 break;
596 }
597 }
598
599 if (sqlite3_step(ppstmt) != SQLITE_DONE)
600 {
601 db_error(ppdb, query);
602 }
603
604 sqlite3_finalize(ppstmt);
605
606 query = "SELECT last_insert_rowid()";
607 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
608 {
609 db_error(ppdb, query);
610 }
611
612 if (sqlite3_step(ppstmt) != SQLITE_ROW)
613 {
614 db_error(ppdb, query);
615 }
616
617 int rowid = sqlite3_column_int(ppstmt, 0);
618 wn[synset_id][wnum] = rowid;
619
620 sqlite3_finalize(ppstmt);
621
622 std::string canonical(word);
623 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
624 if (pronunciations.count(canonical) == 1)
625 {
626 for (auto pronunciation : pronunciations[canonical])
627 {
628 switch (synset_id / 100000000)
629 {
630 case 1: // Noun
631 {
632 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation) VALUES (?, ?)";
633
634 break;
635 }
636
637 case 3: // Adjective
638 {
639 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation) VALUES (?, ?)";
640
641 break;
642 }
643
644 case 4: // Adverb
645 {
646 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation) VALUES (?, ?)";
647
648 break;
649 }
650 }
651
652 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
653 {
654 db_error(ppdb, query);
655 }
656
657 sqlite3_bind_int(ppstmt, 1, rowid);
658 sqlite3_bind_text(ppstmt, 2, pronunciation.c_str(), pronunciation.length(), SQLITE_STATIC);
659
660 if (sqlite3_step(ppstmt) != SQLITE_DONE)
661 {
662 db_error(ppdb, query);
663 }
664
665 sqlite3_finalize(ppstmt);
666 }
667 }
668 }
669 }
670
671 // While we're working on s
672 {
673 progress ppgs("Writing word synonyms...", wn.size());
674 for (auto sense : wn)
675 {
676 ppgs.update();
677
678 for (auto word1 : sense.second)
679 {
680 for (auto word2 : sense.second)
681 {
682 if (word1 != word2)
683 {
684 std::string query;
685 switch (sense.first / 100000000)
686 {
687 case 1: // Noun
688 {
689 query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)";
690
691 break;
692 }
693
694 case 2: // Verb
695 {
696 // Ignore
697
698 break;
699 }
700
701 case 3: // Adjective
702 {
703 query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
704
705 break;
706 }
707
708 case 4: // Adverb
709 {
710 query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)";
711
712 break;
713 }
714 }
715
716 sqlite3_stmt* ppstmt;
717 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
718 {
719 db_error(ppdb, query);
720 }
721
722 sqlite3_bind_int(ppstmt, 1, word1.second);
723 sqlite3_bind_int(ppstmt, 2, word2.second);
724
725 if (sqlite3_step(ppstmt) != SQLITE_DONE)
726 {
727 db_error(ppdb, query);
728 }
729
730 sqlite3_finalize(ppstmt);
731 }
732 }
733 }
734 }
735 }
736
737 // ant table
738 {
739 std::ifstream wnantfile(wnpref + "wn_ant.pl");
740 if (!wnantfile.is_open())
741 {
742 std::cout << "Invalid WordNet data directory." << std::endl;
743 print_usage();
744 }
745
746 std::list<std::string> lines;
747 for (;;)
748 {
749 std::string line;
750 if (!getline(wnantfile, line))
751 {
752 break;
753 }
754
755 if (line.back() == '\r')
756 {
757 line.pop_back();
758 }
759
760 lines.push_back(line);
761 }
762
763 progress ppgs("Writing antonyms...", lines.size());
764 for (auto line : lines)
765 {
766 ppgs.update();
767
768 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
769 std::smatch relation_data;
770 if (!std::regex_search(line, relation_data, relation))
771 {
772 continue;
773 }
774
775 int synset_id_1 = stoi(relation_data[1]);
776 int wnum_1 = stoi(relation_data[2]);
777 int synset_id_2 = stoi(relation_data[3]);
778 int wnum_2 = stoi(relation_data[4]);
779
780 std::string query;
781 switch (synset_id_1 / 100000000)
782 {
783 case 1: // Noun
784 {
785 query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)";
786
787 break;
788 }
789
790 case 2: // Verb
791 {
792 // Ignore
793
794 break;
795 }
796
797 case 3: // Adjective
798 {
799 query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
800
801 break;
802 }
803
804 case 4: // Adverb
805 {
806 query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)";
807
808 break;
809 }
810 }
811
812 sqlite3_stmt* ppstmt;
813 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
814 {
815 db_error(ppdb, query);
816 }
817
818 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
819 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
820
821 if (sqlite3_step(ppstmt) != SQLITE_DONE)
822 {
823 db_error(ppdb, query);
824 }
825
826 sqlite3_finalize(ppstmt);
827 }
828 }
829
830 // at table
831 {
832 std::ifstream wnatfile(wnpref + "wn_at.pl");
833 if (!wnatfile.is_open())
834 {
835 std::cout << "Invalid WordNet data directory." << std::endl;
836 print_usage();
837 }
838
839 std::list<std::string> lines;
840 for (;;)
841 {
842 std::string line;
843 if (!getline(wnatfile, line))
844 {
845 break;
846 }
847
848 if (line.back() == '\r')
849 {
850 line.pop_back();
851 }
852
853 lines.push_back(line);
854 }
855
856 progress ppgs("Writing variations...", lines.size());
857 for (auto line : lines)
858 {
859 ppgs.update();
860
861 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\.");
862 std::smatch relation_data;
863 if (!std::regex_search(line, relation_data, relation))
864 {
865 continue;
866 }
867
868 int synset_id_1 = stoi(relation_data[1]);
869 int synset_id_2 = stoi(relation_data[2]);
870 std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)");
871
872 for (auto mapping1 : wn[synset_id_1])
873 {
874 for (auto mapping2 : wn[synset_id_2])
875 {
876 sqlite3_stmt* ppstmt;
877 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
878 {
879 db_error(ppdb, query);
880 }
881
882 sqlite3_bind_int(ppstmt, 1, mapping1.second);
883 sqlite3_bind_int(ppstmt, 2, mapping2.second);
884
885 if (sqlite3_step(ppstmt) != SQLITE_DONE)
886 {
887 db_error(ppdb, query);
888 }
889
890 sqlite3_finalize(ppstmt);
891 }
892 }
893 }
894 }
895
896 // hyp table
897 {
898 std::ifstream wnhypfile(wnpref + "wn_hyp.pl");
899 if (!wnhypfile.is_open())
900 {
901 std::cout << "Invalid WordNet data directory." << std::endl;
902 print_usage();
903 }
904
905 std::list<std::string> lines;
906 for (;;)
907 {
908 std::string line;
909 if (!getline(wnhypfile, line))
910 {
911 break;
912 }
913
914 if (line.back() == '\r')
915 {
916 line.pop_back();
917 }
918
919 lines.push_back(line);
920 }
921
922 progress ppgs("Writing hypernyms...", lines.size());
923 for (auto line : lines)
924 {
925 ppgs.update();
926
927 std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\.");
928 std::smatch relation_data;
929 if (!std::regex_search(line, relation_data, relation))
930 {
931 continue;
932 }
933
934 int synset_id_1 = stoi(relation_data[1]);
935 int synset_id_2 = stoi(relation_data[2]);
936 std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)");
937
938 for (auto mapping1 : wn[synset_id_1])
939 {
940 for (auto mapping2 : wn[synset_id_2])
941 {
942 sqlite3_stmt* ppstmt;
943 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
944 {
945 db_error(ppdb, query);
946 }
947
948 sqlite3_bind_int(ppstmt, 1, mapping1.second);
949 sqlite3_bind_int(ppstmt, 2, mapping2.second);
950
951 if (sqlite3_step(ppstmt) != SQLITE_DONE)
952 {
953 db_error(ppdb, query);
954 }
955
956 sqlite3_finalize(ppstmt);
957 }
958 }
959 }
960 }
961
962 // ins table
963 {
964 std::ifstream wninsfile(wnpref + "wn_ins.pl");
965 if (!wninsfile.is_open())
966 {
967 std::cout << "Invalid WordNet data directory." << std::endl;
968 print_usage();
969 }
970
971 std::list<std::string> lines;
972 for (;;)
973 {
974 std::string line;
975 if (!getline(wninsfile, line))
976 {
977 break;
978 }
979
980 if (line.back() == '\r')
981 {
982 line.pop_back();
983 }
984
985 lines.push_back(line);
986 }
987
988 progress ppgs("Writing instantiations...", lines.size());
989 for (auto line : lines)
990 {
991 ppgs.update();
992
993 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\.");
994 std::smatch relation_data;
995 if (!std::regex_search(line, relation_data, relation))
996 {
997 continue;
998 }
999
1000 int synset_id_1 = stoi(relation_data[1]);
1001 int synset_id_2 = stoi(relation_data[2]);
1002 std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)");
1003
1004 for (auto mapping1 : wn[synset_id_1])
1005 {
1006 for (auto mapping2 : wn[synset_id_2])
1007 {
1008 sqlite3_stmt* ppstmt;
1009 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1010 {
1011 db_error(ppdb, query);
1012 }
1013
1014 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1015 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1016
1017 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1018 {
1019 db_error(ppdb, query);
1020 }
1021
1022 sqlite3_finalize(ppstmt);
1023 }
1024 }
1025 }
1026 }
1027
1028 // mm table
1029 {
1030 std::ifstream wnmmfile(wnpref + "wn_mm.pl");
1031 if (!wnmmfile.is_open())
1032 {
1033 std::cout << "Invalid WordNet data directory." << std::endl;
1034 print_usage();
1035 }
1036
1037 std::list<std::string> lines;
1038 for (;;)
1039 {
1040 std::string line;
1041 if (!getline(wnmmfile, line))
1042 {
1043 break;
1044 }
1045
1046 if (line.back() == '\r')
1047 {
1048 line.pop_back();
1049 }
1050
1051 lines.push_back(line);
1052 }
1053
1054 progress ppgs("Writing member meronyms...", lines.size());
1055 for (auto line : lines)
1056 {
1057 ppgs.update();
1058
1059 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\.");
1060 std::smatch relation_data;
1061 if (!std::regex_search(line, relation_data, relation))
1062 {
1063 continue;
1064 }
1065
1066 int synset_id_1 = stoi(relation_data[1]);
1067 int synset_id_2 = stoi(relation_data[2]);
1068 std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
1069
1070 for (auto mapping1 : wn[synset_id_1])
1071 {
1072 for (auto mapping2 : wn[synset_id_2])
1073 {
1074 sqlite3_stmt* ppstmt;
1075 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1076 {
1077 db_error(ppdb, query);
1078 }
1079
1080 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1081 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1082
1083 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1084 {
1085 db_error(ppdb, query);
1086 }
1087
1088 sqlite3_finalize(ppstmt);
1089 }
1090 }
1091 }
1092 }
1093
1094 // ms table
1095 {
1096 std::ifstream wnmsfile(wnpref + "wn_ms.pl");
1097 if (!wnmsfile.is_open())
1098 {
1099 std::cout << "Invalid WordNet data directory." << std::endl;
1100 print_usage();
1101 }
1102
1103 std::list<std::string> lines;
1104 for (;;)
1105 {
1106 std::string line;
1107 if (!getline(wnmsfile, line))
1108 {
1109 break;
1110 }
1111
1112 if (line.back() == '\r')
1113 {
1114 line.pop_back();
1115 }
1116
1117 lines.push_back(line);
1118 }
1119
1120 progress ppgs("Writing substance meronyms...", lines.size());
1121 for (auto line : lines)
1122 {
1123 ppgs.update();
1124
1125 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\.");
1126 std::smatch relation_data;
1127 if (!std::regex_search(line, relation_data, relation))
1128 {
1129 continue;
1130 }
1131
1132 int synset_id_1 = stoi(relation_data[1]);
1133 int synset_id_2 = stoi(relation_data[2]);
1134 std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
1135
1136 for (auto mapping1 : wn[synset_id_1])
1137 {
1138 for (auto mapping2 : wn[synset_id_2])
1139 {
1140 sqlite3_stmt* ppstmt;
1141 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1142 {
1143 db_error(ppdb, query);
1144 }
1145
1146 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1147 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1148
1149 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1150 {
1151 db_error(ppdb, query);
1152 }
1153
1154 sqlite3_finalize(ppstmt);
1155 }
1156 }
1157 }
1158 }
1159
1160 // mp table
1161 {
1162 std::ifstream wnmpfile(wnpref + "wn_mp.pl");
1163 if (!wnmpfile.is_open())
1164 {
1165 std::cout << "Invalid WordNet data directory." << std::endl;
1166 print_usage();
1167 }
1168
1169 std::list<std::string> lines;
1170 for (;;)
1171 {
1172 std::string line;
1173 if (!getline(wnmpfile, line))
1174 {
1175 break;
1176 }
1177
1178 if (line.back() == '\r')
1179 {
1180 line.pop_back();
1181 }
1182
1183 lines.push_back(line);
1184 }
1185
1186 progress ppgs("Writing part meronyms...", lines.size());
1187 for (auto line : lines)
1188 {
1189 ppgs.update();
1190
1191 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\.");
1192 std::smatch relation_data;
1193 if (!std::regex_search(line, relation_data, relation))
1194 {
1195 continue;
1196 }
1197
1198 int synset_id_1 = stoi(relation_data[1]);
1199 int synset_id_2 = stoi(relation_data[2]);
1200 std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
1201
1202 for (auto mapping1 : wn[synset_id_1])
1203 {
1204 for (auto mapping2 : wn[synset_id_2])
1205 {
1206 sqlite3_stmt* ppstmt;
1207 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1208 {
1209 db_error(ppdb, query);
1210 }
1211
1212 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1213 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1214
1215 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1216 {
1217 db_error(ppdb, query);
1218 }
1219
1220 sqlite3_finalize(ppstmt);
1221 }
1222 }
1223 }
1224 }
1225
1226 // per table
1227 {
1228 std::ifstream wnperfile(wnpref + "wn_per.pl");
1229 if (!wnperfile.is_open())
1230 {
1231 std::cout << "Invalid WordNet data directory." << std::endl;
1232 print_usage();
1233 }
1234
1235 std::list<std::string> lines;
1236 for (;;)
1237 {
1238 std::string line;
1239 if (!getline(wnperfile, line))
1240 {
1241 break;
1242 }
1243
1244 if (line.back() == '\r')
1245 {
1246 line.pop_back();
1247 }
1248
1249 lines.push_back(line);
1250 }
1251
1252 progress ppgs("Writing pertainyms and mannernyms...", lines.size());
1253 for (auto line : lines)
1254 {
1255 ppgs.update();
1256
1257 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\.");
1258 std::smatch relation_data;
1259 if (!std::regex_search(line, relation_data, relation))
1260 {
1261 continue;
1262 }
1263
1264 int synset_id_1 = stoi(relation_data[1]);
1265 int wnum_1 = stoi(relation_data[2]);
1266 int synset_id_2 = stoi(relation_data[3]);
1267 int wnum_2 = stoi(relation_data[4]);
1268 std::string query;
1269 switch (synset_id_1 / 100000000)
1270 {
1271 case 3: // Adjective
1272 {
1273 // This is a pertainym, the second word should be a noun
1274 // Technically it can be an adjective but we're ignoring that
1275 if (synset_id_2 / 100000000 != 1)
1276 {
1277 continue;
1278 }
1279
1280 query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)";
1281
1282 break;
1283 }
1284
1285 case 4: // Adverb
1286 {
1287 // This is a mannernym, the second word should be an adjective
1288 if (synset_id_2 / 100000000 != 3)
1289 {
1290 continue;
1291 }
1292
1293 query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)";
1294
1295 break;
1296 }
1297 }
1298
1299 sqlite3_stmt* ppstmt;
1300 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1301 {
1302 db_error(ppdb, query);
1303 }
1304
1305 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1306 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1307
1308 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1309 {
1310 db_error(ppdb, query);
1311 }
1312
1313 sqlite3_finalize(ppstmt);
1314 }
1315 }
1316
1317 // sa table
1318 {
1319 std::ifstream wnsafile(wnpref + "wn_sa.pl");
1320 if (!wnsafile.is_open())
1321 {
1322 std::cout << "Invalid WordNet data directory." << std::endl;
1323 print_usage();
1324 }
1325
1326 std::list<std::string> lines;
1327 for (;;)
1328 {
1329 std::string line;
1330 if (!getline(wnsafile, line))
1331 {
1332 break;
1333 }
1334
1335 if (line.back() == '\r')
1336 {
1337 line.pop_back();
1338 }
1339
1340 lines.push_back(line);
1341 }
1342
1343 progress ppgs("Writing specifications...", lines.size());
1344 for (auto line : lines)
1345 {
1346 ppgs.update();
1347
1348 std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\.");
1349 std::smatch relation_data;
1350 if (!std::regex_search(line, relation_data, relation))
1351 {
1352 continue;
1353 }
1354
1355 int synset_id_1 = stoi(relation_data[1]);
1356 int wnum_1 = stoi(relation_data[2]);
1357 int synset_id_2 = stoi(relation_data[3]);
1358 int wnum_2 = stoi(relation_data[4]);
1359 std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)");
1360
1361 sqlite3_stmt* ppstmt;
1362 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1363 {
1364 db_error(ppdb, query);
1365 }
1366
1367 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1368 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1369
1370 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1371 {
1372 db_error(ppdb, query);
1373 }
1374
1375 sqlite3_finalize(ppstmt);
1376 }
1377 }
1378 /*
1379 // sim table
1380 {
1381 std::ifstream wnsimfile(wnpref + "wn_sim.pl");
1382 if (!wnsimfile.is_open())
1383 {
1384 std::cout << "Invalid WordNet data directory." << std::endl;
1385 print_usage();
1386 }
1387
1388 std::list<std::string> lines;
1389 for (;;)
1390 {
1391 std::string line;
1392 if (!getline(wnsimfile, line))
1393 {
1394 break;
1395 }
1396
1397 if (line.back() == '\r')
1398 {
1399 line.pop_back();
1400 }
1401
1402 lines.push_back(line);
1403 }
1404
1405 progress ppgs("Writing sense synonyms...", lines.size());
1406 for (auto line : lines)
1407 {
1408 ppgs.update();
1409
1410 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\.");
1411 std::smatch relation_data;
1412 if (!std::regex_search(line, relation_data, relation))
1413 {
1414 continue;
1415 }
1416
1417 int synset_id_1 = stoi(relation_data[1]);
1418 int synset_id_2 = stoi(relation_data[2]);
1419 std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)");
1420
1421 for (auto mapping1 : wn[synset_id_1])
1422 {
1423 for (auto mapping2 : wn[synset_id_2])
1424 {
1425 sqlite3_stmt* ppstmt;
1426 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1427 {
1428 db_error(ppdb, query);
1429 }
1430
1431 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1432 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1433
1434 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1435 {
1436 db_error(ppdb, query);
1437 }
1438
1439 sqlite3_reset(ppstmt);
1440 sqlite3_clear_bindings(ppstmt);
1441
1442 sqlite3_bind_int(ppstmt, 1, mapping2.second);
1443 sqlite3_bind_int(ppstmt, 2, mapping1.second);
1444
1445 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1446 {
1447 db_error(ppdb, query);
1448 }
1449
1450 sqlite3_finalize(ppstmt);
1451 }
1452 }
1453 }
1454 }
1455 */
1456 // syntax table
1457 {
1458 std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl");
1459 if (!wnsyntaxfile.is_open())
1460 {
1461 std::cout << "Invalid WordNet data directory." << std::endl;
1462 print_usage();
1463 }
1464
1465 std::list<std::string> lines;
1466 for (;;)
1467 {
1468 std::string line;
1469 if (!getline(wnsyntaxfile, line))
1470 {
1471 break;
1472 }
1473
1474 if (line.back() == '\r')
1475 {
1476 line.pop_back();
1477 }
1478
1479 lines.push_back(line);
1480 }
1481
1482 progress ppgs("Writing adjective syntax markers...", lines.size());
1483 for (auto line : lines)
1484 {
1485 ppgs.update();
1486
1487 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\.");
1488 std::smatch relation_data;
1489 if (!std::regex_search(line, relation_data, relation))
1490 {
1491 continue;
1492 }
1493
1494 int synset_id = stoi(relation_data[1]);
1495 int wnum = stoi(relation_data[2]);
1496 std::string syn = relation_data[3];
1497 std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?");
1498
1499 sqlite3_stmt* ppstmt;
1500 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1501 {
1502 db_error(ppdb, query);
1503 }
1504
1505 sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_STATIC);
1506 sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]);
1507
1508 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1509 {
1510 db_error(ppdb, query);
1511 }
1512
1513 sqlite3_finalize(ppstmt);
1514 }
1515 }
1516
1517 sqlite3_close_v2(ppdb);
1518
1519 std::cout << "Done." << std::endl;
1520}