summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r--generator/generator.cpp3145
1 files changed, 1151 insertions, 1994 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 6a16467..d88cb31 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -1,2320 +1,1477 @@
1#include <libxml/parser.h> 1#include "generator.h"
2#include <cassert>
3#include <stdexcept>
2#include <iostream> 4#include <iostream>
5#include <regex>
3#include <dirent.h> 6#include <dirent.h>
4#include <set>
5#include <map>
6#include <string>
7#include <vector>
8#include <fstream> 7#include <fstream>
9#include <sqlite3.h> 8#include "enums.h"
10#include <sstream>
11#include <regex>
12#include <list>
13#include <algorithm>
14#include <json.hpp>
15#include "progress.h" 9#include "progress.h"
10#include "selrestr.h"
11#include "role.h"
12#include "part.h"
13#include "field.h"
16#include "../lib/util.h" 14#include "../lib/util.h"
17 15
18using json = nlohmann::json; 16namespace verbly {
19 17 namespace generator {
20struct verb_t {
21 std::string infinitive;
22 std::string past_tense;
23 std::string past_participle;
24 std::string ing_form;
25 std::string s_form;
26 int id;
27};
28
29struct adjective_t {
30 std::string base;
31 std::string comparative;
32 std::string superlative;
33};
34
35struct noun_t {
36 std::string singular;
37 std::string plural;
38};
39
40struct selrestr_t {
41 enum class type_t {
42 singleton,
43 andlogic,
44 orlogic,
45 empty
46 };
47 type_t type;
48 std::string restriction;
49 bool pos;
50 std::list<selrestr_t> subordinates;
51};
52
53struct framepart_t {
54 enum class type_t {
55 np,
56 v,
57 pp,
58 adj,
59 adv,
60 lex
61 };
62 type_t type;
63 std::string role;
64 selrestr_t selrestrs;
65 std::set<std::string> preprestrs;
66 std::set<std::string> synrestrs;
67 std::list<std::string> choices;
68 std::string lexval;
69};
70
71struct group_t {
72 std::string id;
73 std::string parent;
74 std::set<std::string> members;
75 std::map<std::string, selrestr_t> roles;
76 std::list<std::list<framepart_t>> frames;
77};
78
79struct pronunciation_t {
80 std::string phonemes;
81 std::string prerhyme;
82 std::string rhyme;
83 int syllables = 0;
84 std::string stress;
85
86 bool operator<(const pronunciation_t& other) const
87 {
88 return phonemes < other.phonemes;
89 }
90};
91
92std::map<std::string, group_t> groups;
93std::map<std::string, verb_t> verbs;
94std::map<std::string, adjective_t> adjectives;
95std::map<std::string, noun_t> nouns;
96std::map<int, std::map<int, int>> wn;
97std::map<int, int> images;
98std::map<std::string, std::set<pronunciation_t>> pronunciations;
99
100void print_usage()
101{
102 std::cout << "Verbly Datafile Generator" << std::endl;
103 std::cout << "-------------------------" << std::endl;
104 std::cout << "Requires exactly six arguments." << std::endl;
105 std::cout << "1. The path to a VerbNet data directory." << std::endl;
106 std::cout << "2. The path to an AGID infl.txt file." << std::endl;
107 std::cout << "3. The path to a WordNet prolog data directory." << std::endl;
108 std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl;
109 std::cout << "5. The path to an ImageNet urls.txt file." << std::endl;
110 std::cout << "6. Datafile output path." << std::endl;
111
112 exit(1);
113}
114
115void db_error(sqlite3* ppdb, std::string query)
116{
117 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
118 std::cout << query << std::endl;
119 sqlite3_close_v2(ppdb);
120 print_usage();
121}
122
123json export_selrestrs(selrestr_t r)
124{
125 if (r.type == selrestr_t::type_t::empty)
126 {
127 return {};
128 } else if (r.type == selrestr_t::type_t::singleton)
129 {
130 json result;
131 result["type"] = r.restriction;
132 result["pos"] = r.pos;
133 return result;
134 } else {
135 json result;
136 if (r.type == selrestr_t::type_t::andlogic)
137 {
138 result["logic"] = "and";
139 } else {
140 result["logic"] = "or";
141 }
142
143 std::list<json> outlist;
144 std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs);
145 result["children"] = outlist;
146 18
147 return result; 19 generator::generator(
148 } 20 std::string verbNetPath,
149} 21 std::string agidPath,
150 22 std::string wordNetPath,
151selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) 23 std::string cmudictPath,
152{ 24 std::string imageNetPath,
153 selrestr_t r; 25 std::string outputPath) :
154 xmlChar* key; 26 verbNetPath_(verbNetPath),
155 27 agidPath_(agidPath),
156 if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) 28 wordNetPath_(wordNetPath),
157 { 29 cmudictPath_(cmudictPath),
158 if (xmlChildElementCount(top) == 0) 30 imageNetPath_(imageNetPath),
31 db_(outputPath)
159 { 32 {
160 r.type = selrestr_t::type_t::empty; 33 // Ensure VerbNet directory exists
161 } else if (xmlChildElementCount(top) == 1) 34 DIR* dir;
162 { 35 if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
163 r = parse_selrestrs(xmlFirstElementChild(top), filename);
164 } else {
165 r.type = selrestr_t::type_t::andlogic;
166
167 if (xmlHasProp(top, (const xmlChar*) "logic"))
168 { 36 {
169 key = xmlGetProp(top, (const xmlChar*) "logic"); 37 throw std::invalid_argument("Invalid VerbNet data directory");
170 if (!xmlStrcmp(key, (const xmlChar*) "or"))
171 {
172 r.type = selrestr_t::type_t::orlogic;
173 }
174 xmlFree(key);
175 } 38 }
176 39
177 for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) 40 closedir(dir);
41
42 // Ensure AGID infl.txt exists
43 if (!std::ifstream(agidPath_))
178 { 44 {
179 if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) 45 throw std::invalid_argument("AGID infl.txt file not found");
180 {
181 r.subordinates.push_back(parse_selrestrs(selrestr, filename));
182 }
183 } 46 }
184 } 47
185 } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) 48 // Add directory separator to WordNet path
186 { 49 if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\'))
187 r.type = selrestr_t::type_t::singleton;
188
189 key = xmlGetProp(top, (xmlChar*) "Value");
190 r.pos = (std::string((const char*)key) == "+");
191 xmlFree(key);
192
193 key = xmlGetProp(top, (xmlChar*) "type");
194 r.restriction = (const char*) key;
195 xmlFree(key);
196 } else {
197 // Invalid
198 std::cout << "Bad VerbNet file format: " << filename << std::endl;
199 print_usage();
200 }
201
202 return r;
203}
204
205group_t& parse_group(xmlNodePtr top, std::string filename)
206{
207 xmlChar* key = xmlGetProp(top, (xmlChar*) "ID");
208 if (key == 0)
209 {
210 std::cout << "Bad VerbNet file format: " << filename << std::endl;
211 print_usage();
212 }
213 std::string vnid = (const char*)key;
214 vnid = vnid.substr(vnid.find_first_of("-")+1);
215 xmlFree(key);
216
217 group_t g;
218 g.id = vnid;
219
220 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
221 {
222 if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES"))
223 {
224 for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next)
225 { 50 {
226 if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) 51 wordNetPath_ += '/';
227 {
228 auto& sg = parse_group(subclass, filename);
229 sg.parent = vnid;
230
231 for (auto member : sg.members)
232 {
233 g.members.insert(member);
234 }
235
236 // The schema requires that subclasses appear after role definitions, so we can do this now
237 for (auto role : g.roles)
238 {
239 if (sg.roles.count(role.first) == 0)
240 {
241 sg.roles[role.first] = role.second;
242 }
243 }
244 }
245 } 52 }
246 } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) 53
247 { 54 // Ensure WordNet tables exist
248 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) 55 for (std::string table : {
56 "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax"
57 })
249 { 58 {
250 if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) 59 if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl"))
251 { 60 {
252 key = xmlGetProp(member, (xmlChar*) "name"); 61 throw std::invalid_argument("WordNet " + table + " table not found");
253 g.members.insert((const char*)key);
254 xmlFree(key);
255 } 62 }
256 } 63 }
257 } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) 64
258 { 65 // Ensure CMUDICT file exists
259 for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) 66 if (!std::ifstream(cmudictPath_))
260 { 67 {
261 if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) 68 throw std::invalid_argument("CMUDICT file not found");
262 {
263 selrestr_t r;
264 r.type = selrestr_t::type_t::empty;
265
266 key = xmlGetProp(role, (const xmlChar*) "type");
267 std::string type = (const char*)key;
268 xmlFree(key);
269
270 for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
271 {
272 if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS"))
273 {
274 r = parse_selrestrs(rolenode, filename);
275 }
276 }
277
278 g.roles[type] = r;
279 }
280 } 69 }
281 } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) 70
282 { 71 // Ensure ImageNet urls.txt exists
283 for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) 72 if (!std::ifstream(imageNetPath_))
284 { 73 {
285 if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) 74 throw std::invalid_argument("ImageNet urls.txt file not found");
286 {
287 std::list<framepart_t> f;
288
289 for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
290 {
291 if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX"))
292 {
293 for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
294 {
295 framepart_t fp;
296
297 if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP"))
298 {
299 fp.type = framepart_t::type_t::np;
300
301 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
302 fp.role = (const char*)key;
303 xmlFree(key);
304
305 fp.selrestrs.type = selrestr_t::type_t::empty;
306
307 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
308 {
309 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS"))
310 {
311 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
312 {
313 if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR"))
314 {
315 key = xmlGetProp(synrestr, (xmlChar*) "type");
316 fp.synrestrs.insert(std::string((const char*)key));
317 xmlFree(key);
318 }
319 }
320 }
321
322 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS"))
323 {
324 fp.selrestrs = parse_selrestrs(npnode, filename);
325 }
326 }
327 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB"))
328 {
329 fp.type = framepart_t::type_t::v;
330 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP"))
331 {
332 fp.type = framepart_t::type_t::pp;
333
334 if (xmlHasProp(syntaxnode, (xmlChar*) "value"))
335 {
336 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
337 std::string choices = (const char*)key;
338 xmlFree(key);
339
340 fp.choices = verbly::split<std::list<std::string>>(choices, " ");
341 }
342
343 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
344 {
345 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS"))
346 {
347 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
348 {
349 if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR"))
350 {
351 key = xmlGetProp(synrestr, (xmlChar*) "type");
352 fp.preprestrs.insert(std::string((const char*)key));
353 xmlFree(key);
354 }
355 }
356 }
357 }
358 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ"))
359 {
360 fp.type = framepart_t::type_t::adj;
361 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV"))
362 {
363 fp.type = framepart_t::type_t::adv;
364 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX"))
365 {
366 fp.type = framepart_t::type_t::lex;
367
368 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
369 fp.lexval = (const char*)key;
370 xmlFree(key);
371 } else {
372 continue;
373 }
374
375 f.push_back(fp);
376 }
377
378 g.frames.push_back(f);
379 }
380 }
381 }
382 } 75 }
383 } 76 }
384 }
385
386 groups[vnid] = g;
387
388 return groups[vnid];
389}
390
391int main(int argc, char** argv)
392{
393 if (argc != 7)
394 {
395 print_usage();
396 }
397
398 // VerbNet data
399 std::cout << "Reading verb frames..." << std::endl;
400
401 DIR* dir;
402 if ((dir = opendir(argv[1])) == nullptr)
403 {
404 std::cout << "Invalid VerbNet data directory." << std::endl;
405
406 print_usage();
407 }
408
409 struct dirent* ent;
410 while ((ent = readdir(dir)) != nullptr)
411 {
412 std::string filename(argv[1]);
413 if (filename.back() != '/')
414 {
415 filename += '/';
416 }
417 77
418 filename += ent->d_name; 78 void generator::run()
419 //std::cout << ent->d_name << std::endl;
420
421 if (filename.rfind(".xml") != filename.size() - 4)
422 {
423 continue;
424 }
425
426 xmlDocPtr doc = xmlParseFile(filename.c_str());
427 if (doc == nullptr)
428 {
429 std::cout << "Error opening " << filename << std::endl;
430 print_usage();
431 }
432
433 xmlNodePtr top = xmlDocGetRootElement(doc);
434 if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS")))
435 {
436 std::cout << "Bad VerbNet file format: " << filename << std::endl;
437 print_usage();
438 }
439
440 parse_group(top, filename);
441 }
442
443 closedir(dir);
444
445 // Get verbs from AGID
446 std::cout << "Reading inflections..." << std::endl;
447
448 std::ifstream agidfile(argv[2]);
449 if (!agidfile.is_open())
450 {
451 std::cout << "Could not open AGID file: " << argv[2] << std::endl;
452 print_usage();
453 }
454
455 for (;;)
456 {
457 std::string line;
458 if (!getline(agidfile, line))
459 {
460 break;
461 }
462
463 if (line.back() == '\r')
464 { 79 {
465 line.pop_back(); 80 // Create notions, words, lemmas, and forms from WordNet synsets
466 } 81 readWordNetSynsets();
467 82
468 int divider = line.find_first_of(" "); 83 // Reads adjective positioning WordNet data
469 std::string word = line.substr(0, divider); 84 readAdjectivePositioning();
470 line = line.substr(divider+1); 85
471 char type = line[0]; 86 // Counts the number of URLs ImageNet has per notion
472 87 readImageNetUrls();
473 if (line[1] == '?') 88
474 { 89 // Creates a word by WordNet sense key lookup table
475 line.erase(0, 4); 90 readWordNetSenseKeys();
476 } else { 91
477 line.erase(0, 3); 92 // Creates groups and frames from VerbNet data
478 } 93 readVerbNet();
479 94
480 std::vector<std::string> forms; 95 // Creates forms and inflections from AGID. To reduce the amount of forms
481 while (!line.empty()) 96 // created, we do this after most lemmas that need inflecting have been
482 { 97 // created through other means, and then only generate forms for
483 std::string inflection; 98 // inflections of already-existing lemmas. The exception to this regards
484 if ((divider = line.find(" | ")) != std::string::npos) 99 // verb lemmas. If a verb lemma in AGID either does not exist yet, or does
485 { 100 // exist but is not related to any words that are related to verb notions,
486 inflection = line.substr(0, divider); 101 // then a notion and a word is generated and the form generation proceeds
487 line = line.substr(divider + 3); 102 // as usual.
488 } else { 103 readAgidInflections();
489 inflection = line; 104
490 line = ""; 105 // Reads in prepositions and the is_a relationship
491 } 106 readPrepositions();
492 107
493 if ((divider = inflection.find_first_of(",?")) != std::string::npos) 108 // Creates pronunciations from CMUDICT. To reduce the amount of
494 { 109 // pronunciations created, we do this after all forms have been created,
495 inflection = inflection.substr(0, divider); 110 // and then only generate pronunciations for already-exisiting forms.
496 } 111 readCmudictPronunciations();
497 112
498 forms.push_back(inflection); 113 // Writes the database schema
114 writeSchema();
115
116 // Dumps data to the database
117 dumpObjects();
118
119 // Populates the antonymy relationship from WordNet
120 readWordNetAntonymy();
121
122 // Populates the variation relationship from WordNet
123 readWordNetVariation();
124
125 // Populates the usage, topicality, and regionality relationships from
126 // WordNet
127 readWordNetClasses();
128
129 // Populates the causality relationship from WordNet
130 readWordNetCausality();
131
132 // Populates the entailment relationship from WordNet
133 readWordNetEntailment();
134
135 // Populates the hypernymy relationship from WordNet
136 readWordNetHypernymy();
137
138 // Populates the instantiation relationship from WordNet
139 readWordNetInstantiation();
140
141 // Populates the member meronymy relationship from WordNet
142 readWordNetMemberMeronymy();
143
144 // Populates the part meronymy relationship from WordNet
145 readWordNetPartMeronymy();
146
147 // Populates the substance meronymy relationship from WordNet
148 readWordNetSubstanceMeronymy();
149
150 // Populates the pertainymy and mannernymy relationships from WordNet
151 readWordNetPertainymy();
152
153 // Populates the specification relationship from WordNet
154 readWordNetSpecification();
155
156 // Populates the adjective similarity relationship from WordNet
157 readWordNetSimilarity();
158
159
160
161
162
163
164
165
499 } 166 }
500 167
501 switch (type) 168 void generator::readWordNetSynsets()
502 { 169 {
503 case 'V': 170 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
171 progress ppgs("Reading synsets from WordNet...", lines.size());
172
173 for (std::string line : lines)
504 { 174 {
505 verb_t v; 175 ppgs.update();
506 v.infinitive = word; 176
507 if (forms.size() == 4) 177 std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");
508 { 178 std::smatch relation_data;
509 v.past_tense = forms[0]; 179 if (!std::regex_search(line, relation_data, relation))
510 v.past_participle = forms[1]; 180 {
511 v.ing_form = forms[2]; 181 continue;
512 v.s_form = forms[3];
513 } else if (forms.size() == 3)
514 {
515 v.past_tense = forms[0];
516 v.past_participle = forms[0];
517 v.ing_form = forms[1];
518 v.s_form = forms[2];
519 } else if (forms.size() == 8)
520 {
521 // As of AGID 2014.08.11, this is only "to be"
522 v.past_tense = forms[0];
523 v.past_participle = forms[2];
524 v.ing_form = forms[3];
525 v.s_form = forms[4];
526 } else {
527 // Words that don't fit the cases above as of AGID 2014.08.11:
528 // - may and shall do not conjugate the way we want them to
529 // - methinks only has a past tense and is an outlier
530 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
531 std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl;
532 } 182 }
533 183
534 verbs[word] = v; 184 int synset_id = std::stoi(relation_data[1]);
535 185 int wnum = std::stoi(relation_data[2]);
536 break; 186 std::string text = relation_data[3];
537 } 187 int tag_count = std::stoi(relation_data[4]);
538 188 size_t word_it;
539 case 'A': 189 while ((word_it = text.find("''")) != std::string::npos)
540 {
541 adjective_t adj;
542 adj.base = word;
543 if (forms.size() == 2)
544 { 190 {
545 adj.comparative = forms[0]; 191 text.erase(word_it, 1);
546 adj.superlative = forms[1];
547 } else {
548 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
549 std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl;
550 } 192 }
551 193
552 adjectives[word] = adj; 194 // The WordNet data does contain duplicates, so we need to check that we
553 195 // haven't already created this word.
554 break; 196 std::pair<int, int> lookup(synset_id, wnum);
555 } 197 if (!wordByWnidAndWnum_.count(lookup))
556
557 case 'N':
558 {
559 noun_t n;
560 n.singular = word;
561 if (forms.size() == 1)
562 { 198 {
563 n.plural = forms[0]; 199 notion& synset = lookupOrCreateNotion(synset_id);
564 } else { 200 lemma& lex = lookupOrCreateLemma(text);
565 // As of AGID 2014.08.11, this is non-existent. 201 word& entry = createWord(synset, lex, tag_count);
566 std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; 202
203 wordByWnidAndWnum_[lookup] = &entry;
567 } 204 }
568
569 nouns[word] = n;
570
571 break;
572 } 205 }
573 } 206 }
574 }
575
576 // Pronounciations
577 std::cout << "Reading pronunciations..." << std::endl;
578
579 std::ifstream pronfile(argv[4]);
580 if (!pronfile.is_open())
581 {
582 std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl;
583 print_usage();
584 }
585
586 for (;;)
587 {
588 std::string line;
589 if (!getline(pronfile, line))
590 {
591 break;
592 }
593
594 if (line.back() == '\r')
595 {
596 line.pop_back();
597 }
598 207
599 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); 208 void generator::readAdjectivePositioning()
600 std::smatch phoneme_data;
601 if (std::regex_search(line, phoneme_data, phoneme))
602 { 209 {
603 std::string canonical(phoneme_data[1]); 210 std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl"));
604 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); 211 progress ppgs("Reading adjective positionings from WordNet...", lines.size());
605
606 std::string phonemes = phoneme_data[2];
607 auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " ");
608 auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) {
609 return phoneme.find("1") != std::string::npos;
610 });
611 212
612 pronunciation_t p; 213 for (std::string line : lines)
613 p.phonemes = phonemes;
614
615 // Rhyme detection
616 if (phemstrt != std::end(phoneme_set))
617 { 214 {
618 std::stringstream rhymer; 215 ppgs.update();
619 for (auto it = phemstrt; it != std::end(phoneme_set); it++)
620 {
621 std::string naked;
622 std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) {
623 return isdigit(ch);
624 });
625
626 if (it != phemstrt)
627 {
628 rhymer << " ";
629 }
630
631 rhymer << naked;
632 }
633 216
634 p.rhyme = rhymer.str(); 217 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\.");
635 218 std::smatch relation_data;
636 if (phemstrt != std::begin(phoneme_set)) 219 if (!std::regex_search(line, relation_data, relation))
637 { 220 {
638 phemstrt--; 221 continue;
639 p.prerhyme = *phemstrt;
640 } else {
641 p.prerhyme = "";
642 } 222 }
643 } else {
644 p.prerhyme = "";
645 p.rhyme = "";
646 }
647 223
648 // Syllable/stress 224 int synset_id = stoi(relation_data[1]);
649 for (auto phm : phoneme_set) 225 int wnum = stoi(relation_data[2]);
650 { 226 std::string adjpos_str = relation_data[3];
651 if (isdigit(phm.back()))
652 {
653 // It's a vowel!
654 p.syllables++;
655 227
656 if (phm.back() == '1') 228 std::pair<int, int> lookup(synset_id, wnum);
229 if (wordByWnidAndWnum_.count(lookup))
230 {
231 word& adj = *wordByWnidAndWnum_.at(lookup);
232
233 if (adjpos_str == "p")
234 {
235 adj.setAdjectivePosition(positioning::predicate);
236 } else if (adjpos_str == "a")
237 {
238 adj.setAdjectivePosition(positioning::attributive);
239 } else if (adjpos_str == "i")
657 { 240 {
658 p.stress.push_back('1'); 241 adj.setAdjectivePosition(positioning::postnominal);
659 } else { 242 } else {
660 p.stress.push_back('0'); 243 // Can't happen because of how we specified the regex.
244 assert(false);
661 } 245 }
662 } 246 }
663 } 247 }
664
665 pronunciations[canonical].insert(p);
666 }
667 }
668
669 // Images
670 std::cout << "Reading images..." << std::endl;
671
672 std::ifstream imagefile(argv[5]);
673 if (!imagefile.is_open())
674 {
675 std::cout << "Could not open ImageNet file: " << argv[5] << std::endl;
676 print_usage();
677 }
678
679 for (;;)
680 {
681 std::string line;
682 if (!getline(imagefile, line))
683 {
684 break;
685 }
686
687 if (line.back() == '\r')
688 {
689 line.pop_back();
690 }
691
692 std::string wnid_s = line.substr(1, 8);
693 int wnid = stoi(wnid_s) + 100000000;
694 images[wnid]++;
695 }
696
697 imagefile.close();
698
699 // Start writing output
700 std::cout << "Writing schema..." << std::endl;
701
702 sqlite3* ppdb;
703 if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK)
704 {
705 std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl;
706 print_usage();
707 }
708
709 std::ifstream schemafile("schema.sql");
710 if (!schemafile.is_open())
711 {
712 std::cout << "Could not find schema file" << std::endl;
713 print_usage();
714 }
715
716 std::stringstream schemabuilder;
717 for (;;)
718 {
719 std::string line;
720 if (!getline(schemafile, line))
721 {
722 break;
723 }
724
725 if (line.back() == '\r')
726 {
727 line.pop_back();
728 }
729
730 schemabuilder << line << std::endl;
731 }
732
733 std::string schema = schemabuilder.str();
734 while (!schema.empty())
735 {
736 std::string query;
737 int divider = schema.find(";");
738 if (divider != std::string::npos)
739 {
740 query = schema.substr(0, divider+1);
741 schema = schema.substr(divider+2);
742 } else {
743 break;
744 } 248 }
745 249
746 sqlite3_stmt* schmstmt; 250 void generator::readImageNetUrls()
747 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK)
748 { 251 {
749 db_error(ppdb, query); 252 // The ImageNet datafile is so large that it is unreasonable and
750 } 253 // unnecessary to read it into memory; instead, we will parse each line as
751 254 // we read it. This has the caveat that we cannot display a progress bar.
752 if (sqlite3_step(schmstmt) != SQLITE_DONE) 255 std::cout << "Reading image counts from ImageNet..." << std::endl;
753 {
754 db_error(ppdb, query);
755 }
756
757 sqlite3_finalize(schmstmt);
758 }
759
760 std::cout << "Writing prepositions..." << std::endl;
761 std::ifstream prepfile("prepositions.txt");
762 if (!prepfile.is_open())
763 {
764 std::cout << "Could not find prepositions file" << std::endl;
765 print_usage();
766 }
767
768 for (;;)
769 {
770 std::string line;
771 if (!getline(prepfile, line))
772 {
773 break;
774 }
775
776 if (line.back() == '\r')
777 {
778 line.pop_back();
779 }
780
781 std::regex relation("^([^:]+): (.+)");
782 std::smatch relation_data;
783 std::regex_search(line, relation_data, relation);
784 std::string prep = relation_data[1];
785 std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", ");
786
787 std::string query("INSERT INTO prepositions (form) VALUES (?)");
788 sqlite3_stmt* ppstmt;
789
790 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
791 {
792 db_error(ppdb, query);
793 }
794
795 sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT);
796
797 if (sqlite3_step(ppstmt) != SQLITE_DONE)
798 {
799 db_error(ppdb, query);
800 }
801
802 sqlite3_finalize(ppstmt);
803
804 query = "SELECT last_insert_rowid()";
805 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
806 {
807 db_error(ppdb, query);
808 }
809
810 if (sqlite3_step(ppstmt) != SQLITE_ROW)
811 {
812 db_error(ppdb, query);
813 }
814
815 int rowid = sqlite3_column_int(ppstmt, 0);
816 sqlite3_finalize(ppstmt);
817
818 for (auto group : groups)
819 {
820 query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)";
821 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
822 {
823 db_error(ppdb, query);
824 }
825 256
826 sqlite3_bind_int(ppstmt, 1, rowid); 257 std::ifstream file(imageNetPath_);
827 sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); 258 if (!file)
828
829 if (sqlite3_step(ppstmt) != SQLITE_DONE)
830 { 259 {
831 db_error(ppdb, query); 260 throw std::invalid_argument("Could not find file " + imageNetPath_);
832 } 261 }
833
834 sqlite3_finalize(ppstmt);
835 }
836 }
837
838 262
839 { 263 std::string line;
840 progress ppgs("Writing verbs...", verbs.size()); 264 while (std::getline(file, line))
841 for (auto& mapping : verbs)
842 {
843 sqlite3_stmt* ppstmt;
844 std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)");
845 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
846 {
847 db_error(ppdb, query);
848 }
849
850 sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT);
851 sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT);
852 sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT);
853 sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT);
854 sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT);
855
856 if (sqlite3_step(ppstmt) != SQLITE_DONE)
857 {
858 db_error(ppdb, query);
859 }
860
861 sqlite3_finalize(ppstmt);
862
863 std::string canonical(mapping.second.infinitive);
864 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
865 if (pronunciations.count(canonical) == 1)
866 { 265 {
867 query = "SELECT last_insert_rowid()"; 266 if (line.back() == '\r')
868 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
869 { 267 {
870 db_error(ppdb, query); 268 line.pop_back();
871 } 269 }
872 270
873 if (sqlite3_step(ppstmt) != SQLITE_ROW) 271 std::string wnid_s = line.substr(1, 8);
272 int wnid = stoi(wnid_s) + 100000000;
273 if (notionByWnid_.count(wnid))
874 { 274 {
875 db_error(ppdb, query); 275 // We know that this notion has a wnid and is a noun.
876 } 276 notionByWnid_.at(wnid)->incrementNumOfImages();
877
878 int rowid = sqlite3_column_int(ppstmt, 0);
879
880 sqlite3_finalize(ppstmt);
881
882 mapping.second.id = rowid;
883
884 for (auto pronunciation : pronunciations[canonical])
885 {
886 if (!pronunciation.rhyme.empty())
887 {
888 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)";
889 } else {
890 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)";
891 }
892
893 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
894 {
895 db_error(ppdb, query);
896 }
897
898 sqlite3_bind_int(ppstmt, 1, rowid);
899 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
900 sqlite3_bind_int(ppstmt, 3, pronunciation.syllables);
901 sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT);
902
903 if (!pronunciation.rhyme.empty())
904 {
905 sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
906 sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
907 }
908
909 if (sqlite3_step(ppstmt) != SQLITE_DONE)
910 {
911 db_error(ppdb, query);
912 }
913
914 sqlite3_finalize(ppstmt);
915 } 277 }
916 } 278 }
917
918 ppgs.update();
919 } 279 }
920 } 280
921 281 void generator::readWordNetSenseKeys()
922 {
923 progress ppgs("Writing verb frames...", groups.size());
924 for (auto& mapping : groups)
925 { 282 {
926 std::list<json> roledatal; 283 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl"));
927 std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { 284 progress ppgs("Reading sense keys from WordNet...", lines.size());
928 json role;
929 role["type"] = r.first;
930 role["selrestrs"] = export_selrestrs(r.second);
931
932 return role;
933 });
934
935 json roledata(roledatal);
936 std::string rdm = roledata.dump();
937
938 sqlite3_stmt* ppstmt;
939 std::string query("INSERT INTO groups (data) VALUES (?)");
940 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
941 {
942 db_error(ppdb, query);
943 }
944
945 sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT);
946
947 if (sqlite3_step(ppstmt) != SQLITE_DONE)
948 {
949 db_error(ppdb, query);
950 }
951 285
952 sqlite3_finalize(ppstmt); 286 for (std::string line : lines)
953
954 query = "SELECT last_insert_rowid()";
955 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
956 {
957 db_error(ppdb, query);
958 }
959
960 if (sqlite3_step(ppstmt) != SQLITE_ROW)
961 {
962 db_error(ppdb, query);
963 }
964
965 int gid = sqlite3_column_int(ppstmt, 0);
966 sqlite3_finalize(ppstmt);
967
968 for (auto frame : mapping.second.frames)
969 { 287 {
970 std::list<json> fdatap; 288 ppgs.update();
971 std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) {
972 json part;
973
974 switch (fp.type)
975 {
976 case framepart_t::type_t::np:
977 {
978 part["type"] = "np";
979 part["role"] = fp.role;
980 part["selrestrs"] = export_selrestrs(fp.selrestrs);
981 part["synrestrs"] = fp.synrestrs;
982
983 break;
984 }
985
986 case framepart_t::type_t::pp:
987 {
988 part["type"] = "pp";
989 part["values"] = fp.choices;
990 part["preprestrs"] = fp.preprestrs;
991
992 break;
993 }
994
995 case framepart_t::type_t::v:
996 {
997 part["type"] = "v";
998
999 break;
1000 }
1001
1002 case framepart_t::type_t::adj:
1003 {
1004 part["type"] = "adj";
1005
1006 break;
1007 }
1008
1009 case framepart_t::type_t::adv:
1010 {
1011 part["type"] = "adv";
1012
1013 break;
1014 }
1015
1016 case framepart_t::type_t::lex:
1017 {
1018 part["type"] = "lex";
1019 part["value"] = fp.lexval;
1020
1021 break;
1022 }
1023 }
1024
1025 return part;
1026 });
1027
1028 json fdata(fdatap);
1029 std::string marshall = fdata.dump();
1030
1031 query = "INSERT INTO frames (group_id, data) VALUES (?, ?)";
1032 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1033 {
1034 db_error(ppdb, query);
1035 }
1036
1037 sqlite3_bind_int(ppstmt, 1, gid);
1038 sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT);
1039 289
1040 if (sqlite3_step(ppstmt) != SQLITE_DONE) 290 // We only actually need to lookup verbs by sense key so we'll just
291 // ignore everything that isn't a verb.
292 std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$");
293 std::smatch relation_data;
294 if (!std::regex_search(line, relation_data, relation))
1041 { 295 {
1042 db_error(ppdb, query); 296 continue;
1043 } 297 }
298
299 int synset_id = stoi(relation_data[1]);
300 int wnum = stoi(relation_data[2]);
301 std::string sense_key = relation_data[3];
1044 302
1045 sqlite3_finalize(ppstmt); 303 // We are treating this mapping as injective, which is not entirely
1046 } 304 // accurate. First, the WordNet table contains duplicate rows, so those
1047 305 // need to be ignored. More importantly, a small number of sense keys
1048 for (auto member : mapping.second.members) 306 // (one for each letter of the Latin alphabet, plus 9 other words) each
1049 { 307 // map to two different words in the same synset which differ only by
1050 if (verbs.count(member) == 1) 308 // capitalization. Luckily, none of these exceptions are verbs, so we
309 // can pretend that the mapping is injective.
310 if (!wnSenseKeys_.count(sense_key))
1051 { 311 {
1052 auto& v = verbs[member]; 312 std::pair<int, int> lookup(synset_id, wnum);
1053 313 if (wordByWnidAndWnum_.count(lookup))
1054 query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)";
1055 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1056 {
1057 db_error(ppdb, query);
1058 }
1059
1060 sqlite3_bind_int(ppstmt, 1, v.id);
1061 sqlite3_bind_int(ppstmt, 2, gid);
1062
1063 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1064 { 314 {
1065 db_error(ppdb, query); 315 wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup);
1066 } 316 }
1067
1068 sqlite3_finalize(ppstmt);
1069 } 317 }
1070 } 318 }
1071
1072 ppgs.update();
1073 } 319 }
1074 } 320
1075 321 void generator::readVerbNet()
1076 // Get nouns/adjectives/adverbs from WordNet
1077 // Useful relations:
1078 // - s: master list
1079 // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness)
1080 // - at: variation (e.g. a measurement can be standard or nonstandard)
1081 // - der: derivation (e.g. happy/happily, happily/happy)
1082 // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue)
1083 // - ins: instantiation (do we need this? let's see)
1084 // - mm: member meronymy/holonymy (e.g. family/mother, family/child)
1085 // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire)
1086 // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber)
1087 // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska)
1088 // mannernymy (e.g. something done quickly is done in a manner that is quick)
1089 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
1090 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
1091 // - syntax: positioning flags for some adjectives
1092 std::string wnpref {argv[3]};
1093 if (wnpref.back() != '/')
1094 {
1095 wnpref += '/';
1096 }
1097
1098 // s table
1099 {
1100 std::ifstream wnsfile(wnpref + "wn_s.pl");
1101 if (!wnsfile.is_open())
1102 { 322 {
1103 std::cout << "Invalid WordNet data directory." << std::endl; 323 std::cout << "Reading frames from VerbNet..." << std::endl;
1104 print_usage();
1105 }
1106 324
1107 std::list<std::string> lines; 325 DIR* dir;
1108 for (;;) 326 if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
1109 {
1110 std::string line;
1111 if (!getline(wnsfile, line))
1112 { 327 {
1113 break; 328 throw std::invalid_argument("Invalid VerbNet data directory");
1114 } 329 }
1115 330
1116 if (line.back() == '\r') 331 struct dirent* ent;
1117 { 332 while ((ent = readdir(dir)) != nullptr)
1118 line.pop_back();
1119 }
1120
1121 lines.push_back(line);
1122 }
1123
1124 progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size());
1125 for (auto line : lines)
1126 {
1127 ppgs.update();
1128
1129 std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$");
1130 std::smatch relation_data;
1131 if (!std::regex_search(line, relation_data, relation))
1132 { 333 {
1133 continue; 334 std::string filename(verbNetPath_);
1134 } 335
336 if (filename.back() != '/')
337 {
338 filename += '/';
339 }
1135 340
1136 int synset_id = stoi(relation_data[1]); 341 filename += ent->d_name;
1137 int wnum = stoi(relation_data[2]);
1138 std::string word = relation_data[3];
1139 size_t word_it;
1140 while ((word_it = word.find("''")) != std::string::npos)
1141 {
1142 word.erase(word_it, 1);
1143 }
1144 342
1145 std::string query; 343 if (filename.rfind(".xml") != filename.size() - 4)
1146 switch (synset_id / 100000000)
1147 {
1148 case 1: // Noun
1149 { 344 {
1150 if (nouns.count(word) == 1) 345 continue;
1151 {
1152 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)";
1153 } else {
1154 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)";
1155 }
1156
1157 break;
1158 } 346 }
1159 347
1160 case 2: // Verb 348 xmlDocPtr doc = xmlParseFile(filename.c_str());
349 if (doc == nullptr)
1161 { 350 {
1162 // Ignore 351 throw std::logic_error("Error opening " + filename);
1163
1164 break;
1165 } 352 }
1166 353
1167 case 3: // Adjective 354 xmlNodePtr top = xmlDocGetRootElement(doc);
355 if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS"))))
1168 { 356 {
1169 if (adjectives.count(word) == 1) 357 throw std::logic_error("Bad VerbNet file format: " + filename);
1170 {
1171 query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)";
1172 } else {
1173 query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)";
1174 }
1175
1176 break;
1177 } 358 }
1178 359
1179 case 4: // Adverb 360 try
1180 { 361 {
1181 if (adjectives.count(word) == 1) 362 createGroup(top);
1182 { 363 } catch (const std::exception& e)
1183 query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; 364 {
1184 } else { 365 std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename));
1185 query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)";
1186 }
1187
1188 break;
1189 } 366 }
1190 } 367 }
368
369 closedir(dir);
370 }
1191 371
1192 sqlite3_stmt* ppstmt; 372 void generator::readAgidInflections()
1193 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) 373 {
374 std::list<std::string> lines(readFile(agidPath_));
375 progress ppgs("Reading inflections from AGID...", lines.size());
376
377 for (std::string line : lines)
1194 { 378 {
1195 db_error(ppdb, query); 379 ppgs.update();
1196 } 380
381 int divider = line.find_first_of(" ");
382 std::string infinitive = line.substr(0, divider);
383 line = line.substr(divider+1);
384 char type = line[0];
1197 385
1198 sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); 386 if (line[1] == '?')
1199 switch (synset_id / 100000000)
1200 {
1201 case 1: // Noun
1202 { 387 {
1203 sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { 388 line.erase(0, 4);
1204 return isupper(ch); 389 } else {
1205 }) ? 1 : 0)); 390 line.erase(0, 3);
1206
1207 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size());
1208 sqlite3_bind_int(ppstmt, 4, images[synset_id]);
1209 sqlite3_bind_int(ppstmt, 5, synset_id);
1210
1211 if (nouns.count(word) == 1)
1212 {
1213 sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT);
1214 }
1215
1216 break;
1217 } 391 }
1218 392
1219 case 3: // Adjective 393 if (!lemmaByBaseForm_.count(infinitive) && (type != 'V'))
1220 case 4: // Adverb
1221 { 394 {
1222 sqlite3_bind_int(ppstmt, 2, verbly::split<std::list<std::string>>(word, " ").size()); 395 continue;
1223 396 }
1224 if (adjectives.count(word) == 1) 397
398 lemma& curLemma = lookupOrCreateLemma(infinitive);
399
400 auto forms = split<std::vector<std::string>>(line, " | ");
401 for (std::string& inflForm : forms)
402 {
403 int sympos = inflForm.find_first_of(",?");
404 if (sympos != std::string::npos)
1225 { 405 {
1226 sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); 406 inflForm = inflForm.substr(0, sympos);
1227 sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT);
1228 } 407 }
1229
1230 break;
1231 } 408 }
1232 }
1233 409
1234 if (sqlite3_step(ppstmt) != SQLITE_DONE) 410 switch (type)
1235 {
1236 db_error(ppdb, query);
1237 }
1238
1239 sqlite3_finalize(ppstmt);
1240
1241 query = "SELECT last_insert_rowid()";
1242 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1243 {
1244 db_error(ppdb, query);
1245 }
1246
1247 if (sqlite3_step(ppstmt) != SQLITE_ROW)
1248 {
1249 db_error(ppdb, query);
1250 }
1251
1252 int rowid = sqlite3_column_int(ppstmt, 0);
1253 wn[synset_id][wnum] = rowid;
1254
1255 sqlite3_finalize(ppstmt);
1256
1257 std::string canonical(word);
1258 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
1259 if (pronunciations.count(canonical) == 1)
1260 {
1261 for (auto pronunciation : pronunciations[canonical])
1262 { 411 {
1263 switch (synset_id / 100000000) 412 case 'V':
1264 { 413 {
1265 case 1: // Noun 414 if (forms.size() == 4)
1266 { 415 {
1267 if (!pronunciation.rhyme.empty()) 416 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
1268 { 417 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1]));
1269 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; 418 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2]));
1270 } else { 419 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3]));
1271 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; 420 } else if (forms.size() == 3)
1272 }
1273
1274 break;
1275 }
1276
1277 case 3: // Adjective
1278 { 421 {
1279 if (!pronunciation.rhyme.empty()) 422 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
1280 { 423 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0]));
1281 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; 424 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1]));
1282 } else { 425 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2]));
1283 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; 426 } else if (forms.size() == 8)
1284 } 427 {
1285 428 // As of AGID 2014.08.11, this is only "to be"
1286 break; 429 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
430 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2]));
431 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3]));
432 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4]));
433 } else {
434 // Words that don't fit the cases above as of AGID 2014.08.11:
435 // - may and shall do not conjugate the way we want them to
436 // - methinks only has a past tense and is an outlier
437 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
438 std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1287 } 439 }
1288 440
1289 case 4: // Adverb 441 // For verbs in particular, we sometimes create a notion and a word
442 // from inflection data. Specifically, if there are not yet any
443 // verbs existing that have the same infinitive form. "Yet" means
444 // that this verb appears in the AGID data but not in either WordNet
445 // or VerbNet.
446 if (!wordsByBaseForm_.count(infinitive)
447 || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) {
448 return w->getNotion().getPartOfSpeech() == part_of_speech::verb;
449 }))
1290 { 450 {
1291 if (!pronunciation.rhyme.empty()) 451 notion& n = createNotion(part_of_speech::verb);
1292 { 452 createWord(n, curLemma);
1293 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)";
1294 } else {
1295 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)";
1296 }
1297
1298 break;
1299 } 453 }
1300 }
1301
1302 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1303 {
1304 db_error(ppdb, query);
1305 }
1306
1307 sqlite3_bind_int(ppstmt, 1, rowid);
1308 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
1309 sqlite3_bind_int(ppstmt, 3, pronunciation.syllables);
1310 sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT);
1311
1312 if (!pronunciation.rhyme.empty())
1313 {
1314 sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
1315 sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
1316 }
1317 454
1318 if (sqlite3_step(ppstmt) != SQLITE_DONE) 455 break;
1319 {
1320 db_error(ppdb, query);
1321 } 456 }
1322
1323 sqlite3_finalize(ppstmt);
1324 }
1325 }
1326 }
1327 }
1328
1329 // While we're working on s
1330 {
1331 progress ppgs("Writing word synonyms...", wn.size());
1332 for (auto sense : wn)
1333 {
1334 ppgs.update();
1335 457
1336 for (auto word1 : sense.second) 458 case 'A':
1337 {
1338 for (auto word2 : sense.second)
1339 {
1340 if (word1 != word2)
1341 { 459 {
1342 std::string query; 460 if (forms.size() == 2)
1343 switch (sense.first / 100000000)
1344 { 461 {
1345 case 1: // Noun 462 curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0]));
1346 { 463 curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1]));
1347 query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; 464 } else {
1348 465 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
1349 break; 466 std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1350 } 467 }
1351
1352 case 2: // Verb
1353 {
1354 // Ignore
1355
1356 break;
1357 }
1358
1359 case 3: // Adjective
1360 {
1361 query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
1362 468
1363 break; 469 break;
1364 } 470 }
1365 471
1366 case 4: // Adverb 472 case 'N':
1367 { 473 {
1368 query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; 474 if (forms.size() == 1)
1369
1370 break;
1371 }
1372 }
1373
1374 sqlite3_stmt* ppstmt;
1375 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1376 {
1377 db_error(ppdb, query);
1378 }
1379
1380 sqlite3_bind_int(ppstmt, 1, word1.second);
1381 sqlite3_bind_int(ppstmt, 2, word2.second);
1382
1383 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1384 { 475 {
1385 db_error(ppdb, query); 476 curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0]));
477 } else {
478 // As of AGID 2014.08.11, this is non-existent.
479 std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1386 } 480 }
1387 481
1388 sqlite3_finalize(ppstmt); 482 break;
1389 } 483 }
1390 } 484 }
1391 } 485 }
1392 } 486 }
1393 }
1394
1395 // ant table
1396 {
1397 std::ifstream wnantfile(wnpref + "wn_ant.pl");
1398 if (!wnantfile.is_open())
1399 {
1400 std::cout << "Invalid WordNet data directory." << std::endl;
1401 print_usage();
1402 }
1403
1404 std::list<std::string> lines;
1405 for (;;)
1406 {
1407 std::string line;
1408 if (!getline(wnantfile, line))
1409 {
1410 break;
1411 }
1412 487
1413 if (line.back() == '\r') 488 void generator::readPrepositions()
1414 {
1415 line.pop_back();
1416 }
1417
1418 lines.push_back(line);
1419 }
1420
1421 progress ppgs("Writing antonyms...", lines.size());
1422 for (auto line : lines)
1423 { 489 {
1424 ppgs.update(); 490 std::list<std::string> lines(readFile("prepositions.txt"));
491 progress ppgs("Reading prepositions...", lines.size());
1425 492
1426 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); 493 for (std::string line : lines)
1427 std::smatch relation_data;
1428 if (!std::regex_search(line, relation_data, relation))
1429 {
1430 continue;
1431 }
1432
1433 int synset_id_1 = stoi(relation_data[1]);
1434 int wnum_1 = stoi(relation_data[2]);
1435 int synset_id_2 = stoi(relation_data[3]);
1436 int wnum_2 = stoi(relation_data[4]);
1437
1438 std::string query;
1439 switch (synset_id_1 / 100000000)
1440 { 494 {
1441 case 1: // Noun 495 ppgs.update();
1442 {
1443 query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)";
1444 496
1445 break; 497 std::regex relation("^([^:]+): (.+)");
1446 } 498 std::smatch relation_data;
1447 499 std::regex_search(line, relation_data, relation);
1448 case 2: // Verb 500 std::string prep = relation_data[1];
1449 { 501 auto groups = split<std::list<std::string>>(relation_data[2], ", ");
1450 // Ignore
1451 502
1452 break; 503 notion& n = createNotion(part_of_speech::preposition);
1453 } 504 lemma& l = lookupOrCreateLemma(prep);
1454 505 word& w = createWord(n, l);
1455 case 3: // Adjective
1456 {
1457 query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
1458 506
1459 break; 507 n.setPrepositionGroups(groups);
1460 }
1461
1462 case 4: // Adverb
1463 {
1464 query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)";
1465
1466 break;
1467 }
1468 }
1469
1470 sqlite3_stmt* ppstmt;
1471 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1472 {
1473 db_error(ppdb, query);
1474 }
1475
1476 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1477 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1478
1479 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1480 {
1481 db_error(ppdb, query);
1482 }
1483
1484 sqlite3_finalize(ppstmt);
1485 }
1486 }
1487
1488 // at table
1489 {
1490 std::ifstream wnatfile(wnpref + "wn_at.pl");
1491 if (!wnatfile.is_open())
1492 {
1493 std::cout << "Invalid WordNet data directory." << std::endl;
1494 print_usage();
1495 }
1496
1497 std::list<std::string> lines;
1498 for (;;)
1499 {
1500 std::string line;
1501 if (!getline(wnatfile, line))
1502 {
1503 break;
1504 } 508 }
1505
1506 if (line.back() == '\r')
1507 {
1508 line.pop_back();
1509 }
1510
1511 lines.push_back(line);
1512 } 509 }
1513 510
1514 progress ppgs("Writing variations...", lines.size()); 511 void generator::readCmudictPronunciations()
1515 for (auto line : lines)
1516 { 512 {
1517 ppgs.update(); 513 std::list<std::string> lines(readFile(cmudictPath_));
514 progress ppgs("Reading pronunciations from CMUDICT...", lines.size());
1518 515
1519 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); 516 for (std::string line : lines)
1520 std::smatch relation_data;
1521 if (!std::regex_search(line, relation_data, relation))
1522 { 517 {
1523 continue; 518 ppgs.update();
1524 } 519
1525 520 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)");
1526 int synset_id_1 = stoi(relation_data[1]); 521 std::smatch phoneme_data;
1527 int synset_id_2 = stoi(relation_data[2]); 522 if (std::regex_search(line, phoneme_data, phoneme))
1528 std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)");
1529
1530 for (auto mapping1 : wn[synset_id_1])
1531 {
1532 for (auto mapping2 : wn[synset_id_2])
1533 { 523 {
1534 sqlite3_stmt* ppstmt; 524 std::string canonical(phoneme_data[1]);
1535 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 525 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
1536 {
1537 db_error(ppdb, query);
1538 }
1539
1540 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1541 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1542 526
1543 if (sqlite3_step(ppstmt) != SQLITE_DONE) 527 if (!formByText_.count(canonical))
1544 { 528 {
1545 db_error(ppdb, query); 529 continue;
1546 } 530 }
1547 531
1548 sqlite3_finalize(ppstmt); 532 std::string phonemes = phoneme_data[2];
533 pronunciations_.emplace_back(phonemes);
534 pronunciation& p = pronunciations_.back();
535 formByText_.at(canonical)->addPronunciation(p);
1549 } 536 }
1550 } 537 }
1551 } 538 }
1552 }
1553
1554 // der table
1555 {
1556 std::ifstream wnderfile(wnpref + "wn_der.pl");
1557 if (!wnderfile.is_open())
1558 {
1559 std::cout << "Invalid WordNet data directory." << std::endl;
1560 print_usage();
1561 }
1562 539
1563 std::list<std::string> lines; 540 void generator::writeSchema()
1564 for (;;)
1565 { 541 {
1566 std::string line; 542 std::ifstream file("schema.sql");
1567 if (!getline(wnderfile, line)) 543 if (!file)
1568 { 544 {
1569 break; 545 throw std::invalid_argument("Could not find database schema");
1570 } 546 }
1571 547
1572 if (line.back() == '\r') 548 std::ostringstream schemaBuilder;
549 std::string line;
550 while (std::getline(file, line))
1573 { 551 {
1574 line.pop_back(); 552 if (line.back() == '\r')
553 {
554 line.pop_back();
555 }
556
557 schemaBuilder << line;
1575 } 558 }
1576 559
1577 lines.push_back(line); 560 std::string schema = schemaBuilder.str();
561 auto queries = split<std::list<std::string>>(schema, ";");
562 progress ppgs("Writing database schema...", queries.size());
563 for (std::string query : queries)
564 {
565 if (!queries.empty())
566 {
567 db_.runQuery(query);
568 }
569
570 ppgs.update();
571 }
1578 } 572 }
1579 573
1580 progress ppgs("Writing morphological derivation...", lines.size()); 574 void generator::dumpObjects()
1581 for (auto line : lines)
1582 { 575 {
1583 ppgs.update();
1584
1585 std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
1586 std::smatch relation_data;
1587 if (!std::regex_search(line, relation_data, relation))
1588 { 576 {
1589 continue; 577 progress ppgs("Writing notions...", notions_.size());
578
579 for (notion& n : notions_)
580 {
581 db_ << n;
582
583 ppgs.update();
584 }
1590 } 585 }
1591 586
1592 int synset_id_1 = stoi(relation_data[1]);
1593 int wnum_1 = stoi(relation_data[2]);
1594 int synset_id_2 = stoi(relation_data[3]);
1595 int wnum_2 = stoi(relation_data[4]);
1596 std::string query;
1597 switch (synset_id_1 / 100000000)
1598 { 587 {
1599 case 1: // Noun 588 progress ppgs("Writing words...", words_.size());
589
590 for (word& w : words_)
1600 { 591 {
1601 switch (synset_id_2 / 100000000) 592 db_ << w;
1602 {
1603 case 1: // Noun
1604 {
1605 query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)";
1606 break;
1607 }
1608
1609 case 3: // Adjective
1610 {
1611 query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)";
1612 break;
1613 }
1614
1615 case 4: // Adverb
1616 {
1617 query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)";
1618 break;
1619 }
1620 }
1621 593
1622 break; 594 ppgs.update();
1623 } 595 }
596 }
597
598 {
599 progress ppgs("Writing lemmas...", lemmas_.size());
1624 600
1625 case 3: // Adjective 601 for (lemma& l : lemmas_)
1626 { 602 {
1627 switch (synset_id_2 / 100000000) 603 db_ << l;
1628 {
1629 case 1: // Noun
1630 {
1631 query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)";
1632 break;
1633 }
1634
1635 case 3: // Adjective
1636 {
1637 query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)";
1638 break;
1639 }
1640
1641 case 4: // Adverb
1642 {
1643 query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)";
1644 break;
1645 }
1646 }
1647 604
1648 break; 605 ppgs.update();
1649 } 606 }
607 }
608
609 {
610 progress ppgs("Writing forms...", forms_.size());
1650 611
1651 case 4: // Adverb 612 for (form& f : forms_)
1652 { 613 {
1653 switch (synset_id_2 / 100000000) 614 db_ << f;
1654 {
1655 case 1: // Noun
1656 {
1657 query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)";
1658 break;
1659 }
1660
1661 case 3: // Adjective
1662 {
1663 query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)";
1664 break;
1665 }
1666
1667 case 4: // Adverb
1668 {
1669 query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)";
1670 break;
1671 }
1672 }
1673 615
1674 break; 616 ppgs.update();
1675 } 617 }
1676 } 618 }
1677 619
1678 sqlite3_stmt* ppstmt;
1679 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1680 { 620 {
1681 db_error(ppdb, query); 621 progress ppgs("Writing pronunciations...", pronunciations_.size());
622
623 for (pronunciation& p : pronunciations_)
624 {
625 db_ << p;
626
627 ppgs.update();
628 }
1682 } 629 }
1683 630
1684 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1685 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1686
1687 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1688 { 631 {
1689 db_error(ppdb, query); 632 progress ppgs("Writing verb groups...", groups_.size());
633
634 for (group& g : groups_)
635 {
636 db_ << g;
637
638 ppgs.update();
639 }
1690 } 640 }
1691 641
1692 sqlite3_finalize(ppstmt);
1693 }
1694 }
1695
1696 // hyp table
1697 {
1698 std::ifstream wnhypfile(wnpref + "wn_hyp.pl");
1699 if (!wnhypfile.is_open())
1700 {
1701 std::cout << "Invalid WordNet data directory." << std::endl;
1702 print_usage();
1703 }
1704
1705 std::list<std::string> lines;
1706 for (;;)
1707 {
1708 std::string line;
1709 if (!getline(wnhypfile, line))
1710 {
1711 break;
1712 }
1713
1714 if (line.back() == '\r')
1715 { 642 {
1716 line.pop_back(); 643 progress ppgs("Writing verb frames...", frames_.size());
644
645 for (frame& f : frames_)
646 {
647 db_ << f;
648
649 ppgs.update();
650 }
1717 } 651 }
1718
1719 lines.push_back(line);
1720 } 652 }
1721 653
1722 progress ppgs("Writing hypernyms...", lines.size()); 654 void generator::readWordNetAntonymy()
1723 for (auto line : lines)
1724 { 655 {
1725 ppgs.update(); 656 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl"));
1726 657 progress ppgs("Writing antonyms...", lines.size());
1727 std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); 658 for (auto line : lines)
1728 std::smatch relation_data;
1729 if (!std::regex_search(line, relation_data, relation))
1730 { 659 {
1731 continue; 660 ppgs.update();
1732 }
1733
1734 int synset_id_1 = stoi(relation_data[1]);
1735 int synset_id_2 = stoi(relation_data[2]);
1736 std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)");
1737 661
1738 for (auto mapping1 : wn[synset_id_1]) 662 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
1739 { 663 std::smatch relation_data;
1740 for (auto mapping2 : wn[synset_id_2]) 664 if (!std::regex_search(line, relation_data, relation))
1741 { 665 {
1742 sqlite3_stmt* ppstmt; 666 continue;
1743 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 667 }
1744 { 668
1745 db_error(ppdb, query); 669 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
1746 } 670 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1747 671
1748 sqlite3_bind_int(ppstmt, 1, mapping1.second); 672 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1749 sqlite3_bind_int(ppstmt, 2, mapping2.second); 673 {
674 word& word1 = *wordByWnidAndWnum_.at(lookup1);
675 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1750 676
1751 if (sqlite3_step(ppstmt) != SQLITE_DONE) 677 std::list<field> fields;
1752 { 678 fields.emplace_back("antonym_1_id", word1.getId());
1753 db_error(ppdb, query); 679 fields.emplace_back("antonym_2_id", word2.getId());
1754 }
1755 680
1756 sqlite3_finalize(ppstmt); 681 db_.insertIntoTable("antonymy", std::move(fields));
1757 } 682 }
1758 } 683 }
1759 } 684 }
1760 }
1761
1762 // ins table
1763 {
1764 std::ifstream wninsfile(wnpref + "wn_ins.pl");
1765 if (!wninsfile.is_open())
1766 {
1767 std::cout << "Invalid WordNet data directory." << std::endl;
1768 print_usage();
1769 }
1770
1771 std::list<std::string> lines;
1772 for (;;)
1773 {
1774 std::string line;
1775 if (!getline(wninsfile, line))
1776 {
1777 break;
1778 }
1779 685
1780 if (line.back() == '\r') 686 void generator::readWordNetVariation()
687 {
688 std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl"));
689 progress ppgs("Writing variation...", lines.size());
690 for (auto line : lines)
1781 { 691 {
1782 line.pop_back(); 692 ppgs.update();
1783 }
1784 693
1785 lines.push_back(line); 694 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\.");
695 std::smatch relation_data;
696 if (!std::regex_search(line, relation_data, relation))
697 {
698 continue;
699 }
700
701 int lookup1 = std::stoi(relation_data[1]);
702 int lookup2 = std::stoi(relation_data[2]);
703
704 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
705 {
706 notion& notion1 = *notionByWnid_.at(lookup1);
707 notion& notion2 = *notionByWnid_.at(lookup2);
708
709 std::list<field> fields;
710 fields.emplace_back("noun_id", notion1.getId());
711 fields.emplace_back("adjective_id", notion2.getId());
712
713 db_.insertIntoTable("variation", std::move(fields));
714 }
715 }
1786 } 716 }
1787 717
1788 progress ppgs("Writing instantiations...", lines.size()); 718 void generator::readWordNetClasses()
1789 for (auto line : lines)
1790 { 719 {
1791 ppgs.update(); 720 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl"));
1792 721 progress ppgs("Writing usage, topicality, and regionality...", lines.size());
1793 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); 722 for (auto line : lines)
1794 std::smatch relation_data;
1795 if (!std::regex_search(line, relation_data, relation))
1796 { 723 {
1797 continue; 724 ppgs.update();
1798 }
1799
1800 int synset_id_1 = stoi(relation_data[1]);
1801 int synset_id_2 = stoi(relation_data[2]);
1802 std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)");
1803 725
1804 for (auto mapping1 : wn[synset_id_1]) 726 std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\.");
1805 { 727 std::smatch relation_data;
1806 for (auto mapping2 : wn[synset_id_2]) 728 if (!std::regex_search(line, relation_data, relation))
729 {
730 continue;
731 }
732
733 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
734 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
735 std::string class_type = relation_data[5];
736
737 std::string table_name;
738 if (class_type == "t")
739 {
740 table_name += "topicality";
741 } else if (class_type == "u")
742 {
743 table_name += "usage";
744 } else if (class_type == "r")
745 {
746 table_name += "regionality";
747 }
748
749 std::list<int> leftJoin;
750 std::list<int> rightJoin;
751
752 if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first)))
1807 { 753 {
1808 sqlite3_stmt* ppstmt; 754 std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) {
1809 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 755 return w->getId();
756 });
757 } else if (wordByWnidAndWnum_.count(lookup1)) {
758 leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId());
759 }
760
761 if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first)))
762 {
763 std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) {
764 return w->getId();
765 });
766 } else if (wordByWnidAndWnum_.count(lookup2)) {
767 rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId());
768 }
769
770 for (int word1 : leftJoin)
771 {
772 for (int word2 : rightJoin)
1810 { 773 {
1811 db_error(ppdb, query); 774 std::list<field> fields;
1812 } 775 fields.emplace_back("term_id", word1);
776 fields.emplace_back("domain_id", word2);
1813 777
1814 sqlite3_bind_int(ppstmt, 1, mapping1.second); 778 db_.insertIntoTable(table_name, std::move(fields));
1815 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1816
1817 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1818 {
1819 db_error(ppdb, query);
1820 } 779 }
1821
1822 sqlite3_finalize(ppstmt);
1823 } 780 }
1824 } 781 }
1825 } 782 }
1826 }
1827
1828 // mm table
1829 {
1830 std::ifstream wnmmfile(wnpref + "wn_mm.pl");
1831 if (!wnmmfile.is_open())
1832 {
1833 std::cout << "Invalid WordNet data directory." << std::endl;
1834 print_usage();
1835 }
1836
1837 std::list<std::string> lines;
1838 for (;;)
1839 {
1840 std::string line;
1841 if (!getline(wnmmfile, line))
1842 {
1843 break;
1844 }
1845 783
1846 if (line.back() == '\r') 784 void generator::readWordNetCausality()
785 {
786 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl"));
787 progress ppgs("Writing causality...", lines.size());
788 for (auto line : lines)
1847 { 789 {
1848 line.pop_back(); 790 ppgs.update();
1849 }
1850 791
1851 lines.push_back(line); 792 std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\.");
793 std::smatch relation_data;
794 if (!std::regex_search(line, relation_data, relation))
795 {
796 continue;
797 }
798
799 int lookup1 = std::stoi(relation_data[1]);
800 int lookup2 = std::stoi(relation_data[2]);
801
802 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
803 {
804 notion& notion1 = *notionByWnid_.at(lookup1);
805 notion& notion2 = *notionByWnid_.at(lookup2);
806
807 std::list<field> fields;
808 fields.emplace_back("effect_id", notion1.getId());
809 fields.emplace_back("cause_id", notion2.getId());
810
811 db_.insertIntoTable("causality", std::move(fields));
812 }
813 }
1852 } 814 }
1853 815
1854 progress ppgs("Writing member meronyms...", lines.size()); 816 void generator::readWordNetEntailment()
1855 for (auto line : lines)
1856 { 817 {
1857 ppgs.update(); 818 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl"));
1858 819 progress ppgs("Writing entailment...", lines.size());
1859 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); 820 for (auto line : lines)
1860 std::smatch relation_data;
1861 if (!std::regex_search(line, relation_data, relation))
1862 { 821 {
1863 continue; 822 ppgs.update();
1864 }
1865 823
1866 int synset_id_1 = stoi(relation_data[1]); 824 std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\.");
1867 int synset_id_2 = stoi(relation_data[2]); 825 std::smatch relation_data;
1868 std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); 826 if (!std::regex_search(line, relation_data, relation))
1869
1870 for (auto mapping1 : wn[synset_id_1])
1871 {
1872 for (auto mapping2 : wn[synset_id_2])
1873 { 827 {
1874 sqlite3_stmt* ppstmt; 828 continue;
1875 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 829 }
1876 { 830
1877 db_error(ppdb, query); 831 int lookup1 = std::stoi(relation_data[1]);
1878 } 832 int lookup2 = std::stoi(relation_data[2]);
1879 833
1880 sqlite3_bind_int(ppstmt, 1, mapping1.second); 834 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
1881 sqlite3_bind_int(ppstmt, 2, mapping2.second); 835 {
836 notion& notion1 = *notionByWnid_.at(lookup1);
837 notion& notion2 = *notionByWnid_.at(lookup2);
1882 838
1883 if (sqlite3_step(ppstmt) != SQLITE_DONE) 839 std::list<field> fields;
1884 { 840 fields.emplace_back("given_id", notion1.getId());
1885 db_error(ppdb, query); 841 fields.emplace_back("entailment_id", notion2.getId());
1886 }
1887 842
1888 sqlite3_finalize(ppstmt); 843 db_.insertIntoTable("entailment", std::move(fields));
1889 } 844 }
1890 } 845 }
1891 } 846 }
1892 } 847
1893 848 void generator::readWordNetHypernymy()
1894 // ms table
1895 {
1896 std::ifstream wnmsfile(wnpref + "wn_ms.pl");
1897 if (!wnmsfile.is_open())
1898 {
1899 std::cout << "Invalid WordNet data directory." << std::endl;
1900 print_usage();
1901 }
1902
1903 std::list<std::string> lines;
1904 for (;;)
1905 { 849 {
1906 std::string line; 850 std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl"));
1907 if (!getline(wnmsfile, line)) 851 progress ppgs("Writing hypernymy...", lines.size());
852 for (auto line : lines)
1908 { 853 {
1909 break; 854 ppgs.update();
855
856 std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\.");
857 std::smatch relation_data;
858 if (!std::regex_search(line, relation_data, relation))
859 {
860 continue;
861 }
862
863 int lookup1 = std::stoi(relation_data[1]);
864 int lookup2 = std::stoi(relation_data[2]);
865
866 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
867 {
868 notion& notion1 = *notionByWnid_.at(lookup1);
869 notion& notion2 = *notionByWnid_.at(lookup2);
870
871 std::list<field> fields;
872 fields.emplace_back("hyponym_id", notion1.getId());
873 fields.emplace_back("hypernym_id", notion2.getId());
874
875 db_.insertIntoTable("hypernymy", std::move(fields));
876 }
1910 } 877 }
878 }
1911 879
1912 if (line.back() == '\r') 880 void generator::readWordNetInstantiation()
881 {
882 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl"));
883 progress ppgs("Writing instantiation...", lines.size());
884 for (auto line : lines)
1913 { 885 {
1914 line.pop_back(); 886 ppgs.update();
1915 }
1916 887
1917 lines.push_back(line); 888 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\.");
889 std::smatch relation_data;
890 if (!std::regex_search(line, relation_data, relation))
891 {
892 continue;
893 }
894
895 int lookup1 = std::stoi(relation_data[1]);
896 int lookup2 = std::stoi(relation_data[2]);
897
898 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
899 {
900 notion& notion1 = *notionByWnid_.at(lookup1);
901 notion& notion2 = *notionByWnid_.at(lookup2);
902
903 std::list<field> fields;
904 fields.emplace_back("instance_id", notion1.getId());
905 fields.emplace_back("class_id", notion2.getId());
906
907 db_.insertIntoTable("instantiation", std::move(fields));
908 }
909 }
1918 } 910 }
1919 911
1920 progress ppgs("Writing substance meronyms...", lines.size()); 912 void generator::readWordNetMemberMeronymy()
1921 for (auto line : lines)
1922 { 913 {
1923 ppgs.update(); 914 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl"));
1924 915 progress ppgs("Writing member meronymy...", lines.size());
1925 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); 916 for (auto line : lines)
1926 std::smatch relation_data;
1927 if (!std::regex_search(line, relation_data, relation))
1928 { 917 {
1929 continue; 918 ppgs.update();
1930 }
1931
1932 int synset_id_1 = stoi(relation_data[1]);
1933 int synset_id_2 = stoi(relation_data[2]);
1934 std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
1935 919
1936 for (auto mapping1 : wn[synset_id_1]) 920 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\.");
1937 { 921 std::smatch relation_data;
1938 for (auto mapping2 : wn[synset_id_2]) 922 if (!std::regex_search(line, relation_data, relation))
1939 { 923 {
1940 sqlite3_stmt* ppstmt; 924 continue;
1941 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 925 }
1942 { 926
1943 db_error(ppdb, query); 927 int lookup1 = std::stoi(relation_data[1]);
1944 } 928 int lookup2 = std::stoi(relation_data[2]);
929
930 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
931 {
932 notion& notion1 = *notionByWnid_.at(lookup1);
933 notion& notion2 = *notionByWnid_.at(lookup2);
1945 934
1946 sqlite3_bind_int(ppstmt, 1, mapping1.second); 935 std::list<field> fields;
1947 sqlite3_bind_int(ppstmt, 2, mapping2.second); 936 fields.emplace_back("holonym_id", notion1.getId());
937 fields.emplace_back("meronym_id", notion2.getId());
1948 938
1949 if (sqlite3_step(ppstmt) != SQLITE_DONE) 939 db_.insertIntoTable("member_meronymy", std::move(fields));
1950 {
1951 db_error(ppdb, query);
1952 }
1953
1954 sqlite3_finalize(ppstmt);
1955 } 940 }
1956 } 941 }
1957 } 942 }
1958 } 943
1959 944 void generator::readWordNetPartMeronymy()
1960 // mm table
1961 {
1962 std::ifstream wnmpfile(wnpref + "wn_mp.pl");
1963 if (!wnmpfile.is_open())
1964 {
1965 std::cout << "Invalid WordNet data directory." << std::endl;
1966 print_usage();
1967 }
1968
1969 std::list<std::string> lines;
1970 for (;;)
1971 { 945 {
1972 std::string line; 946 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl"));
1973 if (!getline(wnmpfile, line)) 947 progress ppgs("Writing part meronymy...", lines.size());
948 for (auto line : lines)
1974 { 949 {
1975 break; 950 ppgs.update();
951
952 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\.");
953 std::smatch relation_data;
954 if (!std::regex_search(line, relation_data, relation))
955 {
956 continue;
957 }
958
959 int lookup1 = std::stoi(relation_data[1]);
960 int lookup2 = std::stoi(relation_data[2]);
961
962 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
963 {
964 notion& notion1 = *notionByWnid_.at(lookup1);
965 notion& notion2 = *notionByWnid_.at(lookup2);
966
967 std::list<field> fields;
968 fields.emplace_back("holonym_id", notion1.getId());
969 fields.emplace_back("meronym_id", notion2.getId());
970
971 db_.insertIntoTable("part_meronymy", std::move(fields));
972 }
1976 } 973 }
974 }
1977 975
1978 if (line.back() == '\r') 976 void generator::readWordNetSubstanceMeronymy()
977 {
978 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl"));
979 progress ppgs("Writing substance meronymy...", lines.size());
980 for (auto line : lines)
1979 { 981 {
1980 line.pop_back(); 982 ppgs.update();
1981 }
1982 983
1983 lines.push_back(line); 984 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\.");
985 std::smatch relation_data;
986 if (!std::regex_search(line, relation_data, relation))
987 {
988 continue;
989 }
990
991 int lookup1 = std::stoi(relation_data[1]);
992 int lookup2 = std::stoi(relation_data[2]);
993
994 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
995 {
996 notion& notion1 = *notionByWnid_.at(lookup1);
997 notion& notion2 = *notionByWnid_.at(lookup2);
998
999 std::list<field> fields;
1000 fields.emplace_back("holonym_id", notion1.getId());
1001 fields.emplace_back("meronym_id", notion2.getId());
1002
1003 db_.insertIntoTable("substance_meronymy", std::move(fields));
1004 }
1005 }
1984 } 1006 }
1985 1007
1986 progress ppgs("Writing part meronyms...", lines.size()); 1008 void generator::readWordNetPertainymy()
1987 for (auto line : lines)
1988 { 1009 {
1989 ppgs.update(); 1010 std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl"));
1990 1011 progress ppgs("Writing pertainymy and mannernymy...", lines.size());
1991 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); 1012 for (auto line : lines)
1992 std::smatch relation_data;
1993 if (!std::regex_search(line, relation_data, relation))
1994 { 1013 {
1995 continue; 1014 ppgs.update();
1996 }
1997
1998 int synset_id_1 = stoi(relation_data[1]);
1999 int synset_id_2 = stoi(relation_data[2]);
2000 std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
2001 1015
2002 for (auto mapping1 : wn[synset_id_1]) 1016 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\.");
2003 { 1017 std::smatch relation_data;
2004 for (auto mapping2 : wn[synset_id_2]) 1018 if (!std::regex_search(line, relation_data, relation))
2005 { 1019 {
2006 sqlite3_stmt* ppstmt; 1020 continue;
2007 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 1021 }
2008 { 1022
2009 db_error(ppdb, query); 1023 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
2010 } 1024 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1025
1026 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1027 {
1028 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1029 word& word2 = *wordByWnidAndWnum_.at(lookup2);
2011 1030
2012 sqlite3_bind_int(ppstmt, 1, mapping1.second); 1031 if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective)
2013 sqlite3_bind_int(ppstmt, 2, mapping2.second); 1032 {
1033 std::list<field> fields;
1034 fields.emplace_back("pertainym_id", word1.getId());
1035 fields.emplace_back("noun_id", word2.getId());
2014 1036
2015 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1037 db_.insertIntoTable("pertainymy", std::move(fields));
1038 } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb)
2016 { 1039 {
2017 db_error(ppdb, query); 1040 std::list<field> fields;
2018 } 1041 fields.emplace_back("mannernym_id", word1.getId());
1042 fields.emplace_back("adjective_id", word2.getId());
2019 1043
2020 sqlite3_finalize(ppstmt); 1044 db_.insertIntoTable("mannernymy", std::move(fields));
1045 }
2021 } 1046 }
2022 } 1047 }
2023 } 1048 }
2024 }
2025
2026 // per table
2027 {
2028 std::ifstream wnperfile(wnpref + "wn_per.pl");
2029 if (!wnperfile.is_open())
2030 {
2031 std::cout << "Invalid WordNet data directory." << std::endl;
2032 print_usage();
2033 }
2034
2035 std::list<std::string> lines;
2036 for (;;)
2037 {
2038 std::string line;
2039 if (!getline(wnperfile, line))
2040 {
2041 break;
2042 }
2043 1049
2044 if (line.back() == '\r') 1050 void generator::readWordNetSpecification()
1051 {
1052 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl"));
1053 progress ppgs("Writing specifications...", lines.size());
1054 for (auto line : lines)
2045 { 1055 {
2046 line.pop_back(); 1056 ppgs.update();
1057
1058 std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\.");
1059 std::smatch relation_data;
1060 if (!std::regex_search(line, relation_data, relation))
1061 {
1062 continue;
1063 }
1064
1065 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
1066 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1067
1068 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1069 {
1070 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1071 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1072
1073 std::list<field> fields;
1074 fields.emplace_back("general_id", word1.getId());
1075 fields.emplace_back("specific_id", word2.getId());
1076
1077 db_.insertIntoTable("specification", std::move(fields));
1078 }
2047 } 1079 }
2048
2049 lines.push_back(line);
2050 } 1080 }
2051 1081
2052 progress ppgs("Writing pertainyms and mannernyms...", lines.size()); 1082 void generator::readWordNetSimilarity()
2053 for (auto line : lines)
2054 { 1083 {
2055 ppgs.update(); 1084 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl"));
2056 1085 progress ppgs("Writing adjective similarity...", lines.size());
2057 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); 1086 for (auto line : lines)
2058 std::smatch relation_data;
2059 if (!std::regex_search(line, relation_data, relation))
2060 { 1087 {
2061 continue; 1088 ppgs.update();
2062 }
2063 1089
2064 int synset_id_1 = stoi(relation_data[1]); 1090 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\.");
2065 int wnum_1 = stoi(relation_data[2]); 1091 std::smatch relation_data;
2066 int synset_id_2 = stoi(relation_data[3]); 1092 if (!std::regex_search(line, relation_data, relation))
2067 int wnum_2 = stoi(relation_data[4]);
2068 std::string query;
2069 switch (synset_id_1 / 100000000)
2070 {
2071 case 3: // Adjective
2072 { 1093 {
2073 // This is a pertainym, the second word should be a noun 1094 continue;
2074 // Technically it can be an adjective but we're ignoring that
2075 if (synset_id_2 / 100000000 != 1)
2076 {
2077 continue;
2078 }
2079
2080 query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)";
2081
2082 break;
2083 } 1095 }
1096
1097 int lookup1 = std::stoi(relation_data[1]);
1098 int lookup2 = std::stoi(relation_data[2]);
2084 1099
2085 case 4: // Adverb 1100 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
2086 { 1101 {
2087 // This is a mannernym, the second word should be an adjective 1102 notion& notion1 = *notionByWnid_.at(lookup1);
2088 if (synset_id_2 / 100000000 != 3) 1103 notion& notion2 = *notionByWnid_.at(lookup2);
2089 {
2090 continue;
2091 }
2092 1104
2093 query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; 1105 std::list<field> fields;
1106 fields.emplace_back("adjective_1_id", notion1.getId());
1107 fields.emplace_back("adjective_2_id", notion2.getId());
2094 1108
2095 break; 1109 db_.insertIntoTable("similarity", std::move(fields));
2096 } 1110 }
2097 } 1111 }
2098 1112 }
2099 sqlite3_stmt* ppstmt;
2100 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
2101 {
2102 db_error(ppdb, query);
2103 }
2104
2105 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
2106 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
2107 1113
2108 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1114 std::list<std::string> generator::readFile(std::string path)
1115 {
1116 std::ifstream file(path);
1117 if (!file)
2109 { 1118 {
2110 db_error(ppdb, query); 1119 throw std::invalid_argument("Could not find file " + path);
2111 } 1120 }
2112
2113 sqlite3_finalize(ppstmt);
2114 }
2115 }
2116 1121
2117 // sa table 1122 std::list<std::string> lines;
2118 {
2119 std::ifstream wnsafile(wnpref + "wn_sa.pl");
2120 if (!wnsafile.is_open())
2121 {
2122 std::cout << "Invalid WordNet data directory." << std::endl;
2123 print_usage();
2124 }
2125
2126 std::list<std::string> lines;
2127 for (;;)
2128 {
2129 std::string line; 1123 std::string line;
2130 if (!getline(wnsafile, line)) 1124 while (std::getline(file, line))
2131 {
2132 break;
2133 }
2134
2135 if (line.back() == '\r')
2136 { 1125 {
2137 line.pop_back(); 1126 if (line.back() == '\r')
1127 {
1128 line.pop_back();
1129 }
1130
1131 lines.push_back(line);
2138 } 1132 }
2139 1133
2140 lines.push_back(line); 1134 return lines;
2141 } 1135 }
2142 1136
2143 progress ppgs("Writing specifications...", lines.size()); 1137 part_of_speech generator::partOfSpeechByWnid(int wnid)
2144 for (auto line : lines)
2145 { 1138 {
2146 ppgs.update(); 1139 switch (wnid / 100000000)
2147
2148 std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\.");
2149 std::smatch relation_data;
2150 if (!std::regex_search(line, relation_data, relation))
2151 {
2152 continue;
2153 }
2154
2155 int synset_id_1 = stoi(relation_data[1]);
2156 int wnum_1 = stoi(relation_data[2]);
2157 int synset_id_2 = stoi(relation_data[3]);
2158 int wnum_2 = stoi(relation_data[4]);
2159 std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)");
2160
2161 sqlite3_stmt* ppstmt;
2162 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
2163 { 1140 {
2164 db_error(ppdb, query); 1141 case 1: return part_of_speech::noun;
1142 case 2: return part_of_speech::verb;
1143 case 3: return part_of_speech::adjective;
1144 case 4: return part_of_speech::adverb;
1145 default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid));
2165 } 1146 }
1147 }
2166 1148
2167 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); 1149 notion& generator::createNotion(part_of_speech partOfSpeech)
2168 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); 1150 {
1151 notions_.emplace_back(partOfSpeech);
1152
1153 return notions_.back();
1154 }
2169 1155
2170 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1156 notion& generator::lookupOrCreateNotion(int wnid)
1157 {
1158 if (!notionByWnid_.count(wnid))
2171 { 1159 {
2172 db_error(ppdb, query); 1160 notions_.emplace_back(partOfSpeechByWnid(wnid), wnid);
1161 notionByWnid_[wnid] = &notions_.back();
2173 } 1162 }
2174 1163
2175 sqlite3_finalize(ppstmt); 1164 return *notionByWnid_.at(wnid);
2176 }
2177 }
2178
2179 // sim table
2180 {
2181 std::ifstream wnsimfile(wnpref + "wn_sim.pl");
2182 if (!wnsimfile.is_open())
2183 {
2184 std::cout << "Invalid WordNet data directory." << std::endl;
2185 print_usage();
2186 } 1165 }
2187 1166
2188 std::list<std::string> lines; 1167 lemma& generator::lookupOrCreateLemma(std::string base_form)
2189 for (;;)
2190 { 1168 {
2191 std::string line; 1169 if (!lemmaByBaseForm_.count(base_form))
2192 if (!getline(wnsimfile, line))
2193 { 1170 {
2194 break; 1171 lemmas_.emplace_back(lookupOrCreateForm(base_form));
1172 lemmaByBaseForm_[base_form] = &lemmas_.back();
2195 } 1173 }
1174
1175 return *lemmaByBaseForm_.at(base_form);
1176 }
2196 1177
2197 if (line.back() == '\r') 1178 form& generator::lookupOrCreateForm(std::string text)
1179 {
1180 if (!formByText_.count(text))
2198 { 1181 {
2199 line.pop_back(); 1182 forms_.emplace_back(text);
1183 formByText_[text] = &forms_.back();
2200 } 1184 }
2201 1185
2202 lines.push_back(line); 1186 return *formByText_[text];
2203 } 1187 }
2204 1188
2205 progress ppgs("Writing sense synonyms...", lines.size()); 1189 template <typename... Args> word& generator::createWord(Args&&... args)
2206 for (auto line : lines)
2207 { 1190 {
2208 ppgs.update(); 1191 words_.emplace_back(std::forward<Args>(args)...);
1192 word& w = words_.back();
2209 1193
2210 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); 1194 wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w);
2211 std::smatch relation_data; 1195
2212 if (!std::regex_search(line, relation_data, relation)) 1196 if (w.getNotion().hasWnid())
2213 { 1197 {
2214 continue; 1198 wordsByWnid_[w.getNotion().getWnid()].insert(&w);
2215 } 1199 }
2216 1200
2217 int synset_id_1 = stoi(relation_data[1]); 1201 return w;
2218 int synset_id_2 = stoi(relation_data[2]); 1202 }
2219 std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); 1203
1204 group& generator::createGroup(xmlNodePtr top)
1205 {
1206 groups_.emplace_back();
1207 group& grp = groups_.back();
2220 1208
2221 for (auto mapping1 : wn[synset_id_1]) 1209 xmlChar* key;
1210
1211 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
2222 { 1212 {
2223 for (auto mapping2 : wn[synset_id_2]) 1213 if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES")))
2224 { 1214 {
2225 sqlite3_stmt* ppstmt; 1215 for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next)
2226 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
2227 { 1216 {
2228 db_error(ppdb, query); 1217 if (!xmlStrcmp(subclass->name, reinterpret_cast<const xmlChar*>("VNSUBCLASS")))
1218 {
1219 try
1220 {
1221 group& subgrp = createGroup(subclass);
1222 subgrp.setParent(grp);
1223 } catch (const std::exception& e)
1224 {
1225 key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID"));
1226
1227 if (key == nullptr)
1228 {
1229 std::throw_with_nested(std::logic_error("Error parsing IDless subgroup"));
1230 } else {
1231 std::string subgroupId(reinterpret_cast<const char*>(key));
1232 xmlFree(key);
1233
1234 std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId));
1235 }
1236 }
1237 }
2229 } 1238 }
2230 1239 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("MEMBERS")))
2231 sqlite3_bind_int(ppstmt, 1, mapping1.second); 1240 {
2232 sqlite3_bind_int(ppstmt, 2, mapping2.second); 1241 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next)
2233
2234 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2235 { 1242 {
2236 db_error(ppdb, query); 1243 if (!xmlStrcmp(member->name, reinterpret_cast<const xmlChar*>("MEMBER")))
1244 {
1245 key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn"));
1246 std::string wnSenses(reinterpret_cast<const char*>(key));
1247 xmlFree(key);
1248
1249 auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " ");
1250 if (!wnSenseKeys.empty())
1251 {
1252 std::list<std::string> tempKeys;
1253
1254 std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) {
1255 return sense + "::";
1256 });
1257
1258 std::list<std::string> filteredKeys;
1259
1260 std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) {
1261 return !wnSenseKeys_.count(sense);
1262 });
1263
1264 wnSenseKeys = std::move(filteredKeys);
1265 }
1266
1267 if (!wnSenseKeys.empty())
1268 {
1269 for (std::string sense : wnSenseKeys)
1270 {
1271 word& wordSense = *wnSenseKeys_[sense];
1272 wordSense.setVerbGroup(grp);
1273 }
1274 } else {
1275 key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name"));
1276 std::string memberName(reinterpret_cast<const char*>(key));
1277 xmlFree(key);
1278
1279 notion& n = createNotion(part_of_speech::verb);
1280 lemma& l = lookupOrCreateLemma(memberName);
1281 word& w = createWord(n, l);
1282
1283 w.setVerbGroup(grp);
1284 }
1285 }
2237 } 1286 }
2238 1287 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("THEMROLES")))
2239 sqlite3_reset(ppstmt); 1288 {
2240 sqlite3_clear_bindings(ppstmt); 1289 for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next)
2241
2242 sqlite3_bind_int(ppstmt, 1, mapping2.second);
2243 sqlite3_bind_int(ppstmt, 2, mapping1.second);
2244
2245 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2246 { 1290 {
2247 db_error(ppdb, query); 1291 if (!xmlStrcmp(roletopnode->name, reinterpret_cast<const xmlChar*>("THEMROLE")))
1292 {
1293 role r;
1294
1295 key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type"));
1296 std::string roleName = reinterpret_cast<const char*>(key);
1297 xmlFree(key);
1298
1299 for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
1300 {
1301 if (!xmlStrcmp(rolenode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1302 {
1303 r.setSelrestrs(parseSelrestr(rolenode));
1304 }
1305 }
1306
1307 grp.addRole(roleName, std::move(r));
1308 }
2248 } 1309 }
1310 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("FRAMES")))
1311 {
1312 for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next)
1313 {
1314 if (!xmlStrcmp(frametopnode->name, reinterpret_cast<const xmlChar*>("FRAME")))
1315 {
1316 frames_.emplace_back();
1317 frame& fr = frames_.back();
2249 1318
2250 sqlite3_finalize(ppstmt); 1319 for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
1320 {
1321 if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX")))
1322 {
1323 for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
1324 {
1325 if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP")))
1326 {
1327 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1328 std::string partRole = reinterpret_cast<const char*>(key);
1329 xmlFree(key);
1330
1331 selrestr partSelrestrs;
1332 std::set<std::string> partSynrestrs;
1333
1334 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
1335 {
1336 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SYNRESTRS")))
1337 {
1338 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
1339 {
1340 if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SYNRESTR")))
1341 {
1342 key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type"));
1343 partSynrestrs.insert(reinterpret_cast<const char*>(key));
1344 xmlFree(key);
1345 }
1346 }
1347 }
1348
1349 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1350 {
1351 partSelrestrs = parseSelrestr(npnode);
1352 }
1353 }
1354
1355 fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs)));
1356 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB")))
1357 {
1358 fr.push_back(part::createVerb());
1359 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("PREP")))
1360 {
1361 std::set<std::string> partChoices;
1362 bool partLiteral;
1363
1364 if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")))
1365 {
1366 partLiteral = true;
1367
1368 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1369 std::string choicesStr = reinterpret_cast<const char*>(key);
1370 xmlFree(key);
1371
1372 split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices)));
1373 } else {
1374 partLiteral = false;
1375
1376 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
1377 {
1378 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1379 {
1380 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
1381 {
1382 if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1383 {
1384 key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type"));
1385 partChoices.insert(reinterpret_cast<const char*>(key));
1386 xmlFree(key);
1387 }
1388 }
1389 }
1390 }
1391 }
1392
1393 fr.push_back(part::createPreposition(std::move(partChoices), partLiteral));
1394 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ")))
1395 {
1396 fr.push_back(part::createAdjective());
1397 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADV")))
1398 {
1399 fr.push_back(part::createAdverb());
1400 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("LEX")))
1401 {
1402 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1403 std::string literalValue = reinterpret_cast<const char*>(key);
1404 xmlFree(key);
1405
1406 fr.push_back(part::createLiteral(literalValue));
1407 } else {
1408 continue;
1409 }
1410 }
1411
1412 grp.addFrame(fr);
1413 }
1414 }
1415 }
1416 }
2251 } 1417 }
2252 } 1418 }
2253 }
2254 }
2255
2256 // syntax table
2257 {
2258 std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl");
2259 if (!wnsyntaxfile.is_open())
2260 {
2261 std::cout << "Invalid WordNet data directory." << std::endl;
2262 print_usage();
2263 }
2264 1419
2265 std::list<std::string> lines; 1420 return grp;
2266 for (;;)
2267 {
2268 std::string line;
2269 if (!getline(wnsyntaxfile, line))
2270 {
2271 break;
2272 }
2273
2274 if (line.back() == '\r')
2275 {
2276 line.pop_back();
2277 }
2278
2279 lines.push_back(line);
2280 } 1421 }
2281 1422
2282 progress ppgs("Writing adjective syntax markers...", lines.size()); 1423 selrestr generator::parseSelrestr(xmlNodePtr top)
2283 for (auto line : lines)
2284 { 1424 {
2285 ppgs.update(); 1425 xmlChar* key;
2286 1426
2287 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); 1427 if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
2288 std::smatch relation_data;
2289 if (!std::regex_search(line, relation_data, relation))
2290 {
2291 continue;
2292 }
2293
2294 int synset_id = stoi(relation_data[1]);
2295 int wnum = stoi(relation_data[2]);
2296 std::string syn = relation_data[3];
2297 std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?");
2298
2299 sqlite3_stmt* ppstmt;
2300 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
2301 { 1428 {
2302 db_error(ppdb, query); 1429 if (xmlChildElementCount(top) == 0)
2303 } 1430 {
2304 1431 return {};
2305 sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); 1432 } else if (xmlChildElementCount(top) == 1)
2306 sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); 1433 {
2307 1434 return parseSelrestr(xmlFirstElementChild(top));
2308 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1435 } else {
1436 bool orlogic = false;
1437 if (xmlHasProp(top, reinterpret_cast<const xmlChar*>("logic")))
1438 {
1439 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("logic"));
1440 if (!xmlStrcmp(key, reinterpret_cast<const xmlChar*>("or")))
1441 {
1442 orlogic = true;
1443 }
1444
1445 xmlFree(key);
1446 }
1447
1448 std::list<selrestr> children;
1449 for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next)
1450 {
1451 if (!xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))
1452 || !xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1453 {
1454 children.push_back(parseSelrestr(selrestr));
1455 }
1456 }
1457
1458 return selrestr(children, orlogic);
1459 }
1460 } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
2309 { 1461 {
2310 db_error(ppdb, query); 1462 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("Value"));
1463 bool selPos = (std::string(reinterpret_cast<const char*>(key)) == "+");
1464 xmlFree(key);
1465
1466 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type"));
1467 std::string selRestriction = reinterpret_cast<const char*>(key);
1468 xmlFree(key);
1469
1470 return selrestr(selRestriction, selPos);
1471 } else {
1472 throw std::logic_error("Badly formatted selrestr");
2311 } 1473 }
2312
2313 sqlite3_finalize(ppstmt);
2314 } 1474 }
2315 } 1475
2316 1476 };
2317 sqlite3_close_v2(ppdb); 1477};
2318
2319 std::cout << "Done." << std::endl;
2320}