summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2017-01-16 18:02:50 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2017-01-16 18:02:50 -0500
commit: 6746da6edd7d9d50efe374eabbb79a3cac882d81 (patch)
tree: ff20917e08b08d36b9541c1371106596e7bec442 /generator/generator.cpp
parent: 4af7e55733098ca42f75a4ffaca1b0f6bab4dd36 (diff)
downloadverbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.tar.gz
verbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.tar.bz2
verbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.zip
Started structural rewrite
The new object structure was designed to build on the existing WordNet
structure, while also adding in all of the data that we get from other sources.
More information about this can be found on the project wiki.

The generator has already been completely rewritten to generate a
datafile that uses the new structure. In addition, a number of indexes
are created, which does double the size of the datafile, but also allows
for much faster lookups. Finally, the new generator is written modularly
and is a lot more readable than the old one.

The verbly interface to the new object structure has mostly been
completed, but has not been tested fully. There is a completely new
search API which utilizes a lot of operator overloading; documentation
on how to use it should go up at some point.

Token processing and verb frames are currently unimplemented. Source for
these has been left in the repository for now.
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r--  generator/generator.cpp  3145
1 file changed, 1151 insertions, 1994 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 6a16467..d88cb31 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -1,2320 +1,1477 @@
1#include <libxml/parser.h> 1#include "generator.h"
2#include <cassert>
3#include <stdexcept>
2#include <iostream> 4#include <iostream>
5#include <regex>
3#include <dirent.h> 6#include <dirent.h>
4#include <set>
5#include <map>
6#include <string>
7#include <vector>
8#include <fstream> 7#include <fstream>
9#include <sqlite3.h> 8#include "enums.h"
10#include <sstream>
11#include <regex>
12#include <list>
13#include <algorithm>
14#include <json.hpp>
15#include "progress.h" 9#include "progress.h"
10#include "selrestr.h"
11#include "role.h"
12#include "part.h"
13#include "field.h"
16#include "../lib/util.h" 14#include "../lib/util.h"
17 15
18using json = nlohmann::json; 16namespace verbly {
19 17 namespace generator {
20struct verb_t {
21 std::string infinitive;
22 std::string past_tense;
23 std::string past_participle;
24 std::string ing_form;
25 std::string s_form;
26 int id;
27};
28
29struct adjective_t {
30 std::string base;
31 std::string comparative;
32 std::string superlative;
33};
34
35struct noun_t {
36 std::string singular;
37 std::string plural;
38};
39
40struct selrestr_t {
41 enum class type_t {
42 singleton,
43 andlogic,
44 orlogic,
45 empty
46 };
47 type_t type;
48 std::string restriction;
49 bool pos;
50 std::list<selrestr_t> subordinates;
51};
52
53struct framepart_t {
54 enum class type_t {
55 np,
56 v,
57 pp,
58 adj,
59 adv,
60 lex
61 };
62 type_t type;
63 std::string role;
64 selrestr_t selrestrs;
65 std::set<std::string> preprestrs;
66 std::set<std::string> synrestrs;
67 std::list<std::string> choices;
68 std::string lexval;
69};
70
71struct group_t {
72 std::string id;
73 std::string parent;
74 std::set<std::string> members;
75 std::map<std::string, selrestr_t> roles;
76 std::list<std::list<framepart_t>> frames;
77};
78
79struct pronunciation_t {
80 std::string phonemes;
81 std::string prerhyme;
82 std::string rhyme;
83 int syllables = 0;
84 std::string stress;
85
86 bool operator<(const pronunciation_t& other) const
87 {
88 return phonemes < other.phonemes;
89 }
90};
91
92std::map<std::string, group_t> groups;
93std::map<std::string, verb_t> verbs;
94std::map<std::string, adjective_t> adjectives;
95std::map<std::string, noun_t> nouns;
96std::map<int, std::map<int, int>> wn;
97std::map<int, int> images;
98std::map<std::string, std::set<pronunciation_t>> pronunciations;
99
100void print_usage()
101{
102 std::cout << "Verbly Datafile Generator" << std::endl;
103 std::cout << "-------------------------" << std::endl;
104 std::cout << "Requires exactly six arguments." << std::endl;
105 std::cout << "1. The path to a VerbNet data directory." << std::endl;
106 std::cout << "2. The path to an AGID infl.txt file." << std::endl;
107 std::cout << "3. The path to a WordNet prolog data directory." << std::endl;
108 std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl;
109 std::cout << "5. The path to an ImageNet urls.txt file." << std::endl;
110 std::cout << "6. Datafile output path." << std::endl;
111
112 exit(1);
113}
114
115void db_error(sqlite3* ppdb, std::string query)
116{
117 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
118 std::cout << query << std::endl;
119 sqlite3_close_v2(ppdb);
120 print_usage();
121}
122
123json export_selrestrs(selrestr_t r)
124{
125 if (r.type == selrestr_t::type_t::empty)
126 {
127 return {};
128 } else if (r.type == selrestr_t::type_t::singleton)
129 {
130 json result;
131 result["type"] = r.restriction;
132 result["pos"] = r.pos;
133 return result;
134 } else {
135 json result;
136 if (r.type == selrestr_t::type_t::andlogic)
137 {
138 result["logic"] = "and";
139 } else {
140 result["logic"] = "or";
141 }
142
143 std::list<json> outlist;
144 std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs);
145 result["children"] = outlist;
146 18
147 return result; 19 generator::generator(
148 } 20 std::string verbNetPath,
149} 21 std::string agidPath,
150 22 std::string wordNetPath,
151selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) 23 std::string cmudictPath,
152{ 24 std::string imageNetPath,
153 selrestr_t r; 25 std::string outputPath) :
154 xmlChar* key; 26 verbNetPath_(verbNetPath),
155 27 agidPath_(agidPath),
156 if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) 28 wordNetPath_(wordNetPath),
157 { 29 cmudictPath_(cmudictPath),
158 if (xmlChildElementCount(top) == 0) 30 imageNetPath_(imageNetPath),
31 db_(outputPath)
159 { 32 {
160 r.type = selrestr_t::type_t::empty; 33 // Ensure VerbNet directory exists
161 } else if (xmlChildElementCount(top) == 1) 34 DIR* dir;
162 { 35 if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
163 r = parse_selrestrs(xmlFirstElementChild(top), filename);
164 } else {
165 r.type = selrestr_t::type_t::andlogic;
166
167 if (xmlHasProp(top, (const xmlChar*) "logic"))
168 { 36 {
169 key = xmlGetProp(top, (const xmlChar*) "logic"); 37 throw std::invalid_argument("Invalid VerbNet data directory");
170 if (!xmlStrcmp(key, (const xmlChar*) "or"))
171 {
172 r.type = selrestr_t::type_t::orlogic;
173 }
174 xmlFree(key);
175 } 38 }
176 39
177 for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) 40 closedir(dir);
41
42 // Ensure AGID infl.txt exists
43 if (!std::ifstream(agidPath_))
178 { 44 {
179 if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) 45 throw std::invalid_argument("AGID infl.txt file not found");
180 {
181 r.subordinates.push_back(parse_selrestrs(selrestr, filename));
182 }
183 } 46 }
184 } 47
185 } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) 48 // Add directory separator to WordNet path
186 { 49 if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\'))
187 r.type = selrestr_t::type_t::singleton;
188
189 key = xmlGetProp(top, (xmlChar*) "Value");
190 r.pos = (std::string((const char*)key) == "+");
191 xmlFree(key);
192
193 key = xmlGetProp(top, (xmlChar*) "type");
194 r.restriction = (const char*) key;
195 xmlFree(key);
196 } else {
197 // Invalid
198 std::cout << "Bad VerbNet file format: " << filename << std::endl;
199 print_usage();
200 }
201
202 return r;
203}
204
205group_t& parse_group(xmlNodePtr top, std::string filename)
206{
207 xmlChar* key = xmlGetProp(top, (xmlChar*) "ID");
208 if (key == 0)
209 {
210 std::cout << "Bad VerbNet file format: " << filename << std::endl;
211 print_usage();
212 }
213 std::string vnid = (const char*)key;
214 vnid = vnid.substr(vnid.find_first_of("-")+1);
215 xmlFree(key);
216
217 group_t g;
218 g.id = vnid;
219
220 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
221 {
222 if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES"))
223 {
224 for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next)
225 { 50 {
226 if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) 51 wordNetPath_ += '/';
227 {
228 auto& sg = parse_group(subclass, filename);
229 sg.parent = vnid;
230
231 for (auto member : sg.members)
232 {
233 g.members.insert(member);
234 }
235
236 // The schema requires that subclasses appear after role definitions, so we can do this now
237 for (auto role : g.roles)
238 {
239 if (sg.roles.count(role.first) == 0)
240 {
241 sg.roles[role.first] = role.second;
242 }
243 }
244 }
245 } 52 }
246 } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) 53
247 { 54 // Ensure WordNet tables exist
248 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) 55 for (std::string table : {
56 "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax"
57 })
249 { 58 {
250 if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) 59 if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl"))
251 { 60 {
252 key = xmlGetProp(member, (xmlChar*) "name"); 61 throw std::invalid_argument("WordNet " + table + " table not found");
253 g.members.insert((const char*)key);
254 xmlFree(key);
255 } 62 }
256 } 63 }
257 } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) 64
258 { 65 // Ensure CMUDICT file exists
259 for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) 66 if (!std::ifstream(cmudictPath_))
260 { 67 {
261 if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) 68 throw std::invalid_argument("CMUDICT file not found");
262 {
263 selrestr_t r;
264 r.type = selrestr_t::type_t::empty;
265
266 key = xmlGetProp(role, (const xmlChar*) "type");
267 std::string type = (const char*)key;
268 xmlFree(key);
269
270 for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
271 {
272 if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS"))
273 {
274 r = parse_selrestrs(rolenode, filename);
275 }
276 }
277
278 g.roles[type] = r;
279 }
280 } 69 }
281 } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) 70
282 { 71 // Ensure ImageNet urls.txt exists
283 for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) 72 if (!std::ifstream(imageNetPath_))
284 { 73 {
285 if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) 74 throw std::invalid_argument("ImageNet urls.txt file not found");
286 {
287 std::list<framepart_t> f;
288
289 for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
290 {
291 if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX"))
292 {
293 for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
294 {
295 framepart_t fp;
296
297 if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP"))
298 {
299 fp.type = framepart_t::type_t::np;
300
301 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
302 fp.role = (const char*)key;
303 xmlFree(key);
304
305 fp.selrestrs.type = selrestr_t::type_t::empty;
306
307 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
308 {
309 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS"))
310 {
311 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
312 {
313 if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR"))
314 {
315 key = xmlGetProp(synrestr, (xmlChar*) "type");
316 fp.synrestrs.insert(std::string((const char*)key));
317 xmlFree(key);
318 }
319 }
320 }
321
322 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS"))
323 {
324 fp.selrestrs = parse_selrestrs(npnode, filename);
325 }
326 }
327 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB"))
328 {
329 fp.type = framepart_t::type_t::v;
330 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP"))
331 {
332 fp.type = framepart_t::type_t::pp;
333
334 if (xmlHasProp(syntaxnode, (xmlChar*) "value"))
335 {
336 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
337 std::string choices = (const char*)key;
338 xmlFree(key);
339
340 fp.choices = verbly::split<std::list<std::string>>(choices, " ");
341 }
342
343 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
344 {
345 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS"))
346 {
347 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
348 {
349 if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR"))
350 {
351 key = xmlGetProp(synrestr, (xmlChar*) "type");
352 fp.preprestrs.insert(std::string((const char*)key));
353 xmlFree(key);
354 }
355 }
356 }
357 }
358 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ"))
359 {
360 fp.type = framepart_t::type_t::adj;
361 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV"))
362 {
363 fp.type = framepart_t::type_t::adv;
364 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX"))
365 {
366 fp.type = framepart_t::type_t::lex;
367
368 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
369 fp.lexval = (const char*)key;
370 xmlFree(key);
371 } else {
372 continue;
373 }
374
375 f.push_back(fp);
376 }
377
378 g.frames.push_back(f);
379 }
380 }
381 }
382 } 75 }
383 } 76 }
384 }
385
386 groups[vnid] = g;
387
388 return groups[vnid];
389}
390
391int main(int argc, char** argv)
392{
393 if (argc != 7)
394 {
395 print_usage();
396 }
397
398 // VerbNet data
399 std::cout << "Reading verb frames..." << std::endl;
400
401 DIR* dir;
402 if ((dir = opendir(argv[1])) == nullptr)
403 {
404 std::cout << "Invalid VerbNet data directory." << std::endl;
405
406 print_usage();
407 }
408
409 struct dirent* ent;
410 while ((ent = readdir(dir)) != nullptr)
411 {
412 std::string filename(argv[1]);
413 if (filename.back() != '/')
414 {
415 filename += '/';
416 }
417 77
418 filename += ent->d_name; 78 void generator::run()
419 //std::cout << ent->d_name << std::endl;
420
421 if (filename.rfind(".xml") != filename.size() - 4)
422 {
423 continue;
424 }
425
426 xmlDocPtr doc = xmlParseFile(filename.c_str());
427 if (doc == nullptr)
428 {
429 std::cout << "Error opening " << filename << std::endl;
430 print_usage();
431 }
432
433 xmlNodePtr top = xmlDocGetRootElement(doc);
434 if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS")))
435 {
436 std::cout << "Bad VerbNet file format: " << filename << std::endl;
437 print_usage();
438 }
439
440 parse_group(top, filename);
441 }
442
443 closedir(dir);
444
445 // Get verbs from AGID
446 std::cout << "Reading inflections..." << std::endl;
447
448 std::ifstream agidfile(argv[2]);
449 if (!agidfile.is_open())
450 {
451 std::cout << "Could not open AGID file: " << argv[2] << std::endl;
452 print_usage();
453 }
454
455 for (;;)
456 {
457 std::string line;
458 if (!getline(agidfile, line))
459 {
460 break;
461 }
462
463 if (line.back() == '\r')
464 { 79 {
465 line.pop_back(); 80 // Create notions, words, lemmas, and forms from WordNet synsets
466 } 81 readWordNetSynsets();
467 82
468 int divider = line.find_first_of(" "); 83 // Reads adjective positioning WordNet data
469 std::string word = line.substr(0, divider); 84 readAdjectivePositioning();
470 line = line.substr(divider+1); 85
471 char type = line[0]; 86 // Counts the number of URLs ImageNet has per notion
472 87 readImageNetUrls();
473 if (line[1] == '?') 88
474 { 89 // Creates a word by WordNet sense key lookup table
475 line.erase(0, 4); 90 readWordNetSenseKeys();
476 } else { 91
477 line.erase(0, 3); 92 // Creates groups and frames from VerbNet data
478 } 93 readVerbNet();
479 94
480 std::vector<std::string> forms; 95 // Creates forms and inflections from AGID. To reduce the amount of forms
481 while (!line.empty()) 96 // created, we do this after most lemmas that need inflecting have been
482 { 97 // created through other means, and then only generate forms for
483 std::string inflection; 98 // inflections of already-existing lemmas. The exception to this regards
484 if ((divider = line.find(" | ")) != std::string::npos) 99 // verb lemmas. If a verb lemma in AGID either does not exist yet, or does
485 { 100 // exist but is not related to any words that are related to verb notions,
486 inflection = line.substr(0, divider); 101 // then a notion and a word is generated and the form generation proceeds
487 line = line.substr(divider + 3); 102 // as usual.
488 } else { 103 readAgidInflections();
489 inflection = line; 104
490 line = ""; 105 // Reads in prepositions and the is_a relationship
491 } 106 readPrepositions();
492 107
493 if ((divider = inflection.find_first_of(",?")) != std::string::npos) 108 // Creates pronunciations from CMUDICT. To reduce the amount of
494 { 109 // pronunciations created, we do this after all forms have been created,
495 inflection = inflection.substr(0, divider); 110 // and then only generate pronunciations for already-existing forms.
496 } 111 readCmudictPronunciations();
497 112
498 forms.push_back(inflection); 113 // Writes the database schema
114 writeSchema();
115
116 // Dumps data to the database
117 dumpObjects();
118
119 // Populates the antonymy relationship from WordNet
120 readWordNetAntonymy();
121
122 // Populates the variation relationship from WordNet
123 readWordNetVariation();
124
125 // Populates the usage, topicality, and regionality relationships from
126 // WordNet
127 readWordNetClasses();
128
129 // Populates the causality relationship from WordNet
130 readWordNetCausality();
131
132 // Populates the entailment relationship from WordNet
133 readWordNetEntailment();
134
135 // Populates the hypernymy relationship from WordNet
136 readWordNetHypernymy();
137
138 // Populates the instantiation relationship from WordNet
139 readWordNetInstantiation();
140
141 // Populates the member meronymy relationship from WordNet
142 readWordNetMemberMeronymy();
143
144 // Populates the part meronymy relationship from WordNet
145 readWordNetPartMeronymy();
146
147 // Populates the substance meronymy relationship from WordNet
148 readWordNetSubstanceMeronymy();
149
150 // Populates the pertainymy and mannernymy relationships from WordNet
151 readWordNetPertainymy();
152
153 // Populates the specification relationship from WordNet
154 readWordNetSpecification();
155
156 // Populates the adjective similarity relationship from WordNet
157 readWordNetSimilarity();
158
159
160
161
162
163
164
165
499 } 166 }
500 167
501 switch (type) 168 void generator::readWordNetSynsets()
502 { 169 {
503 case 'V': 170 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
171 progress ppgs("Reading synsets from WordNet...", lines.size());
172
173 for (std::string line : lines)
504 { 174 {
505 verb_t v; 175 ppgs.update();
506 v.infinitive = word; 176
507 if (forms.size() == 4) 177 std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");
508 { 178 std::smatch relation_data;
509 v.past_tense = forms[0]; 179 if (!std::regex_search(line, relation_data, relation))
510 v.past_participle = forms[1]; 180 {
511 v.ing_form = forms[2]; 181 continue;
512 v.s_form = forms[3];
513 } else if (forms.size() == 3)
514 {
515 v.past_tense = forms[0];
516 v.past_participle = forms[0];
517 v.ing_form = forms[1];
518 v.s_form = forms[2];
519 } else if (forms.size() == 8)
520 {
521 // As of AGID 2014.08.11, this is only "to be"
522 v.past_tense = forms[0];
523 v.past_participle = forms[2];
524 v.ing_form = forms[3];
525 v.s_form = forms[4];
526 } else {
527 // Words that don't fit the cases above as of AGID 2014.08.11:
528 // - may and shall do not conjugate the way we want them to
529 // - methinks only has a past tense and is an outlier
530 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
531 std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl;
532 } 182 }
533 183
534 verbs[word] = v; 184 int synset_id = std::stoi(relation_data[1]);
535 185 int wnum = std::stoi(relation_data[2]);
536 break; 186 std::string text = relation_data[3];
537 } 187 int tag_count = std::stoi(relation_data[4]);
538 188 size_t word_it;
539 case 'A': 189 while ((word_it = text.find("''")) != std::string::npos)
540 {
541 adjective_t adj;
542 adj.base = word;
543 if (forms.size() == 2)
544 { 190 {
545 adj.comparative = forms[0]; 191 text.erase(word_it, 1);
546 adj.superlative = forms[1];
547 } else {
548 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
549 std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl;
550 } 192 }
551 193
552 adjectives[word] = adj; 194 // The WordNet data does contain duplicates, so we need to check that we
553 195 // haven't already created this word.
554 break; 196 std::pair<int, int> lookup(synset_id, wnum);
555 } 197 if (!wordByWnidAndWnum_.count(lookup))
556
557 case 'N':
558 {
559 noun_t n;
560 n.singular = word;
561 if (forms.size() == 1)
562 { 198 {
563 n.plural = forms[0]; 199 notion& synset = lookupOrCreateNotion(synset_id);
564 } else { 200 lemma& lex = lookupOrCreateLemma(text);
565 // As of AGID 2014.08.11, this is non-existent. 201 word& entry = createWord(synset, lex, tag_count);
566 std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; 202
203 wordByWnidAndWnum_[lookup] = &entry;
567 } 204 }
568
569 nouns[word] = n;
570
571 break;
572 } 205 }
573 } 206 }
574 }
575
576 // Pronounciations
577 std::cout << "Reading pronunciations..." << std::endl;
578
579 std::ifstream pronfile(argv[4]);
580 if (!pronfile.is_open())
581 {
582 std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl;
583 print_usage();
584 }
585
586 for (;;)
587 {
588 std::string line;
589 if (!getline(pronfile, line))
590 {
591 break;
592 }
593
594 if (line.back() == '\r')
595 {
596 line.pop_back();
597 }
598 207
599 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); 208 void generator::readAdjectivePositioning()
600 std::smatch phoneme_data;
601 if (std::regex_search(line, phoneme_data, phoneme))
602 { 209 {
603 std::string canonical(phoneme_data[1]); 210 std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl"));
604 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); 211 progress ppgs("Reading adjective positionings from WordNet...", lines.size());
605
606 std::string phonemes = phoneme_data[2];
607 auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " ");
608 auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) {
609 return phoneme.find("1") != std::string::npos;
610 });
611 212
612 pronunciation_t p; 213 for (std::string line : lines)
613 p.phonemes = phonemes;
614
615 // Rhyme detection
616 if (phemstrt != std::end(phoneme_set))
617 { 214 {
618 std::stringstream rhymer; 215 ppgs.update();
619 for (auto it = phemstrt; it != std::end(phoneme_set); it++)
620 {
621 std::string naked;
622 std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) {
623 return isdigit(ch);
624 });
625
626 if (it != phemstrt)
627 {
628 rhymer << " ";
629 }
630
631 rhymer << naked;
632 }
633 216
634 p.rhyme = rhymer.str(); 217 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\.");
635 218 std::smatch relation_data;
636 if (phemstrt != std::begin(phoneme_set)) 219 if (!std::regex_search(line, relation_data, relation))
637 { 220 {
638 phemstrt--; 221 continue;
639 p.prerhyme = *phemstrt;
640 } else {
641 p.prerhyme = "";
642 } 222 }
643 } else {
644 p.prerhyme = "";
645 p.rhyme = "";
646 }
647 223
648 // Syllable/stress 224 int synset_id = stoi(relation_data[1]);
649 for (auto phm : phoneme_set) 225 int wnum = stoi(relation_data[2]);
650 { 226 std::string adjpos_str = relation_data[3];
651 if (isdigit(phm.back()))
652 {
653 // It's a vowel!
654 p.syllables++;
655 227
656 if (phm.back() == '1') 228 std::pair<int, int> lookup(synset_id, wnum);
229 if (wordByWnidAndWnum_.count(lookup))
230 {
231 word& adj = *wordByWnidAndWnum_.at(lookup);
232
233 if (adjpos_str == "p")
234 {
235 adj.setAdjectivePosition(positioning::predicate);
236 } else if (adjpos_str == "a")
237 {
238 adj.setAdjectivePosition(positioning::attributive);
239 } else if (adjpos_str == "i")
657 { 240 {
658 p.stress.push_back('1'); 241 adj.setAdjectivePosition(positioning::postnominal);
659 } else { 242 } else {
660 p.stress.push_back('0'); 243 // Can't happen because of how we specified the regex.
244 assert(false);
661 } 245 }
662 } 246 }
663 } 247 }
664
665 pronunciations[canonical].insert(p);
666 }
667 }
668
669 // Images
670 std::cout << "Reading images..." << std::endl;
671
672 std::ifstream imagefile(argv[5]);
673 if (!imagefile.is_open())
674 {
675 std::cout << "Could not open ImageNet file: " << argv[5] << std::endl;
676 print_usage();
677 }
678
679 for (;;)
680 {
681 std::string line;
682 if (!getline(imagefile, line))
683 {
684 break;
685 }
686
687 if (line.back() == '\r')
688 {
689 line.pop_back();
690 }
691
692 std::string wnid_s = line.substr(1, 8);
693 int wnid = stoi(wnid_s) + 100000000;
694 images[wnid]++;
695 }
696
697 imagefile.close();
698
699 // Start writing output
700 std::cout << "Writing schema..." << std::endl;
701
702 sqlite3* ppdb;
703 if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK)
704 {
705 std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl;
706 print_usage();
707 }
708
709 std::ifstream schemafile("schema.sql");
710 if (!schemafile.is_open())
711 {
712 std::cout << "Could not find schema file" << std::endl;
713 print_usage();
714 }
715
716 std::stringstream schemabuilder;
717 for (;;)
718 {
719 std::string line;
720 if (!getline(schemafile, line))
721 {
722 break;
723 }
724
725 if (line.back() == '\r')
726 {
727 line.pop_back();
728 }
729
730 schemabuilder << line << std::endl;
731 }
732
733 std::string schema = schemabuilder.str();
734 while (!schema.empty())
735 {
736 std::string query;
737 int divider = schema.find(";");
738 if (divider != std::string::npos)
739 {
740 query = schema.substr(0, divider+1);
741 schema = schema.substr(divider+2);
742 } else {
743 break;
744 } 248 }
745 249
746 sqlite3_stmt* schmstmt; 250 void generator::readImageNetUrls()
747 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK)
748 { 251 {
749 db_error(ppdb, query); 252 // The ImageNet datafile is so large that it is unreasonable and
750 } 253 // unnecessary to read it into memory; instead, we will parse each line as
751 254 // we read it. This has the caveat that we cannot display a progress bar.
752 if (sqlite3_step(schmstmt) != SQLITE_DONE) 255 std::cout << "Reading image counts from ImageNet..." << std::endl;
753 {
754 db_error(ppdb, query);
755 }
756
757 sqlite3_finalize(schmstmt);
758 }
759
760 std::cout << "Writing prepositions..." << std::endl;
761 std::ifstream prepfile("prepositions.txt");
762 if (!prepfile.is_open())
763 {
764 std::cout << "Could not find prepositions file" << std::endl;
765 print_usage();
766 }
767
768 for (;;)
769 {
770 std::string line;
771 if (!getline(prepfile, line))
772 {
773 break;
774 }
775
776 if (line.back() == '\r')
777 {
778 line.pop_back();
779 }
780
781 std::regex relation("^([^:]+): (.+)");
782 std::smatch relation_data;
783 std::regex_search(line, relation_data, relation);
784 std::string prep = relation_data[1];
785 std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", ");
786
787 std::string query("INSERT INTO prepositions (form) VALUES (?)");
788 sqlite3_stmt* ppstmt;
789
790 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
791 {
792 db_error(ppdb, query);
793 }
794
795 sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT);
796
797 if (sqlite3_step(ppstmt) != SQLITE_DONE)
798 {
799 db_error(ppdb, query);
800 }
801
802 sqlite3_finalize(ppstmt);
803
804 query = "SELECT last_insert_rowid()";
805 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
806 {
807 db_error(ppdb, query);
808 }
809
810 if (sqlite3_step(ppstmt) != SQLITE_ROW)
811 {
812 db_error(ppdb, query);
813 }
814
815 int rowid = sqlite3_column_int(ppstmt, 0);
816 sqlite3_finalize(ppstmt);
817
818 for (auto group : groups)
819 {
820 query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)";
821 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
822 {
823 db_error(ppdb, query);
824 }
825 256
826 sqlite3_bind_int(ppstmt, 1, rowid); 257 std::ifstream file(imageNetPath_);
827 sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); 258 if (!file)
828
829 if (sqlite3_step(ppstmt) != SQLITE_DONE)
830 { 259 {
831 db_error(ppdb, query); 260 throw std::invalid_argument("Could not find file " + imageNetPath_);
832 } 261 }
833
834 sqlite3_finalize(ppstmt);
835 }
836 }
837
838 262
839 { 263 std::string line;
840 progress ppgs("Writing verbs...", verbs.size()); 264 while (std::getline(file, line))
841 for (auto& mapping : verbs)
842 {
843 sqlite3_stmt* ppstmt;
844 std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)");
845 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
846 {
847 db_error(ppdb, query);
848 }
849
850 sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT);
851 sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT);
852 sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT);
853 sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT);
854 sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT);
855
856 if (sqlite3_step(ppstmt) != SQLITE_DONE)
857 {
858 db_error(ppdb, query);
859 }
860
861 sqlite3_finalize(ppstmt);
862
863 std::string canonical(mapping.second.infinitive);
864 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
865 if (pronunciations.count(canonical) == 1)
866 { 265 {
867 query = "SELECT last_insert_rowid()"; 266 if (line.back() == '\r')
868 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
869 { 267 {
870 db_error(ppdb, query); 268 line.pop_back();
871 } 269 }
872 270
873 if (sqlite3_step(ppstmt) != SQLITE_ROW) 271 std::string wnid_s = line.substr(1, 8);
272 int wnid = stoi(wnid_s) + 100000000;
273 if (notionByWnid_.count(wnid))
874 { 274 {
875 db_error(ppdb, query); 275 // We know that this notion has a wnid and is a noun.
876 } 276 notionByWnid_.at(wnid)->incrementNumOfImages();
877
878 int rowid = sqlite3_column_int(ppstmt, 0);
879
880 sqlite3_finalize(ppstmt);
881
882 mapping.second.id = rowid;
883
884 for (auto pronunciation : pronunciations[canonical])
885 {
886 if (!pronunciation.rhyme.empty())
887 {
888 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)";
889 } else {
890 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)";
891 }
892
893 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
894 {
895 db_error(ppdb, query);
896 }
897
898 sqlite3_bind_int(ppstmt, 1, rowid);
899 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
900 sqlite3_bind_int(ppstmt, 3, pronunciation.syllables);
901 sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT);
902
903 if (!pronunciation.rhyme.empty())
904 {
905 sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
906 sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
907 }
908
909 if (sqlite3_step(ppstmt) != SQLITE_DONE)
910 {
911 db_error(ppdb, query);
912 }
913
914 sqlite3_finalize(ppstmt);
915 } 277 }
916 } 278 }
917
918 ppgs.update();
919 } 279 }
920 } 280
921 281 void generator::readWordNetSenseKeys()
922 {
923 progress ppgs("Writing verb frames...", groups.size());
924 for (auto& mapping : groups)
925 { 282 {
926 std::list<json> roledatal; 283 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl"));
927 std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { 284 progress ppgs("Reading sense keys from WordNet...", lines.size());
928 json role;
929 role["type"] = r.first;
930 role["selrestrs"] = export_selrestrs(r.second);
931
932 return role;
933 });
934
935 json roledata(roledatal);
936 std::string rdm = roledata.dump();
937
938 sqlite3_stmt* ppstmt;
939 std::string query("INSERT INTO groups (data) VALUES (?)");
940 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
941 {
942 db_error(ppdb, query);
943 }
944
945 sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT);
946
947 if (sqlite3_step(ppstmt) != SQLITE_DONE)
948 {
949 db_error(ppdb, query);
950 }
951 285
952 sqlite3_finalize(ppstmt); 286 for (std::string line : lines)
953
954 query = "SELECT last_insert_rowid()";
955 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
956 {
957 db_error(ppdb, query);
958 }
959
960 if (sqlite3_step(ppstmt) != SQLITE_ROW)
961 {
962 db_error(ppdb, query);
963 }
964
965 int gid = sqlite3_column_int(ppstmt, 0);
966 sqlite3_finalize(ppstmt);
967
968 for (auto frame : mapping.second.frames)
969 { 287 {
970 std::list<json> fdatap; 288 ppgs.update();
971 std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) {
972 json part;
973
974 switch (fp.type)
975 {
976 case framepart_t::type_t::np:
977 {
978 part["type"] = "np";
979 part["role"] = fp.role;
980 part["selrestrs"] = export_selrestrs(fp.selrestrs);
981 part["synrestrs"] = fp.synrestrs;
982
983 break;
984 }
985
986 case framepart_t::type_t::pp:
987 {
988 part["type"] = "pp";
989 part["values"] = fp.choices;
990 part["preprestrs"] = fp.preprestrs;
991
992 break;
993 }
994
995 case framepart_t::type_t::v:
996 {
997 part["type"] = "v";
998
999 break;
1000 }
1001
1002 case framepart_t::type_t::adj:
1003 {
1004 part["type"] = "adj";
1005
1006 break;
1007 }
1008
1009 case framepart_t::type_t::adv:
1010 {
1011 part["type"] = "adv";
1012
1013 break;
1014 }
1015
1016 case framepart_t::type_t::lex:
1017 {
1018 part["type"] = "lex";
1019 part["value"] = fp.lexval;
1020
1021 break;
1022 }
1023 }
1024
1025 return part;
1026 });
1027
1028 json fdata(fdatap);
1029 std::string marshall = fdata.dump();
1030
1031 query = "INSERT INTO frames (group_id, data) VALUES (?, ?)";
1032 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1033 {
1034 db_error(ppdb, query);
1035 }
1036
1037 sqlite3_bind_int(ppstmt, 1, gid);
1038 sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT);
1039 289
1040 if (sqlite3_step(ppstmt) != SQLITE_DONE) 290 // We only actually need to lookup verbs by sense key so we'll just
291 // ignore everything that isn't a verb.
292 std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$");
293 std::smatch relation_data;
294 if (!std::regex_search(line, relation_data, relation))
1041 { 295 {
1042 db_error(ppdb, query); 296 continue;
1043 } 297 }
298
299 int synset_id = stoi(relation_data[1]);
300 int wnum = stoi(relation_data[2]);
301 std::string sense_key = relation_data[3];
1044 302
1045 sqlite3_finalize(ppstmt); 303 // We are treating this mapping as injective, which is not entirely
1046 } 304 // accurate. First, the WordNet table contains duplicate rows, so those
1047 305 // need to be ignored. More importantly, a small number of sense keys
1048 for (auto member : mapping.second.members) 306 // (one for each letter of the Latin alphabet, plus 9 other words) each
1049 { 307 // map to two different words in the same synset which differ only by
1050 if (verbs.count(member) == 1) 308 // capitalization. Luckily, none of these exceptions are verbs, so we
309 // can pretend that the mapping is injective.
310 if (!wnSenseKeys_.count(sense_key))
1051 { 311 {
1052 auto& v = verbs[member]; 312 std::pair<int, int> lookup(synset_id, wnum);
1053 313 if (wordByWnidAndWnum_.count(lookup))
1054 query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)";
1055 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1056 {
1057 db_error(ppdb, query);
1058 }
1059
1060 sqlite3_bind_int(ppstmt, 1, v.id);
1061 sqlite3_bind_int(ppstmt, 2, gid);
1062
1063 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1064 { 314 {
1065 db_error(ppdb, query); 315 wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup);
1066 } 316 }
1067
1068 sqlite3_finalize(ppstmt);
1069 } 317 }
1070 } 318 }
1071
1072 ppgs.update();
1073 } 319 }
1074 } 320
1075 321 void generator::readVerbNet()
1076 // Get nouns/adjectives/adverbs from WordNet
1077 // Useful relations:
1078 // - s: master list
1079 // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness)
1080 // - at: variation (e.g. a measurement can be standard or nonstandard)
1081 // - der: derivation (e.g. happy/happily, happily/happy)
1082 // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue)
1083 // - ins: instantiation (do we need this? let's see)
1084 // - mm: member meronymy/holonymy (e.g. family/mother, family/child)
1085 // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire)
1086 // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber)
1087 // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska)
1088 // mannernymy (e.g. something done quickly is done in a manner that is quick)
1089 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
1090 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
1091 // - syntax: positioning flags for some adjectives
1092 std::string wnpref {argv[3]};
1093 if (wnpref.back() != '/')
1094 {
1095 wnpref += '/';
1096 }
1097
1098 // s table
1099 {
1100 std::ifstream wnsfile(wnpref + "wn_s.pl");
1101 if (!wnsfile.is_open())
1102 { 322 {
1103 std::cout << "Invalid WordNet data directory." << std::endl; 323 std::cout << "Reading frames from VerbNet..." << std::endl;
1104 print_usage();
1105 }
1106 324
1107 std::list<std::string> lines; 325 DIR* dir;
1108 for (;;) 326 if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
1109 {
1110 std::string line;
1111 if (!getline(wnsfile, line))
1112 { 327 {
1113 break; 328 throw std::invalid_argument("Invalid VerbNet data directory");
1114 } 329 }
1115 330
1116 if (line.back() == '\r') 331 struct dirent* ent;
1117 { 332 while ((ent = readdir(dir)) != nullptr)
1118 line.pop_back();
1119 }
1120
1121 lines.push_back(line);
1122 }
1123
1124 progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size());
1125 for (auto line : lines)
1126 {
1127 ppgs.update();
1128
1129 std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$");
1130 std::smatch relation_data;
1131 if (!std::regex_search(line, relation_data, relation))
1132 { 333 {
1133 continue; 334 std::string filename(verbNetPath_);
1134 } 335
336 if (filename.back() != '/')
337 {
338 filename += '/';
339 }
1135 340
1136 int synset_id = stoi(relation_data[1]); 341 filename += ent->d_name;
1137 int wnum = stoi(relation_data[2]);
1138 std::string word = relation_data[3];
1139 size_t word_it;
1140 while ((word_it = word.find("''")) != std::string::npos)
1141 {
1142 word.erase(word_it, 1);
1143 }
1144 342
1145 std::string query; 343 if (filename.rfind(".xml") != filename.size() - 4)
1146 switch (synset_id / 100000000)
1147 {
1148 case 1: // Noun
1149 { 344 {
1150 if (nouns.count(word) == 1) 345 continue;
1151 {
1152 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)";
1153 } else {
1154 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)";
1155 }
1156
1157 break;
1158 } 346 }
1159 347
1160 case 2: // Verb 348 xmlDocPtr doc = xmlParseFile(filename.c_str());
349 if (doc == nullptr)
1161 { 350 {
1162 // Ignore 351 throw std::logic_error("Error opening " + filename);
1163
1164 break;
1165 } 352 }
1166 353
1167 case 3: // Adjective 354 xmlNodePtr top = xmlDocGetRootElement(doc);
355 if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS"))))
1168 { 356 {
1169 if (adjectives.count(word) == 1) 357 throw std::logic_error("Bad VerbNet file format: " + filename);
1170 {
1171 query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)";
1172 } else {
1173 query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)";
1174 }
1175
1176 break;
1177 } 358 }
1178 359
1179 case 4: // Adverb 360 try
1180 { 361 {
1181 if (adjectives.count(word) == 1) 362 createGroup(top);
1182 { 363 } catch (const std::exception& e)
1183 query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; 364 {
1184 } else { 365 std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename));
1185 query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)";
1186 }
1187
1188 break;
1189 } 366 }
1190 } 367 }
368
369 closedir(dir);
370 }
1191 371
1192 sqlite3_stmt* ppstmt; 372 void generator::readAgidInflections()
1193 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) 373 {
374 std::list<std::string> lines(readFile(agidPath_));
375 progress ppgs("Reading inflections from AGID...", lines.size());
376
377 for (std::string line : lines)
1194 { 378 {
1195 db_error(ppdb, query); 379 ppgs.update();
1196 } 380
381 int divider = line.find_first_of(" ");
382 std::string infinitive = line.substr(0, divider);
383 line = line.substr(divider+1);
384 char type = line[0];
1197 385
1198 sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); 386 if (line[1] == '?')
1199 switch (synset_id / 100000000)
1200 {
1201 case 1: // Noun
1202 { 387 {
1203 sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { 388 line.erase(0, 4);
1204 return isupper(ch); 389 } else {
1205 }) ? 1 : 0)); 390 line.erase(0, 3);
1206
1207 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size());
1208 sqlite3_bind_int(ppstmt, 4, images[synset_id]);
1209 sqlite3_bind_int(ppstmt, 5, synset_id);
1210
1211 if (nouns.count(word) == 1)
1212 {
1213 sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT);
1214 }
1215
1216 break;
1217 } 391 }
1218 392
1219 case 3: // Adjective 393 if (!lemmaByBaseForm_.count(infinitive) && (type != 'V'))
1220 case 4: // Adverb
1221 { 394 {
1222 sqlite3_bind_int(ppstmt, 2, verbly::split<std::list<std::string>>(word, " ").size()); 395 continue;
1223 396 }
1224 if (adjectives.count(word) == 1) 397
398 lemma& curLemma = lookupOrCreateLemma(infinitive);
399
400 auto forms = split<std::vector<std::string>>(line, " | ");
401 for (std::string& inflForm : forms)
402 {
403 int sympos = inflForm.find_first_of(",?");
404 if (sympos != std::string::npos)
1225 { 405 {
1226 sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); 406 inflForm = inflForm.substr(0, sympos);
1227 sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT);
1228 } 407 }
1229
1230 break;
1231 } 408 }
1232 }
1233 409
1234 if (sqlite3_step(ppstmt) != SQLITE_DONE) 410 switch (type)
1235 {
1236 db_error(ppdb, query);
1237 }
1238
1239 sqlite3_finalize(ppstmt);
1240
1241 query = "SELECT last_insert_rowid()";
1242 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1243 {
1244 db_error(ppdb, query);
1245 }
1246
1247 if (sqlite3_step(ppstmt) != SQLITE_ROW)
1248 {
1249 db_error(ppdb, query);
1250 }
1251
1252 int rowid = sqlite3_column_int(ppstmt, 0);
1253 wn[synset_id][wnum] = rowid;
1254
1255 sqlite3_finalize(ppstmt);
1256
1257 std::string canonical(word);
1258 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
1259 if (pronunciations.count(canonical) == 1)
1260 {
1261 for (auto pronunciation : pronunciations[canonical])
1262 { 411 {
1263 switch (synset_id / 100000000) 412 case 'V':
1264 { 413 {
1265 case 1: // Noun 414 if (forms.size() == 4)
1266 { 415 {
1267 if (!pronunciation.rhyme.empty()) 416 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
1268 { 417 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1]));
1269 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; 418 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2]));
1270 } else { 419 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3]));
1271 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; 420 } else if (forms.size() == 3)
1272 }
1273
1274 break;
1275 }
1276
1277 case 3: // Adjective
1278 { 421 {
1279 if (!pronunciation.rhyme.empty()) 422 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
1280 { 423 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0]));
1281 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; 424 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1]));
1282 } else { 425 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2]));
1283 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; 426 } else if (forms.size() == 8)
1284 } 427 {
1285 428 // As of AGID 2014.08.11, this is only "to be"
1286 break; 429 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
430 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2]));
431 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3]));
432 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4]));
433 } else {
434 // Words that don't fit the cases above as of AGID 2014.08.11:
435 // - may and shall do not conjugate the way we want them to
436 // - methinks only has a past tense and is an outlier
437 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
438 std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1287 } 439 }
1288 440
1289 case 4: // Adverb 441 // For verbs in particular, we sometimes create a notion and a word
442 // from inflection data. Specifically, if there are not yet any
443 // verbs existing that have the same infinitive form. "Yet" means
444 // that this verb appears in the AGID data but not in either WordNet
445 // or VerbNet.
446 if (!wordsByBaseForm_.count(infinitive)
447 || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) {
448 return w->getNotion().getPartOfSpeech() == part_of_speech::verb;
449 }))
1290 { 450 {
1291 if (!pronunciation.rhyme.empty()) 451 notion& n = createNotion(part_of_speech::verb);
1292 { 452 createWord(n, curLemma);
1293 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)";
1294 } else {
1295 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)";
1296 }
1297
1298 break;
1299 } 453 }
1300 }
1301
1302 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1303 {
1304 db_error(ppdb, query);
1305 }
1306
1307 sqlite3_bind_int(ppstmt, 1, rowid);
1308 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
1309 sqlite3_bind_int(ppstmt, 3, pronunciation.syllables);
1310 sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT);
1311
1312 if (!pronunciation.rhyme.empty())
1313 {
1314 sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
1315 sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
1316 }
1317 454
1318 if (sqlite3_step(ppstmt) != SQLITE_DONE) 455 break;
1319 {
1320 db_error(ppdb, query);
1321 } 456 }
1322
1323 sqlite3_finalize(ppstmt);
1324 }
1325 }
1326 }
1327 }
1328
1329 // While we're working on s
1330 {
1331 progress ppgs("Writing word synonyms...", wn.size());
1332 for (auto sense : wn)
1333 {
1334 ppgs.update();
1335 457
1336 for (auto word1 : sense.second) 458 case 'A':
1337 {
1338 for (auto word2 : sense.second)
1339 {
1340 if (word1 != word2)
1341 { 459 {
1342 std::string query; 460 if (forms.size() == 2)
1343 switch (sense.first / 100000000)
1344 { 461 {
1345 case 1: // Noun 462 curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0]));
1346 { 463 curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1]));
1347 query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; 464 } else {
1348 465 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
1349 break; 466 std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1350 } 467 }
1351
1352 case 2: // Verb
1353 {
1354 // Ignore
1355
1356 break;
1357 }
1358
1359 case 3: // Adjective
1360 {
1361 query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
1362 468
1363 break; 469 break;
1364 } 470 }
1365 471
1366 case 4: // Adverb 472 case 'N':
1367 { 473 {
1368 query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; 474 if (forms.size() == 1)
1369
1370 break;
1371 }
1372 }
1373
1374 sqlite3_stmt* ppstmt;
1375 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1376 {
1377 db_error(ppdb, query);
1378 }
1379
1380 sqlite3_bind_int(ppstmt, 1, word1.second);
1381 sqlite3_bind_int(ppstmt, 2, word2.second);
1382
1383 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1384 { 475 {
1385 db_error(ppdb, query); 476 curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0]));
477 } else {
478 // As of AGID 2014.08.11, this is non-existent.
479 std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1386 } 480 }
1387 481
1388 sqlite3_finalize(ppstmt); 482 break;
1389 } 483 }
1390 } 484 }
1391 } 485 }
1392 } 486 }
1393 }
1394
1395 // ant table
1396 {
1397 std::ifstream wnantfile(wnpref + "wn_ant.pl");
1398 if (!wnantfile.is_open())
1399 {
1400 std::cout << "Invalid WordNet data directory." << std::endl;
1401 print_usage();
1402 }
1403
1404 std::list<std::string> lines;
1405 for (;;)
1406 {
1407 std::string line;
1408 if (!getline(wnantfile, line))
1409 {
1410 break;
1411 }
1412 487
1413 if (line.back() == '\r') 488 void generator::readPrepositions()
1414 {
1415 line.pop_back();
1416 }
1417
1418 lines.push_back(line);
1419 }
1420
1421 progress ppgs("Writing antonyms...", lines.size());
1422 for (auto line : lines)
1423 { 489 {
1424 ppgs.update(); 490 std::list<std::string> lines(readFile("prepositions.txt"));
491 progress ppgs("Reading prepositions...", lines.size());
1425 492
1426 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); 493 for (std::string line : lines)
1427 std::smatch relation_data;
1428 if (!std::regex_search(line, relation_data, relation))
1429 {
1430 continue;
1431 }
1432
1433 int synset_id_1 = stoi(relation_data[1]);
1434 int wnum_1 = stoi(relation_data[2]);
1435 int synset_id_2 = stoi(relation_data[3]);
1436 int wnum_2 = stoi(relation_data[4]);
1437
1438 std::string query;
1439 switch (synset_id_1 / 100000000)
1440 { 494 {
1441 case 1: // Noun 495 ppgs.update();
1442 {
1443 query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)";
1444 496
1445 break; 497 std::regex relation("^([^:]+): (.+)");
1446 } 498 std::smatch relation_data;
1447 499 std::regex_search(line, relation_data, relation);
1448 case 2: // Verb 500 std::string prep = relation_data[1];
1449 { 501 auto groups = split<std::list<std::string>>(relation_data[2], ", ");
1450 // Ignore
1451 502
1452 break; 503 notion& n = createNotion(part_of_speech::preposition);
1453 } 504 lemma& l = lookupOrCreateLemma(prep);
1454 505 word& w = createWord(n, l);
1455 case 3: // Adjective
1456 {
1457 query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
1458 506
1459 break; 507 n.setPrepositionGroups(groups);
1460 }
1461
1462 case 4: // Adverb
1463 {
1464 query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)";
1465
1466 break;
1467 }
1468 }
1469
1470 sqlite3_stmt* ppstmt;
1471 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1472 {
1473 db_error(ppdb, query);
1474 }
1475
1476 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1477 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1478
1479 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1480 {
1481 db_error(ppdb, query);
1482 }
1483
1484 sqlite3_finalize(ppstmt);
1485 }
1486 }
1487
1488 // at table
1489 {
1490 std::ifstream wnatfile(wnpref + "wn_at.pl");
1491 if (!wnatfile.is_open())
1492 {
1493 std::cout << "Invalid WordNet data directory." << std::endl;
1494 print_usage();
1495 }
1496
1497 std::list<std::string> lines;
1498 for (;;)
1499 {
1500 std::string line;
1501 if (!getline(wnatfile, line))
1502 {
1503 break;
1504 } 508 }
1505
1506 if (line.back() == '\r')
1507 {
1508 line.pop_back();
1509 }
1510
1511 lines.push_back(line);
1512 } 509 }
1513 510
1514 progress ppgs("Writing variations...", lines.size()); 511 void generator::readCmudictPronunciations()
1515 for (auto line : lines)
1516 { 512 {
1517 ppgs.update(); 513 std::list<std::string> lines(readFile(cmudictPath_));
514 progress ppgs("Reading pronunciations from CMUDICT...", lines.size());
1518 515
1519 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); 516 for (std::string line : lines)
1520 std::smatch relation_data;
1521 if (!std::regex_search(line, relation_data, relation))
1522 { 517 {
1523 continue; 518 ppgs.update();
1524 } 519
1525 520 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)");
1526 int synset_id_1 = stoi(relation_data[1]); 521 std::smatch phoneme_data;
1527 int synset_id_2 = stoi(relation_data[2]); 522 if (std::regex_search(line, phoneme_data, phoneme))
1528 std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)");
1529
1530 for (auto mapping1 : wn[synset_id_1])
1531 {
1532 for (auto mapping2 : wn[synset_id_2])
1533 { 523 {
1534 sqlite3_stmt* ppstmt; 524 std::string canonical(phoneme_data[1]);
1535 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 525 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
1536 {
1537 db_error(ppdb, query);
1538 }
1539
1540 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1541 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1542 526
1543 if (sqlite3_step(ppstmt) != SQLITE_DONE) 527 if (!formByText_.count(canonical))
1544 { 528 {
1545 db_error(ppdb, query); 529 continue;
1546 } 530 }
1547 531
1548 sqlite3_finalize(ppstmt); 532 std::string phonemes = phoneme_data[2];
533 pronunciations_.emplace_back(phonemes);
534 pronunciation& p = pronunciations_.back();
535 formByText_.at(canonical)->addPronunciation(p);
1549 } 536 }
1550 } 537 }
1551 } 538 }
1552 }
1553
1554 // der table
1555 {
1556 std::ifstream wnderfile(wnpref + "wn_der.pl");
1557 if (!wnderfile.is_open())
1558 {
1559 std::cout << "Invalid WordNet data directory." << std::endl;
1560 print_usage();
1561 }
1562 539
1563 std::list<std::string> lines; 540 void generator::writeSchema()
1564 for (;;)
1565 { 541 {
1566 std::string line; 542 std::ifstream file("schema.sql");
1567 if (!getline(wnderfile, line)) 543 if (!file)
1568 { 544 {
1569 break; 545 throw std::invalid_argument("Could not find database schema");
1570 } 546 }
1571 547
1572 if (line.back() == '\r') 548 std::ostringstream schemaBuilder;
549 std::string line;
550 while (std::getline(file, line))
1573 { 551 {
1574 line.pop_back(); 552 if (line.back() == '\r')
553 {
554 line.pop_back();
555 }
556
557 schemaBuilder << line;
1575 } 558 }
1576 559
1577 lines.push_back(line); 560 std::string schema = schemaBuilder.str();
561 auto queries = split<std::list<std::string>>(schema, ";");
562 progress ppgs("Writing database schema...", queries.size());
563 for (std::string query : queries)
564 {
565 if (!queries.empty())
566 {
567 db_.runQuery(query);
568 }
569
570 ppgs.update();
571 }
1578 } 572 }
1579 573
1580 progress ppgs("Writing morphological derivation...", lines.size()); 574 void generator::dumpObjects()
1581 for (auto line : lines)
1582 { 575 {
1583 ppgs.update();
1584
1585 std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
1586 std::smatch relation_data;
1587 if (!std::regex_search(line, relation_data, relation))
1588 { 576 {
1589 continue; 577 progress ppgs("Writing notions...", notions_.size());
578
579 for (notion& n : notions_)
580 {
581 db_ << n;
582
583 ppgs.update();
584 }
1590 } 585 }
1591 586
1592 int synset_id_1 = stoi(relation_data[1]);
1593 int wnum_1 = stoi(relation_data[2]);
1594 int synset_id_2 = stoi(relation_data[3]);
1595 int wnum_2 = stoi(relation_data[4]);
1596 std::string query;
1597 switch (synset_id_1 / 100000000)
1598 { 587 {
1599 case 1: // Noun 588 progress ppgs("Writing words...", words_.size());
589
590 for (word& w : words_)
1600 { 591 {
1601 switch (synset_id_2 / 100000000) 592 db_ << w;
1602 {
1603 case 1: // Noun
1604 {
1605 query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)";
1606 break;
1607 }
1608
1609 case 3: // Adjective
1610 {
1611 query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)";
1612 break;
1613 }
1614
1615 case 4: // Adverb
1616 {
1617 query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)";
1618 break;
1619 }
1620 }
1621 593
1622 break; 594 ppgs.update();
1623 } 595 }
596 }
597
598 {
599 progress ppgs("Writing lemmas...", lemmas_.size());
1624 600
1625 case 3: // Adjective 601 for (lemma& l : lemmas_)
1626 { 602 {
1627 switch (synset_id_2 / 100000000) 603 db_ << l;
1628 {
1629 case 1: // Noun
1630 {
1631 query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)";
1632 break;
1633 }
1634
1635 case 3: // Adjective
1636 {
1637 query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)";
1638 break;
1639 }
1640
1641 case 4: // Adverb
1642 {
1643 query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)";
1644 break;
1645 }
1646 }
1647 604
1648 break; 605 ppgs.update();
1649 } 606 }
607 }
608
609 {
610 progress ppgs("Writing forms...", forms_.size());
1650 611
1651 case 4: // Adverb 612 for (form& f : forms_)
1652 { 613 {
1653 switch (synset_id_2 / 100000000) 614 db_ << f;
1654 {
1655 case 1: // Noun
1656 {
1657 query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)";
1658 break;
1659 }
1660
1661 case 3: // Adjective
1662 {
1663 query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)";
1664 break;
1665 }
1666
1667 case 4: // Adverb
1668 {
1669 query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)";
1670 break;
1671 }
1672 }
1673 615
1674 break; 616 ppgs.update();
1675 } 617 }
1676 } 618 }
1677 619
1678 sqlite3_stmt* ppstmt;
1679 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1680 { 620 {
1681 db_error(ppdb, query); 621 progress ppgs("Writing pronunciations...", pronunciations_.size());
622
623 for (pronunciation& p : pronunciations_)
624 {
625 db_ << p;
626
627 ppgs.update();
628 }
1682 } 629 }
1683 630
1684 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1685 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1686
1687 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1688 { 631 {
1689 db_error(ppdb, query); 632 progress ppgs("Writing verb groups...", groups_.size());
633
634 for (group& g : groups_)
635 {
636 db_ << g;
637
638 ppgs.update();
639 }
1690 } 640 }
1691 641
1692 sqlite3_finalize(ppstmt);
1693 }
1694 }
1695
1696 // hyp table
1697 {
1698 std::ifstream wnhypfile(wnpref + "wn_hyp.pl");
1699 if (!wnhypfile.is_open())
1700 {
1701 std::cout << "Invalid WordNet data directory." << std::endl;
1702 print_usage();
1703 }
1704
1705 std::list<std::string> lines;
1706 for (;;)
1707 {
1708 std::string line;
1709 if (!getline(wnhypfile, line))
1710 {
1711 break;
1712 }
1713
1714 if (line.back() == '\r')
1715 { 642 {
1716 line.pop_back(); 643 progress ppgs("Writing verb frames...", frames_.size());
644
645 for (frame& f : frames_)
646 {
647 db_ << f;
648
649 ppgs.update();
650 }
1717 } 651 }
1718
1719 lines.push_back(line);
1720 } 652 }
1721 653
1722 progress ppgs("Writing hypernyms...", lines.size()); 654 void generator::readWordNetAntonymy()
1723 for (auto line : lines)
1724 { 655 {
1725 ppgs.update(); 656 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl"));
1726 657 progress ppgs("Writing antonyms...", lines.size());
1727 std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); 658 for (auto line : lines)
1728 std::smatch relation_data;
1729 if (!std::regex_search(line, relation_data, relation))
1730 { 659 {
1731 continue; 660 ppgs.update();
1732 }
1733
1734 int synset_id_1 = stoi(relation_data[1]);
1735 int synset_id_2 = stoi(relation_data[2]);
1736 std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)");
1737 661
1738 for (auto mapping1 : wn[synset_id_1]) 662 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
1739 { 663 std::smatch relation_data;
1740 for (auto mapping2 : wn[synset_id_2]) 664 if (!std::regex_search(line, relation_data, relation))
1741 { 665 {
1742 sqlite3_stmt* ppstmt; 666 continue;
1743 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 667 }
1744 { 668
1745 db_error(ppdb, query); 669 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
1746 } 670 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1747 671
1748 sqlite3_bind_int(ppstmt, 1, mapping1.second); 672 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1749 sqlite3_bind_int(ppstmt, 2, mapping2.second); 673 {
674 word& word1 = *wordByWnidAndWnum_.at(lookup1);
675 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1750 676
1751 if (sqlite3_step(ppstmt) != SQLITE_DONE) 677 std::list<field> fields;
1752 { 678 fields.emplace_back("antonym_1_id", word1.getId());
1753 db_error(ppdb, query); 679 fields.emplace_back("antonym_2_id", word2.getId());
1754 }
1755 680
1756 sqlite3_finalize(ppstmt); 681 db_.insertIntoTable("antonymy", std::move(fields));
1757 } 682 }
1758 } 683 }
1759 } 684 }
1760 }
1761
1762 // ins table
1763 {
1764 std::ifstream wninsfile(wnpref + "wn_ins.pl");
1765 if (!wninsfile.is_open())
1766 {
1767 std::cout << "Invalid WordNet data directory." << std::endl;
1768 print_usage();
1769 }
1770
1771 std::list<std::string> lines;
1772 for (;;)
1773 {
1774 std::string line;
1775 if (!getline(wninsfile, line))
1776 {
1777 break;
1778 }
1779 685
1780 if (line.back() == '\r') 686 void generator::readWordNetVariation()
687 {
688 std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl"));
689 progress ppgs("Writing variation...", lines.size());
690 for (auto line : lines)
1781 { 691 {
1782 line.pop_back(); 692 ppgs.update();
1783 }
1784 693
1785 lines.push_back(line); 694 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\.");
695 std::smatch relation_data;
696 if (!std::regex_search(line, relation_data, relation))
697 {
698 continue;
699 }
700
701 int lookup1 = std::stoi(relation_data[1]);
702 int lookup2 = std::stoi(relation_data[2]);
703
704 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
705 {
706 notion& notion1 = *notionByWnid_.at(lookup1);
707 notion& notion2 = *notionByWnid_.at(lookup2);
708
709 std::list<field> fields;
710 fields.emplace_back("noun_id", notion1.getId());
711 fields.emplace_back("adjective_id", notion2.getId());
712
713 db_.insertIntoTable("variation", std::move(fields));
714 }
715 }
1786 } 716 }
1787 717
1788 progress ppgs("Writing instantiations...", lines.size()); 718 void generator::readWordNetClasses()
1789 for (auto line : lines)
1790 { 719 {
1791 ppgs.update(); 720 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl"));
1792 721 progress ppgs("Writing usage, topicality, and regionality...", lines.size());
1793 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); 722 for (auto line : lines)
1794 std::smatch relation_data;
1795 if (!std::regex_search(line, relation_data, relation))
1796 { 723 {
1797 continue; 724 ppgs.update();
1798 }
1799
1800 int synset_id_1 = stoi(relation_data[1]);
1801 int synset_id_2 = stoi(relation_data[2]);
1802 std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)");
1803 725
1804 for (auto mapping1 : wn[synset_id_1]) 726 std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\.");
1805 { 727 std::smatch relation_data;
1806 for (auto mapping2 : wn[synset_id_2]) 728 if (!std::regex_search(line, relation_data, relation))
729 {
730 continue;
731 }
732
733 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
734 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
735 std::string class_type = relation_data[5];
736
737 std::string table_name;
738 if (class_type == "t")
739 {
740 table_name += "topicality";
741 } else if (class_type == "u")
742 {
743 table_name += "usage";
744 } else if (class_type == "r")
745 {
746 table_name += "regionality";
747 }
748
749 std::list<int> leftJoin;
750 std::list<int> rightJoin;
751
752 if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first)))
1807 { 753 {
1808 sqlite3_stmt* ppstmt; 754 std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) {
1809 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 755 return w->getId();
756 });
757 } else if (wordByWnidAndWnum_.count(lookup1)) {
758 leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId());
759 }
760
761 if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first)))
762 {
763 std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) {
764 return w->getId();
765 });
766 } else if (wordByWnidAndWnum_.count(lookup2)) {
767 rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId());
768 }
769
770 for (int word1 : leftJoin)
771 {
772 for (int word2 : rightJoin)
1810 { 773 {
1811 db_error(ppdb, query); 774 std::list<field> fields;
1812 } 775 fields.emplace_back("term_id", word1);
776 fields.emplace_back("domain_id", word2);
1813 777
1814 sqlite3_bind_int(ppstmt, 1, mapping1.second); 778 db_.insertIntoTable(table_name, std::move(fields));
1815 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1816
1817 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1818 {
1819 db_error(ppdb, query);
1820 } 779 }
1821
1822 sqlite3_finalize(ppstmt);
1823 } 780 }
1824 } 781 }
1825 } 782 }
1826 }
1827
1828 // mm table
1829 {
1830 std::ifstream wnmmfile(wnpref + "wn_mm.pl");
1831 if (!wnmmfile.is_open())
1832 {
1833 std::cout << "Invalid WordNet data directory." << std::endl;
1834 print_usage();
1835 }
1836
1837 std::list<std::string> lines;
1838 for (;;)
1839 {
1840 std::string line;
1841 if (!getline(wnmmfile, line))
1842 {
1843 break;
1844 }
1845 783
1846 if (line.back() == '\r') 784 void generator::readWordNetCausality()
785 {
786 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl"));
787 progress ppgs("Writing causality...", lines.size());
788 for (auto line : lines)
1847 { 789 {
1848 line.pop_back(); 790 ppgs.update();
1849 }
1850 791
1851 lines.push_back(line); 792 std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\.");
793 std::smatch relation_data;
794 if (!std::regex_search(line, relation_data, relation))
795 {
796 continue;
797 }
798
799 int lookup1 = std::stoi(relation_data[1]);
800 int lookup2 = std::stoi(relation_data[2]);
801
802 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
803 {
804 notion& notion1 = *notionByWnid_.at(lookup1);
805 notion& notion2 = *notionByWnid_.at(lookup2);
806
807 std::list<field> fields;
808 fields.emplace_back("effect_id", notion1.getId());
809 fields.emplace_back("cause_id", notion2.getId());
810
811 db_.insertIntoTable("causality", std::move(fields));
812 }
813 }
1852 } 814 }
1853 815
1854 progress ppgs("Writing member meronyms...", lines.size()); 816 void generator::readWordNetEntailment()
1855 for (auto line : lines)
1856 { 817 {
1857 ppgs.update(); 818 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl"));
1858 819 progress ppgs("Writing entailment...", lines.size());
1859 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); 820 for (auto line : lines)
1860 std::smatch relation_data;
1861 if (!std::regex_search(line, relation_data, relation))
1862 { 821 {
1863 continue; 822 ppgs.update();
1864 }
1865 823
1866 int synset_id_1 = stoi(relation_data[1]); 824 std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\.");
1867 int synset_id_2 = stoi(relation_data[2]); 825 std::smatch relation_data;
1868 std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); 826 if (!std::regex_search(line, relation_data, relation))
1869
1870 for (auto mapping1 : wn[synset_id_1])
1871 {
1872 for (auto mapping2 : wn[synset_id_2])
1873 { 827 {
1874 sqlite3_stmt* ppstmt; 828 continue;
1875 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 829 }
1876 { 830
1877 db_error(ppdb, query); 831 int lookup1 = std::stoi(relation_data[1]);
1878 } 832 int lookup2 = std::stoi(relation_data[2]);
1879 833
1880 sqlite3_bind_int(ppstmt, 1, mapping1.second); 834 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
1881 sqlite3_bind_int(ppstmt, 2, mapping2.second); 835 {
836 notion& notion1 = *notionByWnid_.at(lookup1);
837 notion& notion2 = *notionByWnid_.at(lookup2);
1882 838
1883 if (sqlite3_step(ppstmt) != SQLITE_DONE) 839 std::list<field> fields;
1884 { 840 fields.emplace_back("given_id", notion1.getId());
1885 db_error(ppdb, query); 841 fields.emplace_back("entailment_id", notion2.getId());
1886 }
1887 842
1888 sqlite3_finalize(ppstmt); 843 db_.insertIntoTable("entailment", std::move(fields));
1889 } 844 }
1890 } 845 }
1891 } 846 }
1892 } 847
1893 848 void generator::readWordNetHypernymy()
1894 // ms table
1895 {
1896 std::ifstream wnmsfile(wnpref + "wn_ms.pl");
1897 if (!wnmsfile.is_open())
1898 {
1899 std::cout << "Invalid WordNet data directory." << std::endl;
1900 print_usage();
1901 }
1902
1903 std::list<std::string> lines;
1904 for (;;)
1905 { 849 {
1906 std::string line; 850 std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl"));
1907 if (!getline(wnmsfile, line)) 851 progress ppgs("Writing hypernymy...", lines.size());
852 for (auto line : lines)
1908 { 853 {
1909 break; 854 ppgs.update();
855
856 std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\.");
857 std::smatch relation_data;
858 if (!std::regex_search(line, relation_data, relation))
859 {
860 continue;
861 }
862
863 int lookup1 = std::stoi(relation_data[1]);
864 int lookup2 = std::stoi(relation_data[2]);
865
866 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
867 {
868 notion& notion1 = *notionByWnid_.at(lookup1);
869 notion& notion2 = *notionByWnid_.at(lookup2);
870
871 std::list<field> fields;
872 fields.emplace_back("hyponym_id", notion1.getId());
873 fields.emplace_back("hypernym_id", notion2.getId());
874
875 db_.insertIntoTable("hypernymy", std::move(fields));
876 }
1910 } 877 }
878 }
1911 879
1912 if (line.back() == '\r') 880 void generator::readWordNetInstantiation()
881 {
882 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl"));
883 progress ppgs("Writing instantiation...", lines.size());
884 for (auto line : lines)
1913 { 885 {
1914 line.pop_back(); 886 ppgs.update();
1915 }
1916 887
1917 lines.push_back(line); 888 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\.");
889 std::smatch relation_data;
890 if (!std::regex_search(line, relation_data, relation))
891 {
892 continue;
893 }
894
895 int lookup1 = std::stoi(relation_data[1]);
896 int lookup2 = std::stoi(relation_data[2]);
897
898 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
899 {
900 notion& notion1 = *notionByWnid_.at(lookup1);
901 notion& notion2 = *notionByWnid_.at(lookup2);
902
903 std::list<field> fields;
904 fields.emplace_back("instance_id", notion1.getId());
905 fields.emplace_back("class_id", notion2.getId());
906
907 db_.insertIntoTable("instantiation", std::move(fields));
908 }
909 }
1918 } 910 }
1919 911
1920 progress ppgs("Writing substance meronyms...", lines.size()); 912 void generator::readWordNetMemberMeronymy()
1921 for (auto line : lines)
1922 { 913 {
1923 ppgs.update(); 914 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl"));
1924 915 progress ppgs("Writing member meronymy...", lines.size());
1925 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); 916 for (auto line : lines)
1926 std::smatch relation_data;
1927 if (!std::regex_search(line, relation_data, relation))
1928 { 917 {
1929 continue; 918 ppgs.update();
1930 }
1931
1932 int synset_id_1 = stoi(relation_data[1]);
1933 int synset_id_2 = stoi(relation_data[2]);
1934 std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
1935 919
1936 for (auto mapping1 : wn[synset_id_1]) 920 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\.");
1937 { 921 std::smatch relation_data;
1938 for (auto mapping2 : wn[synset_id_2]) 922 if (!std::regex_search(line, relation_data, relation))
1939 { 923 {
1940 sqlite3_stmt* ppstmt; 924 continue;
1941 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 925 }
1942 { 926
1943 db_error(ppdb, query); 927 int lookup1 = std::stoi(relation_data[1]);
1944 } 928 int lookup2 = std::stoi(relation_data[2]);
929
930 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
931 {
932 notion& notion1 = *notionByWnid_.at(lookup1);
933 notion& notion2 = *notionByWnid_.at(lookup2);
1945 934
1946 sqlite3_bind_int(ppstmt, 1, mapping1.second); 935 std::list<field> fields;
1947 sqlite3_bind_int(ppstmt, 2, mapping2.second); 936 fields.emplace_back("holonym_id", notion1.getId());
937 fields.emplace_back("meronym_id", notion2.getId());
1948 938
1949 if (sqlite3_step(ppstmt) != SQLITE_DONE) 939 db_.insertIntoTable("member_meronymy", std::move(fields));
1950 {
1951 db_error(ppdb, query);
1952 }
1953
1954 sqlite3_finalize(ppstmt);
1955 } 940 }
1956 } 941 }
1957 } 942 }
1958 } 943
1959 944 void generator::readWordNetPartMeronymy()
1960 // mm table
1961 {
1962 std::ifstream wnmpfile(wnpref + "wn_mp.pl");
1963 if (!wnmpfile.is_open())
1964 {
1965 std::cout << "Invalid WordNet data directory." << std::endl;
1966 print_usage();
1967 }
1968
1969 std::list<std::string> lines;
1970 for (;;)
1971 { 945 {
1972 std::string line; 946 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl"));
1973 if (!getline(wnmpfile, line)) 947 progress ppgs("Writing part meronymy...", lines.size());
948 for (auto line : lines)
1974 { 949 {
1975 break; 950 ppgs.update();
951
952 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\.");
953 std::smatch relation_data;
954 if (!std::regex_search(line, relation_data, relation))
955 {
956 continue;
957 }
958
959 int lookup1 = std::stoi(relation_data[1]);
960 int lookup2 = std::stoi(relation_data[2]);
961
962 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
963 {
964 notion& notion1 = *notionByWnid_.at(lookup1);
965 notion& notion2 = *notionByWnid_.at(lookup2);
966
967 std::list<field> fields;
968 fields.emplace_back("holonym_id", notion1.getId());
969 fields.emplace_back("meronym_id", notion2.getId());
970
971 db_.insertIntoTable("part_meronymy", std::move(fields));
972 }
1976 } 973 }
974 }
1977 975
1978 if (line.back() == '\r') 976 void generator::readWordNetSubstanceMeronymy()
977 {
978 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl"));
979 progress ppgs("Writing substance meronymy...", lines.size());
980 for (auto line : lines)
1979 { 981 {
1980 line.pop_back(); 982 ppgs.update();
1981 }
1982 983
1983 lines.push_back(line); 984 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\.");
985 std::smatch relation_data;
986 if (!std::regex_search(line, relation_data, relation))
987 {
988 continue;
989 }
990
991 int lookup1 = std::stoi(relation_data[1]);
992 int lookup2 = std::stoi(relation_data[2]);
993
994 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
995 {
996 notion& notion1 = *notionByWnid_.at(lookup1);
997 notion& notion2 = *notionByWnid_.at(lookup2);
998
999 std::list<field> fields;
1000 fields.emplace_back("holonym_id", notion1.getId());
1001 fields.emplace_back("meronym_id", notion2.getId());
1002
1003 db_.insertIntoTable("substance_meronymy", std::move(fields));
1004 }
1005 }
1984 } 1006 }
1985 1007
1986 progress ppgs("Writing part meronyms...", lines.size()); 1008 void generator::readWordNetPertainymy()
1987 for (auto line : lines)
1988 { 1009 {
1989 ppgs.update(); 1010 std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl"));
1990 1011 progress ppgs("Writing pertainymy and mannernymy...", lines.size());
1991 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); 1012 for (auto line : lines)
1992 std::smatch relation_data;
1993 if (!std::regex_search(line, relation_data, relation))
1994 { 1013 {
1995 continue; 1014 ppgs.update();
1996 }
1997
1998 int synset_id_1 = stoi(relation_data[1]);
1999 int synset_id_2 = stoi(relation_data[2]);
2000 std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
2001 1015
2002 for (auto mapping1 : wn[synset_id_1]) 1016 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\.");
2003 { 1017 std::smatch relation_data;
2004 for (auto mapping2 : wn[synset_id_2]) 1018 if (!std::regex_search(line, relation_data, relation))
2005 { 1019 {
2006 sqlite3_stmt* ppstmt; 1020 continue;
2007 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 1021 }
2008 { 1022
2009 db_error(ppdb, query); 1023 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
2010 } 1024 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1025
1026 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1027 {
1028 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1029 word& word2 = *wordByWnidAndWnum_.at(lookup2);
2011 1030
2012 sqlite3_bind_int(ppstmt, 1, mapping1.second); 1031 if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective)
2013 sqlite3_bind_int(ppstmt, 2, mapping2.second); 1032 {
1033 std::list<field> fields;
1034 fields.emplace_back("pertainym_id", word1.getId());
1035 fields.emplace_back("noun_id", word2.getId());
2014 1036
2015 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1037 db_.insertIntoTable("pertainymy", std::move(fields));
1038 } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb)
2016 { 1039 {
2017 db_error(ppdb, query); 1040 std::list<field> fields;
2018 } 1041 fields.emplace_back("mannernym_id", word1.getId());
1042 fields.emplace_back("adjective_id", word2.getId());
2019 1043
2020 sqlite3_finalize(ppstmt); 1044 db_.insertIntoTable("mannernymy", std::move(fields));
1045 }
2021 } 1046 }
2022 } 1047 }
2023 } 1048 }
2024 }
2025
2026 // per table
2027 {
2028 std::ifstream wnperfile(wnpref + "wn_per.pl");
2029 if (!wnperfile.is_open())
2030 {
2031 std::cout << "Invalid WordNet data directory." << std::endl;
2032 print_usage();
2033 }
2034
2035 std::list<std::string> lines;
2036 for (;;)
2037 {
2038 std::string line;
2039 if (!getline(wnperfile, line))
2040 {
2041 break;
2042 }
2043 1049
2044 if (line.back() == '\r') 1050 void generator::readWordNetSpecification()
1051 {
1052 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl"));
1053 progress ppgs("Writing specifications...", lines.size());
1054 for (auto line : lines)
2045 { 1055 {
2046 line.pop_back(); 1056 ppgs.update();
1057
1058 std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\.");
1059 std::smatch relation_data;
1060 if (!std::regex_search(line, relation_data, relation))
1061 {
1062 continue;
1063 }
1064
1065 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
1066 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1067
1068 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1069 {
1070 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1071 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1072
1073 std::list<field> fields;
1074 fields.emplace_back("general_id", word1.getId());
1075 fields.emplace_back("specific_id", word2.getId());
1076
1077 db_.insertIntoTable("specification", std::move(fields));
1078 }
2047 } 1079 }
2048
2049 lines.push_back(line);
2050 } 1080 }
2051 1081
2052 progress ppgs("Writing pertainyms and mannernyms...", lines.size()); 1082 void generator::readWordNetSimilarity()
2053 for (auto line : lines)
2054 { 1083 {
2055 ppgs.update(); 1084 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl"));
2056 1085 progress ppgs("Writing adjective similarity...", lines.size());
2057 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); 1086 for (auto line : lines)
2058 std::smatch relation_data;
2059 if (!std::regex_search(line, relation_data, relation))
2060 { 1087 {
2061 continue; 1088 ppgs.update();
2062 }
2063 1089
2064 int synset_id_1 = stoi(relation_data[1]); 1090 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\.");
2065 int wnum_1 = stoi(relation_data[2]); 1091 std::smatch relation_data;
2066 int synset_id_2 = stoi(relation_data[3]); 1092 if (!std::regex_search(line, relation_data, relation))
2067 int wnum_2 = stoi(relation_data[4]);
2068 std::string query;
2069 switch (synset_id_1 / 100000000)
2070 {
2071 case 3: // Adjective
2072 { 1093 {
2073 // This is a pertainym, the second word should be a noun 1094 continue;
2074 // Technically it can be an adjective but we're ignoring that
2075 if (synset_id_2 / 100000000 != 1)
2076 {
2077 continue;
2078 }
2079
2080 query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)";
2081
2082 break;
2083 } 1095 }
1096
1097 int lookup1 = std::stoi(relation_data[1]);
1098 int lookup2 = std::stoi(relation_data[2]);
2084 1099
2085 case 4: // Adverb 1100 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
2086 { 1101 {
2087 // This is a mannernym, the second word should be an adjective 1102 notion& notion1 = *notionByWnid_.at(lookup1);
2088 if (synset_id_2 / 100000000 != 3) 1103 notion& notion2 = *notionByWnid_.at(lookup2);
2089 {
2090 continue;
2091 }
2092 1104
2093 query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; 1105 std::list<field> fields;
1106 fields.emplace_back("adjective_1_id", notion1.getId());
1107 fields.emplace_back("adjective_2_id", notion2.getId());
2094 1108
2095 break; 1109 db_.insertIntoTable("similarity", std::move(fields));
2096 } 1110 }
2097 } 1111 }
2098 1112 }
2099 sqlite3_stmt* ppstmt;
2100 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
2101 {
2102 db_error(ppdb, query);
2103 }
2104
2105 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
2106 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
2107 1113
2108 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1114 std::list<std::string> generator::readFile(std::string path)
1115 {
1116 std::ifstream file(path);
1117 if (!file)
2109 { 1118 {
2110 db_error(ppdb, query); 1119 throw std::invalid_argument("Could not find file " + path);
2111 } 1120 }
2112
2113 sqlite3_finalize(ppstmt);
2114 }
2115 }
2116 1121
2117 // sa table 1122 std::list<std::string> lines;
2118 {
2119 std::ifstream wnsafile(wnpref + "wn_sa.pl");
2120 if (!wnsafile.is_open())
2121 {
2122 std::cout << "Invalid WordNet data directory." << std::endl;
2123 print_usage();
2124 }
2125
2126 std::list<std::string> lines;
2127 for (;;)
2128 {
2129 std::string line; 1123 std::string line;
2130 if (!getline(wnsafile, line)) 1124 while (std::getline(file, line))
2131 {
2132 break;
2133 }
2134
2135 if (line.back() == '\r')
2136 { 1125 {
2137 line.pop_back(); 1126 if (line.back() == '\r')
1127 {
1128 line.pop_back();
1129 }
1130
1131 lines.push_back(line);
2138 } 1132 }
2139 1133
2140 lines.push_back(line); 1134 return lines;
2141 } 1135 }
2142 1136
2143 progress ppgs("Writing specifications...", lines.size()); 1137 part_of_speech generator::partOfSpeechByWnid(int wnid)
2144 for (auto line : lines)
2145 { 1138 {
2146 ppgs.update(); 1139 switch (wnid / 100000000)
2147
2148 std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\.");
2149 std::smatch relation_data;
2150 if (!std::regex_search(line, relation_data, relation))
2151 {
2152 continue;
2153 }
2154
2155 int synset_id_1 = stoi(relation_data[1]);
2156 int wnum_1 = stoi(relation_data[2]);
2157 int synset_id_2 = stoi(relation_data[3]);
2158 int wnum_2 = stoi(relation_data[4]);
2159 std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)");
2160
2161 sqlite3_stmt* ppstmt;
2162 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
2163 { 1140 {
2164 db_error(ppdb, query); 1141 case 1: return part_of_speech::noun;
1142 case 2: return part_of_speech::verb;
1143 case 3: return part_of_speech::adjective;
1144 case 4: return part_of_speech::adverb;
1145 default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid));
2165 } 1146 }
1147 }
2166 1148
2167 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); 1149 notion& generator::createNotion(part_of_speech partOfSpeech)
2168 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); 1150 {
1151 notions_.emplace_back(partOfSpeech);
1152
1153 return notions_.back();
1154 }
2169 1155
2170 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1156 notion& generator::lookupOrCreateNotion(int wnid)
1157 {
1158 if (!notionByWnid_.count(wnid))
2171 { 1159 {
2172 db_error(ppdb, query); 1160 notions_.emplace_back(partOfSpeechByWnid(wnid), wnid);
1161 notionByWnid_[wnid] = &notions_.back();
2173 } 1162 }
2174 1163
2175 sqlite3_finalize(ppstmt); 1164 return *notionByWnid_.at(wnid);
2176 }
2177 }
2178
2179 // sim table
2180 {
2181 std::ifstream wnsimfile(wnpref + "wn_sim.pl");
2182 if (!wnsimfile.is_open())
2183 {
2184 std::cout << "Invalid WordNet data directory." << std::endl;
2185 print_usage();
2186 } 1165 }
2187 1166
2188 std::list<std::string> lines; 1167 lemma& generator::lookupOrCreateLemma(std::string base_form)
2189 for (;;)
2190 { 1168 {
2191 std::string line; 1169 if (!lemmaByBaseForm_.count(base_form))
2192 if (!getline(wnsimfile, line))
2193 { 1170 {
2194 break; 1171 lemmas_.emplace_back(lookupOrCreateForm(base_form));
1172 lemmaByBaseForm_[base_form] = &lemmas_.back();
2195 } 1173 }
1174
1175 return *lemmaByBaseForm_.at(base_form);
1176 }
2196 1177
2197 if (line.back() == '\r') 1178 form& generator::lookupOrCreateForm(std::string text)
1179 {
1180 if (!formByText_.count(text))
2198 { 1181 {
2199 line.pop_back(); 1182 forms_.emplace_back(text);
1183 formByText_[text] = &forms_.back();
2200 } 1184 }
2201 1185
2202 lines.push_back(line); 1186 return *formByText_[text];
2203 } 1187 }
2204 1188
2205 progress ppgs("Writing sense synonyms...", lines.size()); 1189 template <typename... Args> word& generator::createWord(Args&&... args)
2206 for (auto line : lines)
2207 { 1190 {
2208 ppgs.update(); 1191 words_.emplace_back(std::forward<Args>(args)...);
1192 word& w = words_.back();
2209 1193
2210 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); 1194 wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w);
2211 std::smatch relation_data; 1195
2212 if (!std::regex_search(line, relation_data, relation)) 1196 if (w.getNotion().hasWnid())
2213 { 1197 {
2214 continue; 1198 wordsByWnid_[w.getNotion().getWnid()].insert(&w);
2215 } 1199 }
2216 1200
2217 int synset_id_1 = stoi(relation_data[1]); 1201 return w;
2218 int synset_id_2 = stoi(relation_data[2]); 1202 }
2219 std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); 1203
1204 group& generator::createGroup(xmlNodePtr top)
1205 {
1206 groups_.emplace_back();
1207 group& grp = groups_.back();
2220 1208
2221 for (auto mapping1 : wn[synset_id_1]) 1209 xmlChar* key;
1210
1211 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
2222 { 1212 {
2223 for (auto mapping2 : wn[synset_id_2]) 1213 if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES")))
2224 { 1214 {
2225 sqlite3_stmt* ppstmt; 1215 for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next)
2226 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
2227 { 1216 {
2228 db_error(ppdb, query); 1217 if (!xmlStrcmp(subclass->name, reinterpret_cast<const xmlChar*>("VNSUBCLASS")))
1218 {
1219 try
1220 {
1221 group& subgrp = createGroup(subclass);
1222 subgrp.setParent(grp);
1223 } catch (const std::exception& e)
1224 {
1225 key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID"));
1226
1227 if (key == nullptr)
1228 {
1229 std::throw_with_nested(std::logic_error("Error parsing IDless subgroup"));
1230 } else {
1231 std::string subgroupId(reinterpret_cast<const char*>(key));
1232 xmlFree(key);
1233
1234 std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId));
1235 }
1236 }
1237 }
2229 } 1238 }
2230 1239 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("MEMBERS")))
2231 sqlite3_bind_int(ppstmt, 1, mapping1.second); 1240 {
2232 sqlite3_bind_int(ppstmt, 2, mapping2.second); 1241 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next)
2233
2234 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2235 { 1242 {
2236 db_error(ppdb, query); 1243 if (!xmlStrcmp(member->name, reinterpret_cast<const xmlChar*>("MEMBER")))
1244 {
1245 key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn"));
1246 std::string wnSenses(reinterpret_cast<const char*>(key));
1247 xmlFree(key);
1248
1249 auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " ");
1250 if (!wnSenseKeys.empty())
1251 {
1252 std::list<std::string> tempKeys;
1253
1254 std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) {
1255 return sense + "::";
1256 });
1257
1258 std::list<std::string> filteredKeys;
1259
1260 std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) {
1261 return !wnSenseKeys_.count(sense);
1262 });
1263
1264 wnSenseKeys = std::move(filteredKeys);
1265 }
1266
1267 if (!wnSenseKeys.empty())
1268 {
1269 for (std::string sense : wnSenseKeys)
1270 {
1271 word& wordSense = *wnSenseKeys_[sense];
1272 wordSense.setVerbGroup(grp);
1273 }
1274 } else {
1275 key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name"));
1276 std::string memberName(reinterpret_cast<const char*>(key));
1277 xmlFree(key);
1278
1279 notion& n = createNotion(part_of_speech::verb);
1280 lemma& l = lookupOrCreateLemma(memberName);
1281 word& w = createWord(n, l);
1282
1283 w.setVerbGroup(grp);
1284 }
1285 }
2237 } 1286 }
2238 1287 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("THEMROLES")))
2239 sqlite3_reset(ppstmt); 1288 {
2240 sqlite3_clear_bindings(ppstmt); 1289 for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next)
2241
2242 sqlite3_bind_int(ppstmt, 1, mapping2.second);
2243 sqlite3_bind_int(ppstmt, 2, mapping1.second);
2244
2245 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2246 { 1290 {
2247 db_error(ppdb, query); 1291 if (!xmlStrcmp(roletopnode->name, reinterpret_cast<const xmlChar*>("THEMROLE")))
1292 {
1293 role r;
1294
1295 key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type"));
1296 std::string roleName = reinterpret_cast<const char*>(key);
1297 xmlFree(key);
1298
1299 for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
1300 {
1301 if (!xmlStrcmp(rolenode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1302 {
1303 r.setSelrestrs(parseSelrestr(rolenode));
1304 }
1305 }
1306
1307 grp.addRole(roleName, std::move(r));
1308 }
2248 } 1309 }
1310 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("FRAMES")))
1311 {
1312 for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next)
1313 {
1314 if (!xmlStrcmp(frametopnode->name, reinterpret_cast<const xmlChar*>("FRAME")))
1315 {
1316 frames_.emplace_back();
1317 frame& fr = frames_.back();
2249 1318
2250 sqlite3_finalize(ppstmt); 1319 for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
1320 {
1321 if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX")))
1322 {
1323 for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
1324 {
1325 if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP")))
1326 {
1327 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1328 std::string partRole = reinterpret_cast<const char*>(key);
1329 xmlFree(key);
1330
1331 selrestr partSelrestrs;
1332 std::set<std::string> partSynrestrs;
1333
1334 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
1335 {
1336 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SYNRESTRS")))
1337 {
1338 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
1339 {
1340 if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SYNRESTR")))
1341 {
1342 key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type"));
1343 partSynrestrs.insert(reinterpret_cast<const char*>(key));
1344 xmlFree(key);
1345 }
1346 }
1347 }
1348
1349 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1350 {
1351 partSelrestrs = parseSelrestr(npnode);
1352 }
1353 }
1354
1355 fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs)));
1356 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB")))
1357 {
1358 fr.push_back(part::createVerb());
1359 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("PREP")))
1360 {
1361 std::set<std::string> partChoices;
1362 bool partLiteral;
1363
1364 if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")))
1365 {
1366 partLiteral = true;
1367
1368 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1369 std::string choicesStr = reinterpret_cast<const char*>(key);
1370 xmlFree(key);
1371
1372 split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices)));
1373 } else {
1374 partLiteral = false;
1375
1376 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
1377 {
1378 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1379 {
1380 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
1381 {
1382 if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1383 {
1384 key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type"));
1385 partChoices.insert(reinterpret_cast<const char*>(key));
1386 xmlFree(key);
1387 }
1388 }
1389 }
1390 }
1391 }
1392
1393 fr.push_back(part::createPreposition(std::move(partChoices), partLiteral));
1394 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ")))
1395 {
1396 fr.push_back(part::createAdjective());
1397 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADV")))
1398 {
1399 fr.push_back(part::createAdverb());
1400 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("LEX")))
1401 {
1402 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1403 std::string literalValue = reinterpret_cast<const char*>(key);
1404 xmlFree(key);
1405
1406 fr.push_back(part::createLiteral(literalValue));
1407 } else {
1408 continue;
1409 }
1410 }
1411
1412 grp.addFrame(fr);
1413 }
1414 }
1415 }
1416 }
2251 } 1417 }
2252 } 1418 }
2253 }
2254 }
2255
2256 // syntax table
2257 {
2258 std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl");
2259 if (!wnsyntaxfile.is_open())
2260 {
2261 std::cout << "Invalid WordNet data directory." << std::endl;
2262 print_usage();
2263 }
2264 1419
2265 std::list<std::string> lines; 1420 return grp;
2266 for (;;)
2267 {
2268 std::string line;
2269 if (!getline(wnsyntaxfile, line))
2270 {
2271 break;
2272 }
2273
2274 if (line.back() == '\r')
2275 {
2276 line.pop_back();
2277 }
2278
2279 lines.push_back(line);
2280 } 1421 }
2281 1422
2282 progress ppgs("Writing adjective syntax markers...", lines.size()); 1423 selrestr generator::parseSelrestr(xmlNodePtr top)
2283 for (auto line : lines)
2284 { 1424 {
2285 ppgs.update(); 1425 xmlChar* key;
2286 1426
2287 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); 1427 if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
2288 std::smatch relation_data;
2289 if (!std::regex_search(line, relation_data, relation))
2290 {
2291 continue;
2292 }
2293
2294 int synset_id = stoi(relation_data[1]);
2295 int wnum = stoi(relation_data[2]);
2296 std::string syn = relation_data[3];
2297 std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?");
2298
2299 sqlite3_stmt* ppstmt;
2300 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
2301 { 1428 {
2302 db_error(ppdb, query); 1429 if (xmlChildElementCount(top) == 0)
2303 } 1430 {
2304 1431 return {};
2305 sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); 1432 } else if (xmlChildElementCount(top) == 1)
2306 sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); 1433 {
2307 1434 return parseSelrestr(xmlFirstElementChild(top));
2308 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1435 } else {
1436 bool orlogic = false;
1437 if (xmlHasProp(top, reinterpret_cast<const xmlChar*>("logic")))
1438 {
1439 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("logic"));
1440 if (!xmlStrcmp(key, reinterpret_cast<const xmlChar*>("or")))
1441 {
1442 orlogic = true;
1443 }
1444
1445 xmlFree(key);
1446 }
1447
1448 std::list<selrestr> children;
1449 for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next)
1450 {
1451 if (!xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))
1452 || !xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1453 {
1454 children.push_back(parseSelrestr(selrestr));
1455 }
1456 }
1457
1458 return selrestr(children, orlogic);
1459 }
1460 } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
2309 { 1461 {
2310 db_error(ppdb, query); 1462 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("Value"));
1463 bool selPos = (std::string(reinterpret_cast<const char*>(key)) == "+");
1464 xmlFree(key);
1465
1466 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type"));
1467 std::string selRestriction = reinterpret_cast<const char*>(key);
1468 xmlFree(key);
1469
1470 return selrestr(selRestriction, selPos);
1471 } else {
1472 throw std::logic_error("Badly formatted selrestr");
2311 } 1473 }
2312
2313 sqlite3_finalize(ppstmt);
2314 } 1474 }
2315 } 1475
2316 1476 };
2317 sqlite3_close_v2(ppdb); 1477};
2318
2319 std::cout << "Done." << std::endl;
2320}