author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2017-01-16 18:02:50 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2017-01-16 18:02:50 -0500 |
commit | 6746da6edd7d9d50efe374eabbb79a3cac882d81 (patch) | |
tree | ff20917e08b08d36b9541c1371106596e7bec442 /generator/generator.cpp | |
parent | 4af7e55733098ca42f75a4ffaca1b0f6bab4dd36 (diff) | |
Started structural rewrite
The new object structure was designed to build on the existing WordNet structure, while also adding in all of the data that we get from other sources. More information about this can be found on the project wiki.

The generator has been completely rewritten to produce a datafile that uses the new structure. In addition, a number of indexes are now created; this doubles the size of the datafile, but allows for much faster lookups. Finally, the new generator is written modularly and is a lot more readable than the old one.

The verbly interface to the new object structure has mostly been completed, but has not been fully tested. There is a completely new search API that makes heavy use of operator overloading; documentation on how to use it should go up at some point. Token processing and verb frames are currently unimplemented; the source for these has been left in the repository for now.
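The new search API itself does not appear in this diff, so as a rough illustration of the operator-overloading style the message describes, here is a minimal, self-contained sketch of how overloaded `==`, `&&`, and `||` can build up a query filter tree. None of these names (`field`, `filter`, `toSql`) are taken from verbly; they are hypothetical stand-ins for the idea.

```cpp
#include <iostream>
#include <string>
#include <vector>

// A named field that queries can filter against.
struct field {
  std::string name;
};

// A filter is either one comparison or a conjunction/disjunction of
// child filters; the operator overloads below build this tree.
struct filter {
  enum class type { comparison, conjunction, disjunction };
  type t;
  std::string lhs, rhs;          // used when t == comparison
  std::vector<filter> children;  // used otherwise

  std::string toSql() const {
    if (t == type::comparison) return lhs + " = '" + rhs + "'";
    std::string sep = (t == type::conjunction) ? " AND " : " OR ";
    std::string out = "(";
    for (size_t i = 0; i < children.size(); i++) {
      if (i > 0) out += sep;
      out += children[i].toSql();
    }
    return out + ")";
  }
};

// field == value yields a comparison filter.
filter operator==(const field& f, const std::string& value) {
  return {filter::type::comparison, f.name, value, {}};
}

filter operator&&(const filter& a, const filter& b) {
  return {filter::type::conjunction, "", "", {a, b}};
}

filter operator||(const filter& a, const filter& b) {
  return {filter::type::disjunction, "", "", {a, b}};
}

int main() {
  field partOfSpeech{"part_of_speech"};
  field rhyme{"rhyme"};

  // The overloads let callers express the condition as ordinary C++.
  filter q = (partOfSpeech == "verb")
          && (rhyme == "UW T ER" || rhyme == "ER");

  // Prints: (part_of_speech = 'verb' AND (rhyme = 'UW T ER' OR rhyme = 'ER'))
  std::cout << q.toSql() << std::endl;
}
```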
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- | generator/generator.cpp | 3145 |
1 file changed, 1151 insertions, 1994 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 6a16467..d88cb31 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -1,2320 +1,1477 @@
1 | #include <libxml/parser.h> | 1 | #include "generator.h" |
2 | #include <cassert> | ||
3 | #include <stdexcept> | ||
2 | #include <iostream> | 4 | #include <iostream> |
5 | #include <regex> | ||
3 | #include <dirent.h> | 6 | #include <dirent.h> |
4 | #include <set> | ||
5 | #include <map> | ||
6 | #include <string> | ||
7 | #include <vector> | ||
8 | #include <fstream> | 7 | #include <fstream> |
9 | #include <sqlite3.h> | 8 | #include "enums.h" |
10 | #include <sstream> | ||
11 | #include <regex> | ||
12 | #include <list> | ||
13 | #include <algorithm> | ||
14 | #include <json.hpp> | ||
15 | #include "progress.h" | 9 | #include "progress.h" |
10 | #include "selrestr.h" | ||
11 | #include "role.h" | ||
12 | #include "part.h" | ||
13 | #include "field.h" | ||
16 | #include "../lib/util.h" | 14 | #include "../lib/util.h" |
17 | 15 | ||
18 | using json = nlohmann::json; | 16 | namespace verbly { |
19 | 17 | namespace generator { | |
20 | struct verb_t { | ||
21 | std::string infinitive; | ||
22 | std::string past_tense; | ||
23 | std::string past_participle; | ||
24 | std::string ing_form; | ||
25 | std::string s_form; | ||
26 | int id; | ||
27 | }; | ||
28 | |||
29 | struct adjective_t { | ||
30 | std::string base; | ||
31 | std::string comparative; | ||
32 | std::string superlative; | ||
33 | }; | ||
34 | |||
35 | struct noun_t { | ||
36 | std::string singular; | ||
37 | std::string plural; | ||
38 | }; | ||
39 | |||
40 | struct selrestr_t { | ||
41 | enum class type_t { | ||
42 | singleton, | ||
43 | andlogic, | ||
44 | orlogic, | ||
45 | empty | ||
46 | }; | ||
47 | type_t type; | ||
48 | std::string restriction; | ||
49 | bool pos; | ||
50 | std::list<selrestr_t> subordinates; | ||
51 | }; | ||
52 | |||
53 | struct framepart_t { | ||
54 | enum class type_t { | ||
55 | np, | ||
56 | v, | ||
57 | pp, | ||
58 | adj, | ||
59 | adv, | ||
60 | lex | ||
61 | }; | ||
62 | type_t type; | ||
63 | std::string role; | ||
64 | selrestr_t selrestrs; | ||
65 | std::set<std::string> preprestrs; | ||
66 | std::set<std::string> synrestrs; | ||
67 | std::list<std::string> choices; | ||
68 | std::string lexval; | ||
69 | }; | ||
70 | |||
71 | struct group_t { | ||
72 | std::string id; | ||
73 | std::string parent; | ||
74 | std::set<std::string> members; | ||
75 | std::map<std::string, selrestr_t> roles; | ||
76 | std::list<std::list<framepart_t>> frames; | ||
77 | }; | ||
78 | |||
79 | struct pronunciation_t { | ||
80 | std::string phonemes; | ||
81 | std::string prerhyme; | ||
82 | std::string rhyme; | ||
83 | int syllables = 0; | ||
84 | std::string stress; | ||
85 | |||
86 | bool operator<(const pronunciation_t& other) const | ||
87 | { | ||
88 | return phonemes < other.phonemes; | ||
89 | } | ||
90 | }; | ||
91 | |||
92 | std::map<std::string, group_t> groups; | ||
93 | std::map<std::string, verb_t> verbs; | ||
94 | std::map<std::string, adjective_t> adjectives; | ||
95 | std::map<std::string, noun_t> nouns; | ||
96 | std::map<int, std::map<int, int>> wn; | ||
97 | std::map<int, int> images; | ||
98 | std::map<std::string, std::set<pronunciation_t>> pronunciations; | ||
99 | |||
100 | void print_usage() | ||
101 | { | ||
102 | std::cout << "Verbly Datafile Generator" << std::endl; | ||
103 | std::cout << "-------------------------" << std::endl; | ||
104 | std::cout << "Requires exactly six arguments." << std::endl; | ||
105 | std::cout << "1. The path to a VerbNet data directory." << std::endl; | ||
106 | std::cout << "2. The path to an AGID infl.txt file." << std::endl; | ||
107 | std::cout << "3. The path to a WordNet prolog data directory." << std::endl; | ||
108 | std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl; | ||
109 | std::cout << "5. The path to an ImageNet urls.txt file." << std::endl; | ||
110 | std::cout << "6. Datafile output path." << std::endl; | ||
111 | |||
112 | exit(1); | ||
113 | } | ||
114 | |||
115 | void db_error(sqlite3* ppdb, std::string query) | ||
116 | { | ||
117 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
118 | std::cout << query << std::endl; | ||
119 | sqlite3_close_v2(ppdb); | ||
120 | print_usage(); | ||
121 | } | ||
122 | |||
123 | json export_selrestrs(selrestr_t r) | ||
124 | { | ||
125 | if (r.type == selrestr_t::type_t::empty) | ||
126 | { | ||
127 | return {}; | ||
128 | } else if (r.type == selrestr_t::type_t::singleton) | ||
129 | { | ||
130 | json result; | ||
131 | result["type"] = r.restriction; | ||
132 | result["pos"] = r.pos; | ||
133 | return result; | ||
134 | } else { | ||
135 | json result; | ||
136 | if (r.type == selrestr_t::type_t::andlogic) | ||
137 | { | ||
138 | result["logic"] = "and"; | ||
139 | } else { | ||
140 | result["logic"] = "or"; | ||
141 | } | ||
142 | |||
143 | std::list<json> outlist; | ||
144 | std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs); | ||
145 | result["children"] = outlist; | ||
146 | 18 | ||
147 | return result; | 19 | generator::generator( |
148 | } | 20 | std::string verbNetPath, |
149 | } | 21 | std::string agidPath, |
150 | 22 | std::string wordNetPath, | |
151 | selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) | 23 | std::string cmudictPath, |
152 | { | 24 | std::string imageNetPath, |
153 | selrestr_t r; | 25 | std::string outputPath) : |
154 | xmlChar* key; | 26 | verbNetPath_(verbNetPath), |
155 | 27 | agidPath_(agidPath), | |
156 | if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) | 28 | wordNetPath_(wordNetPath), |
157 | { | 29 | cmudictPath_(cmudictPath), |
158 | if (xmlChildElementCount(top) == 0) | 30 | imageNetPath_(imageNetPath), |
31 | db_(outputPath) | ||
159 | { | 32 | { |
160 | r.type = selrestr_t::type_t::empty; | 33 | // Ensure VerbNet directory exists |
161 | } else if (xmlChildElementCount(top) == 1) | 34 | DIR* dir; |
162 | { | 35 | if ((dir = opendir(verbNetPath_.c_str())) == nullptr) |
163 | r = parse_selrestrs(xmlFirstElementChild(top), filename); | ||
164 | } else { | ||
165 | r.type = selrestr_t::type_t::andlogic; | ||
166 | |||
167 | if (xmlHasProp(top, (const xmlChar*) "logic")) | ||
168 | { | 36 | { |
169 | key = xmlGetProp(top, (const xmlChar*) "logic"); | 37 | throw std::invalid_argument("Invalid VerbNet data directory"); |
170 | if (!xmlStrcmp(key, (const xmlChar*) "or")) | ||
171 | { | ||
172 | r.type = selrestr_t::type_t::orlogic; | ||
173 | } | ||
174 | xmlFree(key); | ||
175 | } | 38 | } |
176 | 39 | ||
177 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | 40 | closedir(dir); |
41 | |||
42 | // Ensure AGID infl.txt exists | ||
43 | if (!std::ifstream(agidPath_)) | ||
178 | { | 44 | { |
179 | if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) | 45 | throw std::invalid_argument("AGID infl.txt file not found"); |
180 | { | ||
181 | r.subordinates.push_back(parse_selrestrs(selrestr, filename)); | ||
182 | } | ||
183 | } | 46 | } |
184 | } | 47 | |
185 | } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) | 48 | // Add directory separator to WordNet path |
186 | { | 49 | if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) |
187 | r.type = selrestr_t::type_t::singleton; | ||
188 | |||
189 | key = xmlGetProp(top, (xmlChar*) "Value"); | ||
190 | r.pos = (std::string((const char*)key) == "+"); | ||
191 | xmlFree(key); | ||
192 | |||
193 | key = xmlGetProp(top, (xmlChar*) "type"); | ||
194 | r.restriction = (const char*) key; | ||
195 | xmlFree(key); | ||
196 | } else { | ||
197 | // Invalid | ||
198 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
199 | print_usage(); | ||
200 | } | ||
201 | |||
202 | return r; | ||
203 | } | ||
204 | |||
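For reference, the JSON emitted by `export_selrestrs` above has this shape: a singleton restriction becomes a `{"type": ..., "pos": ...}` object, a compound restriction becomes a `"logic"`/`"children"` object, and an empty restriction comes out as JSON null. The restriction names below are illustrative:

```json
{
  "logic": "or",
  "children": [
    { "type": "animate", "pos": true },
    { "type": "machine", "pos": false }
  ]
}
```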
205 | group_t& parse_group(xmlNodePtr top, std::string filename) | ||
206 | { | ||
207 | xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); | ||
208 | if (key == 0) | ||
209 | { | ||
210 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
211 | print_usage(); | ||
212 | } | ||
213 | std::string vnid = (const char*)key; | ||
214 | vnid = vnid.substr(vnid.find_first_of("-")+1); | ||
215 | xmlFree(key); | ||
216 | |||
217 | group_t g; | ||
218 | g.id = vnid; | ||
219 | |||
220 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
221 | { | ||
222 | if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES")) | ||
223 | { | ||
224 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) | ||
225 | { | 50 | { |
226 | if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) | 51 | wordNetPath_ += '/'; |
227 | { | ||
228 | auto& sg = parse_group(subclass, filename); | ||
229 | sg.parent = vnid; | ||
230 | |||
231 | for (auto member : sg.members) | ||
232 | { | ||
233 | g.members.insert(member); | ||
234 | } | ||
235 | |||
236 | // The schema requires that subclasses appear after role definitions, so we can do this now | ||
237 | for (auto role : g.roles) | ||
238 | { | ||
239 | if (sg.roles.count(role.first) == 0) | ||
240 | { | ||
241 | sg.roles[role.first] = role.second; | ||
242 | } | ||
243 | } | ||
244 | } | ||
245 | } | 52 | } |
246 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) | 53 | |
247 | { | 54 | // Ensure WordNet tables exist |
248 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) | 55 | for (std::string table : { |
56 | "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" | ||
57 | }) | ||
249 | { | 58 | { |
250 | if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) | 59 | if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) |
251 | { | 60 | { |
252 | key = xmlGetProp(member, (xmlChar*) "name"); | 61 | throw std::invalid_argument("WordNet " + table + " table not found"); |
253 | g.members.insert((const char*)key); | ||
254 | xmlFree(key); | ||
255 | } | 62 | } |
256 | } | 63 | } |
257 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) | 64 | |
258 | { | 65 | // Ensure CMUDICT file exists |
259 | for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) | 66 | if (!std::ifstream(cmudictPath_)) |
260 | { | 67 | { |
261 | if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) | 68 | throw std::invalid_argument("CMUDICT file not found"); |
262 | { | ||
263 | selrestr_t r; | ||
264 | r.type = selrestr_t::type_t::empty; | ||
265 | |||
266 | key = xmlGetProp(role, (const xmlChar*) "type"); | ||
267 | std::string type = (const char*)key; | ||
268 | xmlFree(key); | ||
269 | |||
270 | for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
271 | { | ||
272 | if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS")) | ||
273 | { | ||
274 | r = parse_selrestrs(rolenode, filename); | ||
275 | } | ||
276 | } | ||
277 | |||
278 | g.roles[type] = r; | ||
279 | } | ||
280 | } | 69 | } |
281 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) | 70 | |
282 | { | 71 | // Ensure ImageNet urls.txt exists |
283 | for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) | 72 | if (!std::ifstream(imageNetPath_)) |
284 | { | 73 | { |
285 | if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) | 74 | throw std::invalid_argument("ImageNet urls.txt file not found"); |
286 | { | ||
287 | std::list<framepart_t> f; | ||
288 | |||
289 | for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) | ||
290 | { | ||
291 | if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX")) | ||
292 | { | ||
293 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
294 | { | ||
295 | framepart_t fp; | ||
296 | |||
297 | if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP")) | ||
298 | { | ||
299 | fp.type = framepart_t::type_t::np; | ||
300 | |||
301 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
302 | fp.role = (const char*)key; | ||
303 | xmlFree(key); | ||
304 | |||
305 | fp.selrestrs.type = selrestr_t::type_t::empty; | ||
306 | |||
307 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
308 | { | ||
309 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS")) | ||
310 | { | ||
311 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
312 | { | ||
313 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR")) | ||
314 | { | ||
315 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
316 | fp.synrestrs.insert(std::string((const char*)key)); | ||
317 | xmlFree(key); | ||
318 | } | ||
319 | } | ||
320 | } | ||
321 | |||
322 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
323 | { | ||
324 | fp.selrestrs = parse_selrestrs(npnode, filename); | ||
325 | } | ||
326 | } | ||
327 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB")) | ||
328 | { | ||
329 | fp.type = framepart_t::type_t::v; | ||
330 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP")) | ||
331 | { | ||
332 | fp.type = framepart_t::type_t::pp; | ||
333 | |||
334 | if (xmlHasProp(syntaxnode, (xmlChar*) "value")) | ||
335 | { | ||
336 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
337 | std::string choices = (const char*)key; | ||
338 | xmlFree(key); | ||
339 | |||
340 | fp.choices = verbly::split<std::list<std::string>>(choices, " "); | ||
341 | } | ||
342 | |||
343 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
344 | { | ||
345 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
346 | { | ||
347 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
348 | { | ||
349 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR")) | ||
350 | { | ||
351 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
352 | fp.preprestrs.insert(std::string((const char*)key)); | ||
353 | xmlFree(key); | ||
354 | } | ||
355 | } | ||
356 | } | ||
357 | } | ||
358 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ")) | ||
359 | { | ||
360 | fp.type = framepart_t::type_t::adj; | ||
361 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV")) | ||
362 | { | ||
363 | fp.type = framepart_t::type_t::adv; | ||
364 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX")) | ||
365 | { | ||
366 | fp.type = framepart_t::type_t::lex; | ||
367 | |||
368 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
369 | fp.lexval = (const char*)key; | ||
370 | xmlFree(key); | ||
371 | } else { | ||
372 | continue; | ||
373 | } | ||
374 | |||
375 | f.push_back(fp); | ||
376 | } | ||
377 | |||
378 | g.frames.push_back(f); | ||
379 | } | ||
380 | } | ||
381 | } | ||
382 | } | 75 | } |
383 | } | 76 | } |
384 | } | ||
385 | |||
386 | groups[vnid] = g; | ||
387 | |||
388 | return groups[vnid]; | ||
389 | } | ||
390 | |||
391 | int main(int argc, char** argv) | ||
392 | { | ||
393 | if (argc != 7) | ||
394 | { | ||
395 | print_usage(); | ||
396 | } | ||
397 | |||
398 | // VerbNet data | ||
399 | std::cout << "Reading verb frames..." << std::endl; | ||
400 | |||
401 | DIR* dir; | ||
402 | if ((dir = opendir(argv[1])) == nullptr) | ||
403 | { | ||
404 | std::cout << "Invalid VerbNet data directory." << std::endl; | ||
405 | |||
406 | print_usage(); | ||
407 | } | ||
408 | |||
409 | struct dirent* ent; | ||
410 | while ((ent = readdir(dir)) != nullptr) | ||
411 | { | ||
412 | std::string filename(argv[1]); | ||
413 | if (filename.back() != '/') | ||
414 | { | ||
415 | filename += '/'; | ||
416 | } | ||
417 | 77 | ||
418 | filename += ent->d_name; | 78 | void generator::run() |
419 | //std::cout << ent->d_name << std::endl; | ||
420 | |||
421 | if (filename.rfind(".xml") != filename.size() - 4) | ||
422 | { | ||
423 | continue; | ||
424 | } | ||
425 | |||
426 | xmlDocPtr doc = xmlParseFile(filename.c_str()); | ||
427 | if (doc == nullptr) | ||
428 | { | ||
429 | std::cout << "Error opening " << filename << std::endl; | ||
430 | print_usage(); | ||
431 | } | ||
432 | |||
433 | xmlNodePtr top = xmlDocGetRootElement(doc); | ||
434 | if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) | ||
435 | { | ||
436 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
437 | print_usage(); | ||
438 | } | ||
439 | |||
440 | parse_group(top, filename); | ||
441 | } | ||
442 | |||
443 | closedir(dir); | ||
444 | |||
445 | // Get verbs from AGID | ||
446 | std::cout << "Reading inflections..." << std::endl; | ||
447 | |||
448 | std::ifstream agidfile(argv[2]); | ||
449 | if (!agidfile.is_open()) | ||
450 | { | ||
451 | std::cout << "Could not open AGID file: " << argv[2] << std::endl; | ||
452 | print_usage(); | ||
453 | } | ||
454 | |||
455 | for (;;) | ||
456 | { | ||
457 | std::string line; | ||
458 | if (!getline(agidfile, line)) | ||
459 | { | ||
460 | break; | ||
461 | } | ||
462 | |||
463 | if (line.back() == '\r') | ||
464 | { | 79 | { |
465 | line.pop_back(); | 80 | // Create notions, words, lemmas, and forms from WordNet synsets |
466 | } | 81 | readWordNetSynsets(); |
467 | 82 | ||
468 | int divider = line.find_first_of(" "); | 83 | // Reads adjective positioning WordNet data |
469 | std::string word = line.substr(0, divider); | 84 | readAdjectivePositioning(); |
470 | line = line.substr(divider+1); | 85 | |
471 | char type = line[0]; | 86 | // Counts the number of URLs ImageNet has per notion |
472 | 87 | readImageNetUrls(); | |
473 | if (line[1] == '?') | 88 | |
474 | { | 89 | // Creates a word by WordNet sense key lookup table |
475 | line.erase(0, 4); | 90 | readWordNetSenseKeys(); |
476 | } else { | 91 | |
477 | line.erase(0, 3); | 92 | // Creates groups and frames from VerbNet data |
478 | } | 93 | readVerbNet(); |
479 | 94 | ||
480 | std::vector<std::string> forms; | 95 | // Creates forms and inflections from AGID. To reduce the amount of forms |
481 | while (!line.empty()) | 96 | // created, we do this after most lemmas that need inflecting have been |
482 | { | 97 | // created through other means, and then only generate forms for |
483 | std::string inflection; | 98 | // inflections of already-existing lemmas. The exception to this regards |
484 | if ((divider = line.find(" | ")) != std::string::npos) | 99 | // verb lemmas. If a verb lemma in AGID either does not exist yet, or does |
485 | { | 100 | // exist but is not related to any words that are related to verb notions, |
486 | inflection = line.substr(0, divider); | 101 | // then a notion and a word are generated and the form generation proceeds |
487 | line = line.substr(divider + 3); | 102 | // as usual. |
488 | } else { | 103 | readAgidInflections(); |
489 | inflection = line; | 104 | |
490 | line = ""; | 105 | // Reads in prepositions and the is_a relationship |
491 | } | 106 | readPrepositions(); |
492 | 107 | ||
493 | if ((divider = inflection.find_first_of(",?")) != std::string::npos) | 108 | // Creates pronunciations from CMUDICT. To reduce the amount of |
494 | { | 109 | // pronunciations created, we do this after all forms have been created, |
495 | inflection = inflection.substr(0, divider); | 110 | // and then only generate pronunciations for already-existing forms. |
496 | } | 111 | readCmudictPronunciations(); |
497 | 112 | ||
498 | forms.push_back(inflection); | 113 | // Writes the database schema |
114 | writeSchema(); | ||
115 | |||
116 | // Dumps data to the database | ||
117 | dumpObjects(); | ||
118 | |||
119 | // Populates the antonymy relationship from WordNet | ||
120 | readWordNetAntonymy(); | ||
121 | |||
122 | // Populates the variation relationship from WordNet | ||
123 | readWordNetVariation(); | ||
124 | |||
125 | // Populates the usage, topicality, and regionality relationships from | ||
126 | // WordNet | ||
127 | readWordNetClasses(); | ||
128 | |||
129 | // Populates the causality relationship from WordNet | ||
130 | readWordNetCausality(); | ||
131 | |||
132 | // Populates the entailment relationship from WordNet | ||
133 | readWordNetEntailment(); | ||
134 | |||
135 | // Populates the hypernymy relationship from WordNet | ||
136 | readWordNetHypernymy(); | ||
137 | |||
138 | // Populates the instantiation relationship from WordNet | ||
139 | readWordNetInstantiation(); | ||
140 | |||
141 | // Populates the member meronymy relationship from WordNet | ||
142 | readWordNetMemberMeronymy(); | ||
143 | |||
144 | // Populates the part meronymy relationship from WordNet | ||
145 | readWordNetPartMeronymy(); | ||
146 | |||
147 | // Populates the substance meronymy relationship from WordNet | ||
148 | readWordNetSubstanceMeronymy(); | ||
149 | |||
150 | // Populates the pertainymy and mannernymy relationships from WordNet | ||
151 | readWordNetPertainymy(); | ||
152 | |||
153 | // Populates the specification relationship from WordNet | ||
154 | readWordNetSpecification(); | ||
155 | |||
156 | // Populates the adjective similarity relationship from WordNet | ||
157 | readWordNetSimilarity(); | ||
158 | |||
159 | |||
160 | |||
161 | |||
162 | |||
163 | |||
164 | |||
165 | |||
499 | } | 166 | } |
500 | 167 | ||
501 | switch (type) | 168 | void generator::readWordNetSynsets() |
502 | { | 169 | { |
503 | case 'V': | 170 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); |
171 | progress ppgs("Reading synsets from WordNet...", lines.size()); | ||
172 | |||
173 | for (std::string line : lines) | ||
504 | { | 174 | { |
505 | verb_t v; | 175 | ppgs.update(); |
506 | v.infinitive = word; | 176 | |
507 | if (forms.size() == 4) | 177 | std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); |
508 | { | 178 | std::smatch relation_data; |
509 | v.past_tense = forms[0]; | 179 | if (!std::regex_search(line, relation_data, relation)) |
510 | v.past_participle = forms[1]; | 180 | { |
511 | v.ing_form = forms[2]; | 181 | continue; |
512 | v.s_form = forms[3]; | ||
513 | } else if (forms.size() == 3) | ||
514 | { | ||
515 | v.past_tense = forms[0]; | ||
516 | v.past_participle = forms[0]; | ||
517 | v.ing_form = forms[1]; | ||
518 | v.s_form = forms[2]; | ||
519 | } else if (forms.size() == 8) | ||
520 | { | ||
521 | // As of AGID 2014.08.11, this is only "to be" | ||
522 | v.past_tense = forms[0]; | ||
523 | v.past_participle = forms[2]; | ||
524 | v.ing_form = forms[3]; | ||
525 | v.s_form = forms[4]; | ||
526 | } else { | ||
527 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
528 | // - may and shall do not conjugate the way we want them to | ||
529 | // - methinks only has a past tense and is an outlier | ||
530 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
531 | std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
532 | } | 182 | } |
533 | 183 | ||
534 | verbs[word] = v; | 184 | int synset_id = std::stoi(relation_data[1]); |
535 | 185 | int wnum = std::stoi(relation_data[2]); | |
536 | break; | 186 | std::string text = relation_data[3]; |
537 | } | 187 | int tag_count = std::stoi(relation_data[4]); |
538 | 188 | size_t word_it; | |
539 | case 'A': | 189 | while ((word_it = text.find("''")) != std::string::npos) |
540 | { | ||
541 | adjective_t adj; | ||
542 | adj.base = word; | ||
543 | if (forms.size() == 2) | ||
544 | { | 190 | { |
545 | adj.comparative = forms[0]; | 191 | text.erase(word_it, 1); |
546 | adj.superlative = forms[1]; | ||
547 | } else { | ||
548 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | ||
549 | std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
550 | } | 192 | } |
551 | 193 | ||
552 | adjectives[word] = adj; | 194 | // The WordNet data does contain duplicates, so we need to check that we |
553 | 195 | // haven't already created this word. | |
554 | break; | 196 | std::pair<int, int> lookup(synset_id, wnum); |
555 | } | 197 | if (!wordByWnidAndWnum_.count(lookup)) |
556 | |||
557 | case 'N': | ||
558 | { | ||
559 | noun_t n; | ||
560 | n.singular = word; | ||
561 | if (forms.size() == 1) | ||
562 | { | 198 | { |
563 | n.plural = forms[0]; | 199 | notion& synset = lookupOrCreateNotion(synset_id); |
564 | } else { | 200 | lemma& lex = lookupOrCreateLemma(text); |
565 | // As of AGID 2014.08.11, this is non-existent. | 201 | word& entry = createWord(synset, lex, tag_count); |
566 | std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; | 202 | |
203 | wordByWnidAndWnum_[lookup] = &entry; | ||
567 | } | 204 | } |
568 | |||
569 | nouns[word] = n; | ||
570 | |||
571 | break; | ||
572 | } | 205 | } |
573 | } | 206 | } |
574 | } | ||
575 | |||
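For context on the parsing above: each AGID `infl.txt` entry is a headword, a part-of-speech code (`V`, `A`, or `N`, optionally followed by `?`), and inflection slots separated by ` | `, with variants within a slot separated by `, ` and uncertain forms marked with `?`. A representative verb line (the inflections here are illustrative, not quoted from AGID) looks like:

```
abide V: abode, abided | abidden?, abode | abiding | abides
```

The code takes the first variant in each slot and truncates it at the first `,` or `?`, so this line would yield the past tense "abode", past participle "abidden", ing-form "abiding", and s-form "abides".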
576 | // Pronunciations |||
577 | std::cout << "Reading pronunciations..." << std::endl; | ||
578 | |||
579 | std::ifstream pronfile(argv[4]); | ||
580 | if (!pronfile.is_open()) | ||
581 | { | ||
582 | std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl; | ||
583 | print_usage(); | ||
584 | } | ||
585 | |||
586 | for (;;) | ||
587 | { | ||
588 | std::string line; | ||
589 | if (!getline(pronfile, line)) | ||
590 | { | ||
591 | break; | ||
592 | } | ||
593 | |||
594 | if (line.back() == '\r') | ||
595 | { | ||
596 | line.pop_back(); | ||
597 | } | ||
598 | 207 | ||
599 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | 208 | void generator::readAdjectivePositioning() |
600 | std::smatch phoneme_data; | ||
601 | if (std::regex_search(line, phoneme_data, phoneme)) | ||
602 | { | 209 | { |
603 | std::string canonical(phoneme_data[1]); | 210 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl")); |
604 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | 211 | progress ppgs("Reading adjective positionings from WordNet...", lines.size()); |
605 | |||
606 | std::string phonemes = phoneme_data[2]; | ||
607 | auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " "); | ||
608 | auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) { | ||
609 | return phoneme.find("1") != std::string::npos; | ||
610 | }); | ||
611 | 212 | ||
612 | pronunciation_t p; | 213 | for (std::string line : lines) |
613 | p.phonemes = phonemes; | ||
614 | |||
615 | // Rhyme detection | ||
616 | if (phemstrt != std::end(phoneme_set)) | ||
617 | { | 214 | { |
618 | std::stringstream rhymer; | 215 | ppgs.update(); |
619 | for (auto it = phemstrt; it != std::end(phoneme_set); it++) | ||
620 | { | ||
621 | std::string naked; | ||
622 | std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) { | ||
623 | return isdigit(ch); | ||
624 | }); | ||
625 | |||
626 | if (it != phemstrt) | ||
627 | { | ||
628 | rhymer << " "; | ||
629 | } | ||
630 | |||
631 | rhymer << naked; | ||
632 | } | ||
633 | 216 | ||
634 | p.rhyme = rhymer.str(); | 217 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); |
635 | 218 | std::smatch relation_data; | |
636 | if (phemstrt != std::begin(phoneme_set)) | 219 | if (!std::regex_search(line, relation_data, relation)) |
637 | { | 220 | { |
638 | phemstrt--; | 221 | continue; |
639 | p.prerhyme = *phemstrt; | ||
640 | } else { | ||
641 | p.prerhyme = ""; | ||
642 | } | 222 | } |
643 | } else { | ||
644 | p.prerhyme = ""; | ||
645 | p.rhyme = ""; | ||
646 | } | ||
647 | 223 | ||
648 | // Syllable/stress | 224 | int synset_id = stoi(relation_data[1]); |
649 | for (auto phm : phoneme_set) | 225 | int wnum = stoi(relation_data[2]); |
650 | { | 226 | std::string adjpos_str = relation_data[3]; |
651 | if (isdigit(phm.back())) | ||
652 | { | ||
653 | // It's a vowel! | ||
654 | p.syllables++; | ||
655 | 227 | ||
656 | if (phm.back() == '1') | 228 | std::pair<int, int> lookup(synset_id, wnum); |
229 | if (wordByWnidAndWnum_.count(lookup)) | ||
230 | { | ||
231 | word& adj = *wordByWnidAndWnum_.at(lookup); | ||
232 | |||
233 | if (adjpos_str == "p") | ||
234 | { | ||
235 | adj.setAdjectivePosition(positioning::predicate); | ||
236 | } else if (adjpos_str == "a") | ||
237 | { | ||
238 | adj.setAdjectivePosition(positioning::attributive); | ||
239 | } else if (adjpos_str == "i") | ||
657 | { | 240 | { |
658 | p.stress.push_back('1'); | 241 | adj.setAdjectivePosition(positioning::postnominal); |
659 | } else { | 242 | } else { |
660 | p.stress.push_back('0'); | 243 | // Can't happen because of how we specified the regex. |
244 | assert(false); | ||
661 | } | 245 | } |
662 | } | 246 | } |
663 | } | 247 | } |
664 | |||
665 | pronunciations[canonical].insert(p); | ||
666 | } | ||
667 | } | ||
668 | |||
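As a worked example of the rhyme and stress extraction above, using the real CMUDICT entry for "computer":

```
COMPUTER  K AH0 M P Y UW1 T ER0
          first phoneme carrying primary stress ("1"): UW1
          rhyme:     "UW T ER"  (from UW1 to the end, stress digits stripped)
          prerhyme:  "Y"        (the phoneme immediately before the rhyme)
          syllables: 3          (AH0, UW1, ER0)
          stress:    "010"
```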
669 | // Images | ||
670 | std::cout << "Reading images..." << std::endl; | ||
671 | |||
672 | std::ifstream imagefile(argv[5]); | ||
673 | if (!imagefile.is_open()) | ||
674 | { | ||
675 | std::cout << "Could not open ImageNet file: " << argv[5] << std::endl; | ||
676 | print_usage(); | ||
677 | } | ||
678 | |||
679 | for (;;) | ||
680 | { | ||
681 | std::string line; | ||
682 | if (!getline(imagefile, line)) | ||
683 | { | ||
684 | break; | ||
685 | } | ||
686 | |||
687 | if (line.back() == '\r') | ||
688 | { | ||
689 | line.pop_back(); | ||
690 | } | ||
691 | |||
692 | std::string wnid_s = line.substr(1, 8); | ||
693 | int wnid = stoi(wnid_s) + 100000000; | ||
694 | images[wnid]++; | ||
695 | } | ||
696 | |||
697 | imagefile.close(); | ||
698 | |||
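To make the WNID arithmetic above concrete: each line of the ImageNet urls file begins with an identifier such as `n02084071_1234` followed by a URL (this particular identifier and URL are illustrative). `line.substr(1, 8)` pulls out the zero-padded synset offset, and adding 100000000 shifts it into the ID space that WordNet's prolog files use for noun synsets:

```
n02084071_1234  http://example.com/image.jpg
 ^^^^^^^^
 substr(1, 8) = "02084071"  →  stoi → 2084071  →  + 100000000 → 102084071
```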
699 | // Start writing output | ||
700 | std::cout << "Writing schema..." << std::endl; | ||
701 | |||
702 | sqlite3* ppdb; | ||
703 | if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | ||
704 | { | ||
705 | std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; | ||
706 | print_usage(); | ||
707 | } | ||
708 | |||
709 | std::ifstream schemafile("schema.sql"); | ||
710 | if (!schemafile.is_open()) | ||
711 | { | ||
712 | std::cout << "Could not find schema file" << std::endl; | ||
713 | print_usage(); | ||
714 | } | ||
715 | |||
716 | std::stringstream schemabuilder; | ||
717 | for (;;) | ||
718 | { | ||
719 | std::string line; | ||
720 | if (!getline(schemafile, line)) | ||
721 | { | ||
722 | break; | ||
723 | } | ||
724 | |||
725 | if (line.back() == '\r') | ||
726 | { | ||
727 | line.pop_back(); | ||
728 | } | ||
729 | |||
730 | schemabuilder << line << std::endl; | ||
731 | } | ||
732 | |||
733 | std::string schema = schemabuilder.str(); | ||
734 | while (!schema.empty()) | ||
735 | { | ||
736 | std::string query; | ||
737 | int divider = schema.find(";"); | ||
738 | if (divider != std::string::npos) | ||
739 | { | ||
740 | query = schema.substr(0, divider+1); | ||
741 | schema = schema.substr(divider+2); | ||
742 | } else { | ||
743 | break; | ||
744 | } | 248 | } |
745 | 249 | ||
746 | sqlite3_stmt* schmstmt; | 250 | void generator::readImageNetUrls() |
747 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) | ||
748 | { | 251 | { |
749 | db_error(ppdb, query); | 252 | // The ImageNet datafile is so large that it is unreasonable and |
750 | } | 253 | // unnecessary to read it into memory; instead, we will parse each line as |
751 | 254 | // we read it. This has the caveat that we cannot display a progress bar. | |
752 | if (sqlite3_step(schmstmt) != SQLITE_DONE) | 255 | std::cout << "Reading image counts from ImageNet..." << std::endl; |
753 | { | ||
754 | db_error(ppdb, query); | ||
755 | } | ||
756 | |||
757 | sqlite3_finalize(schmstmt); | ||
758 | } | ||
759 | |||
760 | std::cout << "Writing prepositions..." << std::endl; | ||
761 | std::ifstream prepfile("prepositions.txt"); | ||
762 | if (!prepfile.is_open()) | ||
763 | { | ||
764 | std::cout << "Could not find prepositions file" << std::endl; | ||
765 | print_usage(); | ||
766 | } | ||
767 | |||
768 | for (;;) | ||
769 | { | ||
770 | std::string line; | ||
771 | if (!getline(prepfile, line)) | ||
772 | { | ||
773 | break; | ||
774 | } | ||
775 | |||
776 | if (line.back() == '\r') | ||
777 | { | ||
778 | line.pop_back(); | ||
779 | } | ||
780 | |||
781 | std::regex relation("^([^:]+): (.+)"); | ||
782 | std::smatch relation_data; | ||
783 | std::regex_search(line, relation_data, relation); | ||
784 | std::string prep = relation_data[1]; | ||
785 | std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", "); | ||
786 | |||
787 | std::string query("INSERT INTO prepositions (form) VALUES (?)"); | ||
788 | sqlite3_stmt* ppstmt; | ||
789 | |||
790 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
791 | { | ||
792 | db_error(ppdb, query); | ||
793 | } | ||
794 | |||
795 | sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT); | ||
796 | |||
797 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
798 | { | ||
799 | db_error(ppdb, query); | ||
800 | } | ||
801 | |||
802 | sqlite3_finalize(ppstmt); | ||
803 | |||
804 | query = "SELECT last_insert_rowid()"; | ||
805 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
806 | { | ||
807 | db_error(ppdb, query); | ||
808 | } | ||
809 | |||
810 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
811 | { | ||
812 | db_error(ppdb, query); | ||
813 | } | ||
814 | |||
815 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
816 | sqlite3_finalize(ppstmt); | ||
817 | |||
818 | for (auto group : groups) | ||
819 | { | ||
820 | query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)"; | ||
821 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
822 | { | ||
823 | db_error(ppdb, query); | ||
824 | } | ||
825 | 256 | ||
826 | sqlite3_bind_int(ppstmt, 1, rowid); | 257 | std::ifstream file(imageNetPath_); |
827 | sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); | 258 | if (!file) |
828 | |||
829 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
830 | { | 259 | { |
831 | db_error(ppdb, query); | 260 | throw std::invalid_argument("Could not find file " + imageNetPath_); |
832 | } | 261 | } |
833 | |||
834 | sqlite3_finalize(ppstmt); | ||
835 | } | ||
836 | } | ||
837 | |||
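The `prepositions.txt` lines parsed above follow the `preposition: group, group, ...` pattern matched by the regex; an illustrative line (the group names are made up for the example):

```
against: opposition, contact
```

Each preposition gets a row in `prepositions`, plus one `preposition_groups` row per listed group.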
838 | 262 | ||
839 | { | 263 | std::string line; |
840 | progress ppgs("Writing verbs...", verbs.size()); | 264 | while (std::getline(file, line)) |
841 | for (auto& mapping : verbs) | ||
842 | { | ||
843 | sqlite3_stmt* ppstmt; | ||
844 | std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); | ||
845 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
846 | { | ||
847 | db_error(ppdb, query); | ||
848 | } | ||
849 | |||
850 | sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT); | ||
851 | sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT); | ||
852 | sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT); | ||
853 | sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT); | ||
854 | sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT); | ||
855 | |||
856 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
857 | { | ||
858 | db_error(ppdb, query); | ||
859 | } | ||
860 | |||
861 | sqlite3_finalize(ppstmt); | ||
862 | |||
863 | std::string canonical(mapping.second.infinitive); | ||
864 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
865 | if (pronunciations.count(canonical) == 1) | ||
866 | { | 265 | { |
867 | query = "SELECT last_insert_rowid()"; | 266 | if (line.back() == '\r') |
868 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
869 | { | 267 | { |
870 | db_error(ppdb, query); | 268 | line.pop_back(); |
871 | } | 269 | } |
872 | 270 | ||
873 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | 271 | std::string wnid_s = line.substr(1, 8); |
272 | int wnid = stoi(wnid_s) + 100000000; | ||
273 | if (notionByWnid_.count(wnid)) | ||
874 | { | 274 | { |
875 | db_error(ppdb, query); | 275 | // We know that this notion has a wnid and is a noun. |
876 | } | 276 | notionByWnid_.at(wnid)->incrementNumOfImages(); |
877 | |||
878 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
879 | |||
880 | sqlite3_finalize(ppstmt); | ||
881 | |||
882 | mapping.second.id = rowid; | ||
883 | |||
884 | for (auto pronunciation : pronunciations[canonical]) | ||
885 | { | ||
886 | if (!pronunciation.rhyme.empty()) | ||
887 | { | ||
888 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | ||
889 | } else { | ||
890 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | ||
891 | } | ||
892 | |||
893 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
894 | { | ||
895 | db_error(ppdb, query); | ||
896 | } | ||
897 | |||
898 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
899 | sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); | ||
900 | sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); | ||
901 | sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); | ||
902 | |||
903 | if (!pronunciation.rhyme.empty()) | ||
904 | { | ||
905 | sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); | ||
906 | sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); | ||
907 | } | ||
908 | |||
909 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
910 | { | ||
911 | db_error(ppdb, query); | ||
912 | } | ||
913 | |||
914 | sqlite3_finalize(ppstmt); | ||
915 | } | 277 | } |
916 | } | 278 | } |
917 | |||
918 | ppgs.update(); | ||
919 | } | 279 | } |
920 | } | 280 | |
921 | 281 | void generator::readWordNetSenseKeys() | |
922 | { | ||
923 | progress ppgs("Writing verb frames...", groups.size()); | ||
924 | for (auto& mapping : groups) | ||
925 | { | 282 | { |
926 | std::list<json> roledatal; | 283 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl")); |
927 | std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { | 284 | progress ppgs("Reading sense keys from WordNet...", lines.size()); |
928 | json role; | ||
929 | role["type"] = r.first; | ||
930 | role["selrestrs"] = export_selrestrs(r.second); | ||
931 | |||
932 | return role; | ||
933 | }); | ||
934 | |||
935 | json roledata(roledatal); | ||
936 | std::string rdm = roledata.dump(); | ||
937 | |||
938 | sqlite3_stmt* ppstmt; | ||
939 | std::string query("INSERT INTO groups (data) VALUES (?)"); | ||
940 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
941 | { | ||
942 | db_error(ppdb, query); | ||
943 | } | ||
944 | |||
945 | sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT); | ||
946 | |||
947 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
948 | { | ||
949 | db_error(ppdb, query); | ||
950 | } | ||
951 | 285 | ||
952 | sqlite3_finalize(ppstmt); | 286 | for (std::string line : lines) |
953 | |||
954 | query = "SELECT last_insert_rowid()"; | ||
955 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
956 | { | ||
957 | db_error(ppdb, query); | ||
958 | } | ||
959 | |||
960 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
961 | { | ||
962 | db_error(ppdb, query); | ||
963 | } | ||
964 | |||
965 | int gid = sqlite3_column_int(ppstmt, 0); | ||
966 | sqlite3_finalize(ppstmt); | ||
967 | |||
968 | for (auto frame : mapping.second.frames) | ||
969 | { | 287 | { |
970 | std::list<json> fdatap; | 288 | ppgs.update(); |
971 | std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) { | ||
972 | json part; | ||
973 | |||
974 | switch (fp.type) | ||
975 | { | ||
976 | case framepart_t::type_t::np: | ||
977 | { | ||
978 | part["type"] = "np"; | ||
979 | part["role"] = fp.role; | ||
980 | part["selrestrs"] = export_selrestrs(fp.selrestrs); | ||
981 | part["synrestrs"] = fp.synrestrs; | ||
982 | |||
983 | break; | ||
984 | } | ||
985 | |||
986 | case framepart_t::type_t::pp: | ||
987 | { | ||
988 | part["type"] = "pp"; | ||
989 | part["values"] = fp.choices; | ||
990 | part["preprestrs"] = fp.preprestrs; | ||
991 | |||
992 | break; | ||
993 | } | ||
994 | |||
995 | case framepart_t::type_t::v: | ||
996 | { | ||
997 | part["type"] = "v"; | ||
998 | |||
999 | break; | ||
1000 | } | ||
1001 | |||
1002 | case framepart_t::type_t::adj: | ||
1003 | { | ||
1004 | part["type"] = "adj"; | ||
1005 | |||
1006 | break; | ||
1007 | } | ||
1008 | |||
1009 | case framepart_t::type_t::adv: | ||
1010 | { | ||
1011 | part["type"] = "adv"; | ||
1012 | |||
1013 | break; | ||
1014 | } | ||
1015 | |||
1016 | case framepart_t::type_t::lex: | ||
1017 | { | ||
1018 | part["type"] = "lex"; | ||
1019 | part["value"] = fp.lexval; | ||
1020 | |||
1021 | break; | ||
1022 | } | ||
1023 | } | ||
1024 | |||
1025 | return part; | ||
1026 | }); | ||
1027 | |||
1028 | json fdata(fdatap); | ||
1029 | std::string marshall = fdata.dump(); | ||
1030 | |||
1031 | query = "INSERT INTO frames (group_id, data) VALUES (?, ?)"; | ||
1032 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1033 | { | ||
1034 | db_error(ppdb, query); | ||
1035 | } | ||
1036 | |||
1037 | sqlite3_bind_int(ppstmt, 1, gid); | ||
1038 | sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT); | ||
1039 | 289 | ||
1040 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 290 | // We only actually need to lookup verbs by sense key so we'll just |
291 | // ignore everything that isn't a verb. | ||
292 | std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$"); | ||
293 | std::smatch relation_data; | ||
294 | if (!std::regex_search(line, relation_data, relation)) | ||
1041 | { | 295 | { |
1042 | db_error(ppdb, query); | 296 | continue; |
1043 | } | 297 | } |
298 | |||
299 | int synset_id = stoi(relation_data[1]); | ||
300 | int wnum = stoi(relation_data[2]); | ||
301 | std::string sense_key = relation_data[3]; | ||
1044 | 302 | ||
1045 | sqlite3_finalize(ppstmt); | 303 | // We are treating this mapping as injective, which is not entirely |
1046 | } | 304 | // accurate. First, the WordNet table contains duplicate rows, so those |
1047 | 305 | // need to be ignored. More importantly, a small number of sense keys | |
1048 | for (auto member : mapping.second.members) | 306 | // (one for each letter of the Latin alphabet, plus 9 other words) each |
1049 | { | 307 | // map to two different words in the same synset which differ only by |
1050 | if (verbs.count(member) == 1) | 308 | // capitalization. Luckily, none of these exceptions are verbs, so we |
309 | // can pretend that the mapping is injective. | ||
310 | if (!wnSenseKeys_.count(sense_key)) | ||
1051 | { | 311 | { |
1052 | auto& v = verbs[member]; | 312 | std::pair<int, int> lookup(synset_id, wnum); |
1053 | 313 | if (wordByWnidAndWnum_.count(lookup)) | |
1054 | query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)"; | ||
1055 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1056 | { | ||
1057 | db_error(ppdb, query); | ||
1058 | } | ||
1059 | |||
1060 | sqlite3_bind_int(ppstmt, 1, v.id); | ||
1061 | sqlite3_bind_int(ppstmt, 2, gid); | ||
1062 | |||
1063 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1064 | { | 314 | { |
1065 | db_error(ppdb, query); | 315 | wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup); |
1066 | } | 316 | } |
1067 | |||
1068 | sqlite3_finalize(ppstmt); | ||
1069 | } | 317 | } |
1070 | } | 318 | } |
1071 | |||
1072 | ppgs.update(); | ||
1073 | } | 319 | } |
1074 | } | 320 | |
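The `wn_sk.pl` rows matched above pair a synset ID and word number with a WordNet sense key; a representative verb row (the values are illustrative) looks like:

```
sk(200015400,1,'breathe%2:29:00::').
```

Only synset IDs beginning with 2 (verbs) pass the regex, so sense keys for other parts of speech are skipped.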
1075 | 321 | void generator::readVerbNet() | |
1076 | // Get nouns/adjectives/adverbs from WordNet | ||
1077 | // Useful relations: | ||
1078 | // - s: master list | ||
1079 | // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) | ||
1080 | // - at: variation (e.g. a measurement can be standard or nonstandard) | ||
1081 | // - der: derivation (e.g. happy/happily, happily/happy) | ||
1082 | // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) | ||
1083 | // - ins: instantiation (do we need this? let's see) | ||
1084 | // - mm: member meronymy/holonymy (e.g. family/mother, family/child) | ||
1085 | // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) | ||
1086 | // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) | ||
1087 | // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) | ||
1088 | // mannernymy (e.g. something done quickly is done in a manner that is quick) | ||
1089 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) | ||
1090 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) | ||
1091 | // - syntax: positioning flags for some adjectives | ||
1092 | std::string wnpref {argv[3]}; | ||
1093 | if (wnpref.back() != '/') | ||
1094 | { | ||
1095 | wnpref += '/'; | ||
1096 | } | ||
1097 | |||
1098 | // s table | ||
1099 | { | ||
1100 | std::ifstream wnsfile(wnpref + "wn_s.pl"); | ||
1101 | if (!wnsfile.is_open()) | ||
1102 | { | 322 | { |
1103 | std::cout << "Invalid WordNet data directory." << std::endl; | 323 | std::cout << "Reading frames from VerbNet..." << std::endl; |
1104 | print_usage(); | ||
1105 | } | ||
1106 | 324 | ||
1107 | std::list<std::string> lines; | 325 | DIR* dir; |
1108 | for (;;) | 326 | if ((dir = opendir(verbNetPath_.c_str())) == nullptr) |
1109 | { | ||
1110 | std::string line; | ||
1111 | if (!getline(wnsfile, line)) | ||
1112 | { | 327 | { |
1113 | break; | 328 | throw std::invalid_argument("Invalid VerbNet data directory"); |
1114 | } | 329 | } |
1115 | 330 | ||
1116 | if (line.back() == '\r') | 331 | struct dirent* ent; |
1117 | { | 332 | while ((ent = readdir(dir)) != nullptr) |
1118 | line.pop_back(); | ||
1119 | } | ||
1120 | |||
1121 | lines.push_back(line); | ||
1122 | } | ||
1123 | |||
1124 | progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); | ||
1125 | for (auto line : lines) | ||
1126 | { | ||
1127 | ppgs.update(); | ||
1128 | |||
1129 | std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$"); | ||
1130 | std::smatch relation_data; | ||
1131 | if (!std::regex_search(line, relation_data, relation)) | ||
1132 | { | 333 | { |
1133 | continue; | 334 | std::string filename(verbNetPath_); |
1134 | } | 335 | |
336 | if (filename.back() != '/') | ||
337 | { | ||
338 | filename += '/'; | ||
339 | } | ||
1135 | 340 | ||
1136 | int synset_id = stoi(relation_data[1]); | 341 | filename += ent->d_name; |
1137 | int wnum = stoi(relation_data[2]); | ||
1138 | std::string word = relation_data[3]; | ||
1139 | size_t word_it; | ||
1140 | while ((word_it = word.find("''")) != std::string::npos) | ||
1141 | { | ||
1142 | word.erase(word_it, 1); | ||
1143 | } | ||
1144 | 342 | ||
1145 | std::string query; | 343 | if (filename.rfind(".xml") != filename.size() - 4) |
1146 | switch (synset_id / 100000000) | ||
1147 | { | ||
1148 | case 1: // Noun | ||
1149 | { | 344 | { |
1150 | if (nouns.count(word) == 1) | 345 | continue; |
1151 | { | ||
1152 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)"; | ||
1153 | } else { | ||
1154 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)"; | ||
1155 | } | ||
1156 | |||
1157 | break; | ||
1158 | } | 346 | } |
1159 | 347 | ||
1160 | case 2: // Verb | 348 | xmlDocPtr doc = xmlParseFile(filename.c_str()); |
349 | if (doc == nullptr) | ||
1161 | { | 350 | { |
1162 | // Ignore | 351 | throw std::logic_error("Error opening " + filename); |
1163 | |||
1164 | break; | ||
1165 | } | 352 | } |
1166 | 353 | ||
1167 | case 3: // Adjective | 354 | xmlNodePtr top = xmlDocGetRootElement(doc); |
355 | if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS")))) | ||
1168 | { | 356 | { |
1169 | if (adjectives.count(word) == 1) | 357 | throw std::logic_error("Bad VerbNet file format: " + filename); |
1170 | { | ||
1171 | query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; | ||
1172 | } else { | ||
1173 | query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)"; | ||
1174 | } | ||
1175 | |||
1176 | break; | ||
1177 | } | 358 | } |
1178 | 359 | ||
1179 | case 4: // Adverb | 360 | try |
1180 | { | 361 | { |
1181 | if (adjectives.count(word) == 1) | 362 | createGroup(top); |
1182 | { | 363 | } catch (const std::exception& e) |
1183 | query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; | 364 | { |
1184 | } else { | 365 | std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); |
1185 | query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)"; | ||
1186 | } | ||
1187 | |||
1188 | break; | ||
1189 | } | 366 | } |
1190 | } | 367 | } |
368 | |||
369 | closedir(dir); | ||
370 | } | ||
1191 | 371 | ||
1192 | sqlite3_stmt* ppstmt; | 372 | void generator::readAgidInflections() |
1193 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | 373 | { |
374 | std::list<std::string> lines(readFile(agidPath_)); | ||
375 | progress ppgs("Reading inflections from AGID...", lines.size()); | ||
376 | |||
377 | for (std::string line : lines) | ||
1194 | { | 378 | { |
1195 | db_error(ppdb, query); | 379 | ppgs.update(); |
1196 | } | 380 | |
381 | int divider = line.find_first_of(" "); | ||
382 | std::string infinitive = line.substr(0, divider); | ||
383 | line = line.substr(divider+1); | ||
384 | char type = line[0]; | ||
1197 | 385 | ||
1198 | sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); | 386 | if (line[1] == '?') |
1199 | switch (synset_id / 100000000) | ||
1200 | { | ||
1201 | case 1: // Noun | ||
1202 | { | 387 | { |
1203 | sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { | 388 | line.erase(0, 4); |
1204 | return isupper(ch); | 389 | } else { |
1205 | }) ? 1 : 0)); | 390 | line.erase(0, 3); |
1206 | |||
1207 | sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); | ||
1208 | sqlite3_bind_int(ppstmt, 4, images[synset_id]); | ||
1209 | sqlite3_bind_int(ppstmt, 5, synset_id); | ||
1210 | |||
1211 | if (nouns.count(word) == 1) | ||
1212 | { | ||
1213 | sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT); | ||
1214 | } | ||
1215 | |||
1216 | break; | ||
1217 | } | 391 | } |
1218 | 392 | ||
1219 | case 3: // Adjective | 393 | if (!lemmaByBaseForm_.count(infinitive) && (type != 'V')) |
1220 | case 4: // Adverb | ||
1221 | { | 394 | { |
1222 | sqlite3_bind_int(ppstmt, 2, verbly::split<std::list<std::string>>(word, " ").size()); | 395 | continue; |
1223 | 396 | } | |
1224 | if (adjectives.count(word) == 1) | 397 | |
398 | lemma& curLemma = lookupOrCreateLemma(infinitive); | ||
399 | |||
400 | auto forms = split<std::vector<std::string>>(line, " | "); | ||
401 | for (std::string& inflForm : forms) | ||
402 | { | ||
403 | int sympos = inflForm.find_first_of(",?"); | ||
404 | if (sympos != std::string::npos) | ||
1225 | { | 405 | { |
1226 | sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); | 406 | inflForm = inflForm.substr(0, sympos); |
1227 | sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT); | ||
1228 | } | 407 | } |
1229 | |||
1230 | break; | ||
1231 | } | 408 | } |
1232 | } | ||
1233 | 409 | ||
1234 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 410 | switch (type) |
1235 | { | ||
1236 | db_error(ppdb, query); | ||
1237 | } | ||
1238 | |||
1239 | sqlite3_finalize(ppstmt); | ||
1240 | |||
1241 | query = "SELECT last_insert_rowid()"; | ||
1242 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1243 | { | ||
1244 | db_error(ppdb, query); | ||
1245 | } | ||
1246 | |||
1247 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
1248 | { | ||
1249 | db_error(ppdb, query); | ||
1250 | } | ||
1251 | |||
1252 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
1253 | wn[synset_id][wnum] = rowid; | ||
1254 | |||
1255 | sqlite3_finalize(ppstmt); | ||
1256 | |||
1257 | std::string canonical(word); | ||
1258 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
1259 | if (pronunciations.count(canonical) == 1) | ||
1260 | { | ||
1261 | for (auto pronunciation : pronunciations[canonical]) | ||
1262 | { | 411 | { |
1263 | switch (synset_id / 100000000) | 412 | case 'V': |
1264 | { | 413 | { |
1265 | case 1: // Noun | 414 | if (forms.size() == 4) |
1266 | { | 415 | { |
1267 | if (!pronunciation.rhyme.empty()) | 416 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
1268 | { | 417 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1])); |
1269 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | 418 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2])); |
1270 | } else { | 419 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3])); |
1271 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | 420 | } else if (forms.size() == 3) |
1272 | } | ||
1273 | |||
1274 | break; | ||
1275 | } | ||
1276 | |||
1277 | case 3: // Adjective | ||
1278 | { | 421 | { |
1279 | if (!pronunciation.rhyme.empty()) | 422 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
1280 | { | 423 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0])); |
1281 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | 424 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1])); |
1282 | } else { | 425 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2])); |
1283 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | 426 | } else if (forms.size() == 8) |
1284 | } | 427 | { |
1285 | 428 | // As of AGID 2014.08.11, this is only "to be" | |
1286 | break; | 429 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
430 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2])); | ||
431 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3])); | ||
432 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4])); | ||
433 | } else { | ||
434 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
435 | // - may and shall do not conjugate the way we want them to | ||
436 | // - methinks only has a past tense and is an outlier | ||
437 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
438 | std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | ||
1287 | } | 439 | } |
1288 | 440 | ||
1289 | case 4: // Adverb | 441 | // For verbs in particular, we sometimes create a notion and a word
442 | // from inflection data alone. Specifically, we do so when no verb | ||
443 | // with the same infinitive form exists yet; that is, when the verb | ||
444 | // appears in the AGID data but not in either WordNet | ||
445 | // or VerbNet. | ||
446 | if (!wordsByBaseForm_.count(infinitive) | ||
447 | || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) { | ||
448 | return w->getNotion().getPartOfSpeech() == part_of_speech::verb; | ||
449 | })) | ||
1290 | { | 450 | { |
1291 | if (!pronunciation.rhyme.empty()) | 451 | notion& n = createNotion(part_of_speech::verb); |
1292 | { | 452 | createWord(n, curLemma); |
1293 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | ||
1294 | } else { | ||
1295 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | ||
1296 | } | ||
1297 | |||
1298 | break; | ||
1299 | } | 453 | } |
1300 | } | ||
1301 | |||
1302 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1303 | { | ||
1304 | db_error(ppdb, query); | ||
1305 | } | ||
1306 | |||
1307 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
1308 | sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); | ||
1309 | sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); | ||
1310 | sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); | ||
1311 | |||
1312 | if (!pronunciation.rhyme.empty()) | ||
1313 | { | ||
1314 | sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); | ||
1315 | sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); | ||
1316 | } | ||
1317 | 454 | ||
1318 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 455 | break; |
1319 | { | ||
1320 | db_error(ppdb, query); | ||
1321 | } | 456 | } |
1322 | |||
1323 | sqlite3_finalize(ppstmt); | ||
1324 | } | ||
1325 | } | ||
1326 | } | ||
1327 | } | ||
1328 | |||
1329 | // While we're working on synonyms | ||
1330 | { | ||
1331 | progress ppgs("Writing word synonyms...", wn.size()); | ||
1332 | for (auto sense : wn) | ||
1333 | { | ||
1334 | ppgs.update(); | ||
1335 | 457 | ||
1336 | for (auto word1 : sense.second) | 458 | case 'A': |
1337 | { | ||
1338 | for (auto word2 : sense.second) | ||
1339 | { | ||
1340 | if (word1 != word2) | ||
1341 | { | 459 | { |
1342 | std::string query; | 460 | if (forms.size() == 2) |
1343 | switch (sense.first / 100000000) | ||
1344 | { | 461 | { |
1345 | case 1: // Noun | 462 | curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0])); |
1346 | { | 463 | curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1])); |
1347 | query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | 464 | } else { |
1348 | 465 | // As of AGID 2014.08.11, the only such word is "only", whose sole extra form is "onliest"
1349 | break; | 466 | std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; |
1350 | } | 467 | } |
1351 | |||
1352 | case 2: // Verb | ||
1353 | { | ||
1354 | // Ignore | ||
1355 | |||
1356 | break; | ||
1357 | } | ||
1358 | |||
1359 | case 3: // Adjective | ||
1360 | { | ||
1361 | query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
1362 | 468 | ||
1363 | break; | 469 | break; |
1364 | } | 470 | } |
1365 | 471 | ||
1366 | case 4: // Adverb | 472 | case 'N': |
1367 | { | 473 | { |
1368 | query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | 474 | if (forms.size() == 1) |
1369 | |||
1370 | break; | ||
1371 | } | ||
1372 | } | ||
1373 | |||
1374 | sqlite3_stmt* ppstmt; | ||
1375 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1376 | { | ||
1377 | db_error(ppdb, query); | ||
1378 | } | ||
1379 | |||
1380 | sqlite3_bind_int(ppstmt, 1, word1.second); | ||
1381 | sqlite3_bind_int(ppstmt, 2, word2.second); | ||
1382 | |||
1383 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1384 | { | 475 | { |
1385 | db_error(ppdb, query); | 476 | curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0])); |
477 | } else { | ||
478 | // As of AGID 2014.08.11, this case does not occur. | ||
479 | std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | ||
1386 | } | 480 | } |
1387 | 481 | ||
1388 | sqlite3_finalize(ppstmt); | 482 | break; |
1389 | } | 483 | } |
1390 | } | 484 | } |
1391 | } | 485 | } |
1392 | } | 486 | } |
1393 | } | ||
1394 | |||
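For reference, the AGID branch above assumes each entry's forms are separated by " | ", with comma-separated variants truncated to the first one and "?" uncertainty markers dropped. A minimal, standalone sketch of that tokenization (not generator code; the sample entry is invented):

    #include <iostream>
    #include <string>
    #include <vector>

    // Split an AGID-style form list on " | ", keeping only the first
    // variant of each field and discarding anything after a ',' or '?'.
    std::vector<std::string> splitForms(const std::string& data)
    {
      std::vector<std::string> forms;
      std::string::size_type start = 0;

      for (;;)
      {
        std::string::size_type sep = data.find(" | ", start);
        std::string form = (sep == std::string::npos)
          ? data.substr(start)
          : data.substr(start, sep - start);

        std::string::size_type sympos = form.find_first_of(",?");
        if (sympos != std::string::npos)
        {
          form = form.substr(0, sympos);
        }

        forms.push_back(form);

        if (sep == std::string::npos)
        {
          break;
        }

        start = sep + 3;
      }

      return forms;
    }

    int main()
    {
      // Prints "abided", "abided", "abiding", "abides".
      for (const std::string& f : splitForms("abided, abode | abided | abiding | abides"))
      {
        std::cout << f << std::endl;
      }
    }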
1395 | // ant table | ||
1396 | { | ||
1397 | std::ifstream wnantfile(wnpref + "wn_ant.pl"); | ||
1398 | if (!wnantfile.is_open()) | ||
1399 | { | ||
1400 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1401 | print_usage(); | ||
1402 | } | ||
1403 | |||
1404 | std::list<std::string> lines; | ||
1405 | for (;;) | ||
1406 | { | ||
1407 | std::string line; | ||
1408 | if (!getline(wnantfile, line)) | ||
1409 | { | ||
1410 | break; | ||
1411 | } | ||
1412 | 487 | ||
1413 | if (line.back() == '\r') | 488 | void generator::readPrepositions() |
1414 | { | ||
1415 | line.pop_back(); | ||
1416 | } | ||
1417 | |||
1418 | lines.push_back(line); | ||
1419 | } | ||
1420 | |||
1421 | progress ppgs("Writing antonyms...", lines.size()); | ||
1422 | for (auto line : lines) | ||
1423 | { | 489 | { |
1424 | ppgs.update(); | 490 | std::list<std::string> lines(readFile("prepositions.txt")); |
491 | progress ppgs("Reading prepositions...", lines.size()); | ||
1425 | 492 | ||
1426 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | 493 | for (std::string line : lines) |
1427 | std::smatch relation_data; | ||
1428 | if (!std::regex_search(line, relation_data, relation)) | ||
1429 | { | ||
1430 | continue; | ||
1431 | } | ||
1432 | |||
1433 | int synset_id_1 = stoi(relation_data[1]); | ||
1434 | int wnum_1 = stoi(relation_data[2]); | ||
1435 | int synset_id_2 = stoi(relation_data[3]); | ||
1436 | int wnum_2 = stoi(relation_data[4]); | ||
1437 | |||
1438 | std::string query; | ||
1439 | switch (synset_id_1 / 100000000) | ||
1440 | { | 494 | { |
1441 | case 1: // Noun | 495 | ppgs.update(); |
1442 | { | ||
1443 | query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
1444 | 496 | ||
1445 | break; | 497 | std::regex relation("^([^:]+): (.+)"); |
1446 | } | 498 | std::smatch relation_data; |
1447 | 499 | std::regex_search(line, relation_data, relation); | |
1448 | case 2: // Verb | 500 | std::string prep = relation_data[1]; |
1449 | { | 501 | auto groups = split<std::list<std::string>>(relation_data[2], ", "); |
1450 | // Ignore | ||
1451 | 502 | ||
1452 | break; | 503 | notion& n = createNotion(part_of_speech::preposition); |
1453 | } | 504 | lemma& l = lookupOrCreateLemma(prep); |
1454 | 505 | word& w = createWord(n, l); | |
1455 | case 3: // Adjective | ||
1456 | { | ||
1457 | query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
1458 | 506 | ||
1459 | break; | 507 | n.setPrepositionGroups(groups); |
1460 | } | ||
1461 | |||
1462 | case 4: // Adverb | ||
1463 | { | ||
1464 | query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
1465 | |||
1466 | break; | ||
1467 | } | ||
1468 | } | ||
1469 | |||
1470 | sqlite3_stmt* ppstmt; | ||
1471 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1472 | { | ||
1473 | db_error(ppdb, query); | ||
1474 | } | ||
1475 | |||
1476 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
1477 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
1478 | |||
1479 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1480 | { | ||
1481 | db_error(ppdb, query); | ||
1482 | } | ||
1483 | |||
1484 | sqlite3_finalize(ppstmt); | ||
1485 | } | ||
1486 | } | ||
1487 | |||
1488 | // at table | ||
1489 | { | ||
1490 | std::ifstream wnatfile(wnpref + "wn_at.pl"); | ||
1491 | if (!wnatfile.is_open()) | ||
1492 | { | ||
1493 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1494 | print_usage(); | ||
1495 | } | ||
1496 | |||
1497 | std::list<std::string> lines; | ||
1498 | for (;;) | ||
1499 | { | ||
1500 | std::string line; | ||
1501 | if (!getline(wnatfile, line)) | ||
1502 | { | ||
1503 | break; | ||
1504 | } | 508 | } |
1505 | |||
1506 | if (line.back() == '\r') | ||
1507 | { | ||
1508 | line.pop_back(); | ||
1509 | } | ||
1510 | |||
1511 | lines.push_back(line); | ||
1512 | } | 509 | } |
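readPrepositions() expects each line of prepositions.txt to be a preposition, a colon, and a comma-separated list of group names. A standalone demonstration of the regex (the input line is hypothetical; the real file's group names may differ):

    #include <iostream>
    #include <regex>
    #include <string>

    int main()
    {
      // Invented sample line in the expected "prep: group, group" shape.
      std::string line = "toward: direction, motion";

      std::regex relation("^([^:]+): (.+)");
      std::smatch relation_data;
      if (std::regex_search(line, relation_data, relation))
      {
        std::cout << "preposition: " << relation_data[1] << std::endl;
        std::cout << "groups: " << relation_data[2] << std::endl;
      }
    }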
1513 | 510 | ||
1514 | progress ppgs("Writing variations...", lines.size()); | 511 | void generator::readCmudictPronunciations() |
1515 | for (auto line : lines) | ||
1516 | { | 512 | { |
1517 | ppgs.update(); | 513 | std::list<std::string> lines(readFile(cmudictPath_)); |
514 | progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); | ||
1518 | 515 | ||
1519 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); | 516 | for (std::string line : lines) |
1520 | std::smatch relation_data; | ||
1521 | if (!std::regex_search(line, relation_data, relation)) | ||
1522 | { | 517 | { |
1523 | continue; | 518 | ppgs.update(); |
1524 | } | 519 | |
1525 | 520 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | |
1526 | int synset_id_1 = stoi(relation_data[1]); | 521 | std::smatch phoneme_data; |
1527 | int synset_id_2 = stoi(relation_data[2]); | 522 | if (std::regex_search(line, phoneme_data, phoneme)) |
1528 | std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); | ||
1529 | |||
1530 | for (auto mapping1 : wn[synset_id_1]) | ||
1531 | { | ||
1532 | for (auto mapping2 : wn[synset_id_2]) | ||
1533 | { | 523 | { |
1534 | sqlite3_stmt* ppstmt; | 524 | std::string canonical(phoneme_data[1]); |
1535 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 525 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); |
1536 | { | ||
1537 | db_error(ppdb, query); | ||
1538 | } | ||
1539 | |||
1540 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
1541 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1542 | 526 | ||
1543 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 527 | if (!formByText_.count(canonical)) |
1544 | { | 528 | { |
1545 | db_error(ppdb, query); | 529 | continue; |
1546 | } | 530 | } |
1547 | 531 | ||
1548 | sqlite3_finalize(ppstmt); | 532 | std::string phonemes = phoneme_data[2]; |
533 | pronunciations_.emplace_back(phonemes); | ||
534 | pronunciation& p = pronunciations_.back(); | ||
535 | formByText_.at(canonical)->addPronunciation(p); | ||
1549 | } | 536 | } |
1550 | } | 537 | } |
1551 | } | 538 | } |
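The CMUDICT regex above accepts a headword, an optional "(n)" alternate-pronunciation marker, and a run of ARPAbet phonemes with stress digits. A quick self-contained check against two CMUdict-style entries:

    #include <iostream>
    #include <regex>
    #include <string>

    int main()
    {
      std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)");

      // "HELLO(1)" marks an alternate pronunciation of the same headword.
      for (std::string line : {"HELLO HH AH0 L OW1", "HELLO(1) HH EH0 L OW1"})
      {
        std::smatch m;
        if (std::regex_search(line, m, phoneme))
        {
          std::cout << m[1] << " -> " << m[2] << std::endl;
        }
      }
    }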
1552 | } | ||
1553 | |||
1554 | // der table | ||
1555 | { | ||
1556 | std::ifstream wnderfile(wnpref + "wn_der.pl"); | ||
1557 | if (!wnderfile.is_open()) | ||
1558 | { | ||
1559 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1560 | print_usage(); | ||
1561 | } | ||
1562 | 539 | ||
1563 | std::list<std::string> lines; | 540 | void generator::writeSchema() |
1564 | for (;;) | ||
1565 | { | 541 | { |
1566 | std::string line; | 542 | std::ifstream file("schema.sql"); |
1567 | if (!getline(wnderfile, line)) | 543 | if (!file) |
1568 | { | 544 | { |
1569 | break; | 545 | throw std::invalid_argument("Could not find database schema"); |
1570 | } | 546 | } |
1571 | 547 | ||
1572 | if (line.back() == '\r') | 548 | std::ostringstream schemaBuilder; |
549 | std::string line; | ||
550 | while (std::getline(file, line)) | ||
1573 | { | 551 | { |
1574 | line.pop_back(); | 552 | if (!line.empty() && line.back() == '\r')
553 | { | ||
554 | line.pop_back(); | ||
555 | } | ||
556 | |||
557 | schemaBuilder << line; | ||
1575 | } | 558 | } |
1576 | 559 | ||
1577 | lines.push_back(line); | 560 | std::string schema = schemaBuilder.str(); |
561 | auto queries = split<std::list<std::string>>(schema, ";"); | ||
562 | progress ppgs("Writing database schema...", queries.size()); | ||
563 | for (std::string query : queries) | ||
564 | { | ||
565 | if (!query.empty()) | ||
566 | { | ||
567 | db_.runQuery(query); | ||
568 | } | ||
569 | |||
570 | ppgs.update(); | ||
571 | } | ||
1578 | } | 572 | } |
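Since writeSchema() concatenates the whole file before splitting on semicolons, the fragment after the final semicolon is empty, which is what the per-statement emptiness guard is for. The same idea in miniature, with std::getline standing in for split():

    #include <iostream>
    #include <sstream>
    #include <string>

    int main()
    {
      // Toy schema; the real input comes from schema.sql.
      std::string schema = "CREATE TABLE a (x);CREATE TABLE b (y);";

      std::istringstream reader(schema);
      std::string query;
      while (std::getline(reader, query, ';'))
      {
        // Mirrors the generator's guard against running empty statements.
        if (!query.empty())
        {
          std::cout << "would run: " << query << ";" << std::endl;
        }
      }
    }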
1579 | 573 | ||
1580 | progress ppgs("Writing morphological derivation...", lines.size()); | 574 | void generator::dumpObjects() |
1581 | for (auto line : lines) | ||
1582 | { | 575 | { |
1583 | ppgs.update(); | ||
1584 | |||
1585 | std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | ||
1586 | std::smatch relation_data; | ||
1587 | if (!std::regex_search(line, relation_data, relation)) | ||
1588 | { | 576 | { |
1589 | continue; | 577 | progress ppgs("Writing notions...", notions_.size()); |
578 | |||
579 | for (notion& n : notions_) | ||
580 | { | ||
581 | db_ << n; | ||
582 | |||
583 | ppgs.update(); | ||
584 | } | ||
1590 | } | 585 | } |
1591 | 586 | ||
1592 | int synset_id_1 = stoi(relation_data[1]); | ||
1593 | int wnum_1 = stoi(relation_data[2]); | ||
1594 | int synset_id_2 = stoi(relation_data[3]); | ||
1595 | int wnum_2 = stoi(relation_data[4]); | ||
1596 | std::string query; | ||
1597 | switch (synset_id_1 / 100000000) | ||
1598 | { | 587 | { |
1599 | case 1: // Noun | 588 | progress ppgs("Writing words...", words_.size()); |
589 | |||
590 | for (word& w : words_) | ||
1600 | { | 591 | { |
1601 | switch (synset_id_2 / 100000000) | 592 | db_ << w; |
1602 | { | ||
1603 | case 1: // Noun | ||
1604 | { | ||
1605 | query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
1606 | break; | ||
1607 | } | ||
1608 | |||
1609 | case 3: // Adjective | ||
1610 | { | ||
1611 | query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)"; | ||
1612 | break; | ||
1613 | } | ||
1614 | |||
1615 | case 4: // Adverb | ||
1616 | { | ||
1617 | query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)"; | ||
1618 | break; | ||
1619 | } | ||
1620 | } | ||
1621 | 593 | ||
1622 | break; | 594 | ppgs.update(); |
1623 | } | 595 | } |
596 | } | ||
597 | |||
598 | { | ||
599 | progress ppgs("Writing lemmas...", lemmas_.size()); | ||
1624 | 600 | ||
1625 | case 3: // Adjective | 601 | for (lemma& l : lemmas_) |
1626 | { | 602 | { |
1627 | switch (synset_id_2 / 100000000) | 603 | db_ << l; |
1628 | { | ||
1629 | case 1: // Noun | ||
1630 | { | ||
1631 | query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)"; | ||
1632 | break; | ||
1633 | } | ||
1634 | |||
1635 | case 3: // Adjective | ||
1636 | { | ||
1637 | query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)"; | ||
1638 | break; | ||
1639 | } | ||
1640 | |||
1641 | case 4: // Adverb | ||
1642 | { | ||
1643 | query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)"; | ||
1644 | break; | ||
1645 | } | ||
1646 | } | ||
1647 | 604 | ||
1648 | break; | 605 | ppgs.update(); |
1649 | } | 606 | } |
607 | } | ||
608 | |||
609 | { | ||
610 | progress ppgs("Writing forms...", forms_.size()); | ||
1650 | 611 | ||
1651 | case 4: // Adverb | 612 | for (form& f : forms_) |
1652 | { | 613 | { |
1653 | switch (synset_id_2 / 100000000) | 614 | db_ << f; |
1654 | { | ||
1655 | case 1: // Noun | ||
1656 | { | ||
1657 | query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)"; | ||
1658 | break; | ||
1659 | } | ||
1660 | |||
1661 | case 3: // Adjective | ||
1662 | { | ||
1663 | query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)"; | ||
1664 | break; | ||
1665 | } | ||
1666 | |||
1667 | case 4: // Adverb | ||
1668 | { | ||
1669 | query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
1670 | break; | ||
1671 | } | ||
1672 | } | ||
1673 | 615 | ||
1674 | break; | 616 | ppgs.update(); |
1675 | } | 617 | } |
1676 | } | 618 | } |
1677 | 619 | ||
1678 | sqlite3_stmt* ppstmt; | ||
1679 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1680 | { | 620 | { |
1681 | db_error(ppdb, query); | 621 | progress ppgs("Writing pronunciations...", pronunciations_.size()); |
622 | |||
623 | for (pronunciation& p : pronunciations_) | ||
624 | { | ||
625 | db_ << p; | ||
626 | |||
627 | ppgs.update(); | ||
628 | } | ||
1682 | } | 629 | } |
1683 | 630 | ||
1684 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
1685 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
1686 | |||
1687 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1688 | { | 631 | { |
1689 | db_error(ppdb, query); | 632 | progress ppgs("Writing verb groups...", groups_.size()); |
633 | |||
634 | for (group& g : groups_) | ||
635 | { | ||
636 | db_ << g; | ||
637 | |||
638 | ppgs.update(); | ||
639 | } | ||
1690 | } | 640 | } |
1691 | 641 | ||
1692 | sqlite3_finalize(ppstmt); | ||
1693 | } | ||
1694 | } | ||
1695 | |||
1696 | // hyp table | ||
1697 | { | ||
1698 | std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); | ||
1699 | if (!wnhypfile.is_open()) | ||
1700 | { | ||
1701 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1702 | print_usage(); | ||
1703 | } | ||
1704 | |||
1705 | std::list<std::string> lines; | ||
1706 | for (;;) | ||
1707 | { | ||
1708 | std::string line; | ||
1709 | if (!getline(wnhypfile, line)) | ||
1710 | { | ||
1711 | break; | ||
1712 | } | ||
1713 | |||
1714 | if (line.back() == '\r') | ||
1715 | { | 642 | { |
1716 | line.pop_back(); | 643 | progress ppgs("Writing verb frames...", frames_.size()); |
644 | |||
645 | for (frame& f : frames_) | ||
646 | { | ||
647 | db_ << f; | ||
648 | |||
649 | ppgs.update(); | ||
650 | } | ||
1717 | } | 651 | } |
1718 | |||
1719 | lines.push_back(line); | ||
1720 | } | 652 | } |
1721 | 653 | ||
1722 | progress ppgs("Writing hypernyms...", lines.size()); | 654 | void generator::readWordNetAntonymy() |
1723 | for (auto line : lines) | ||
1724 | { | 655 | { |
1725 | ppgs.update(); | 656 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); |
1726 | 657 | progress ppgs("Writing antonyms...", lines.size()); | |
1727 | std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); | 658 | for (auto line : lines) |
1728 | std::smatch relation_data; | ||
1729 | if (!std::regex_search(line, relation_data, relation)) | ||
1730 | { | 659 | { |
1731 | continue; | 660 | ppgs.update(); |
1732 | } | ||
1733 | |||
1734 | int synset_id_1 = stoi(relation_data[1]); | ||
1735 | int synset_id_2 = stoi(relation_data[2]); | ||
1736 | std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); | ||
1737 | 661 | ||
1738 | for (auto mapping1 : wn[synset_id_1]) | 662 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); |
1739 | { | 663 | std::smatch relation_data; |
1740 | for (auto mapping2 : wn[synset_id_2]) | 664 | if (!std::regex_search(line, relation_data, relation)) |
1741 | { | 665 | { |
1742 | sqlite3_stmt* ppstmt; | 666 | continue; |
1743 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 667 | } |
1744 | { | 668 | |
1745 | db_error(ppdb, query); | 669 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); |
1746 | } | 670 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); |
1747 | 671 | ||
1748 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 672 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) |
1749 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 673 | { |
674 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
675 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
1750 | 676 | ||
1751 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 677 | std::list<field> fields; |
1752 | { | 678 | fields.emplace_back("antonym_1_id", word1.getId()); |
1753 | db_error(ppdb, query); | 679 | fields.emplace_back("antonym_2_id", word2.getId()); |
1754 | } | ||
1755 | 680 | ||
1756 | sqlite3_finalize(ppstmt); | 681 | db_.insertIntoTable("antonymy", std::move(fields)); |
1757 | } | 682 | } |
1758 | } | 683 | } |
1759 | } | 684 | } |
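All of the WordNet relation readers below follow this same shape: match one prolog fact per line, look up both endpoints in memory, and emit a row only when both exist. A standalone run of the ant pattern (the synset numbers are invented but follow the nine-digit, POS-prefixed convention):

    #include <iostream>
    #include <regex>
    #include <string>

    int main()
    {
      // Invented fact in wn_ant.pl's format: ant(synset, wnum, synset, wnum).
      std::string line = "ant(301234567,1,307654321,2).";

      std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
      std::smatch m;
      if (std::regex_search(line, m, relation))
      {
        std::cout << "synset " << m[1] << " word " << m[2]
                  << " <-> synset " << m[3] << " word " << m[4] << std::endl;
      }
    }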
1760 | } | ||
1761 | |||
1762 | // ins table | ||
1763 | { | ||
1764 | std::ifstream wninsfile(wnpref + "wn_ins.pl"); | ||
1765 | if (!wninsfile.is_open()) | ||
1766 | { | ||
1767 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1768 | print_usage(); | ||
1769 | } | ||
1770 | |||
1771 | std::list<std::string> lines; | ||
1772 | for (;;) | ||
1773 | { | ||
1774 | std::string line; | ||
1775 | if (!getline(wninsfile, line)) | ||
1776 | { | ||
1777 | break; | ||
1778 | } | ||
1779 | 685 | ||
1780 | if (line.back() == '\r') | 686 | void generator::readWordNetVariation() |
687 | { | ||
688 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl")); | ||
689 | progress ppgs("Writing variation...", lines.size()); | ||
690 | for (auto line : lines) | ||
1781 | { | 691 | { |
1782 | line.pop_back(); | 692 | ppgs.update(); |
1783 | } | ||
1784 | 693 | ||
1785 | lines.push_back(line); | 694 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); |
695 | std::smatch relation_data; | ||
696 | if (!std::regex_search(line, relation_data, relation)) | ||
697 | { | ||
698 | continue; | ||
699 | } | ||
700 | |||
701 | int lookup1 = std::stoi(relation_data[1]); | ||
702 | int lookup2 = std::stoi(relation_data[2]); | ||
703 | |||
704 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
705 | { | ||
706 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
707 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
708 | |||
709 | std::list<field> fields; | ||
710 | fields.emplace_back("noun_id", notion1.getId()); | ||
711 | fields.emplace_back("adjective_id", notion2.getId()); | ||
712 | |||
713 | db_.insertIntoTable("variation", std::move(fields)); | ||
714 | } | ||
715 | } | ||
1786 | } | 716 | } |
1787 | 717 | ||
1788 | progress ppgs("Writing instantiations...", lines.size()); | 718 | void generator::readWordNetClasses() |
1789 | for (auto line : lines) | ||
1790 | { | 719 | { |
1791 | ppgs.update(); | 720 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); |
1792 | 721 | progress ppgs("Writing usage, topicality, and regionality...", lines.size()); | |
1793 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); | 722 | for (auto line : lines) |
1794 | std::smatch relation_data; | ||
1795 | if (!std::regex_search(line, relation_data, relation)) | ||
1796 | { | 723 | { |
1797 | continue; | 724 | ppgs.update(); |
1798 | } | ||
1799 | |||
1800 | int synset_id_1 = stoi(relation_data[1]); | ||
1801 | int synset_id_2 = stoi(relation_data[2]); | ||
1802 | std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); | ||
1803 | 725 | ||
1804 | for (auto mapping1 : wn[synset_id_1]) | 726 | std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); |
1805 | { | 727 | std::smatch relation_data; |
1806 | for (auto mapping2 : wn[synset_id_2]) | 728 | if (!std::regex_search(line, relation_data, relation)) |
729 | { | ||
730 | continue; | ||
731 | } | ||
732 | |||
733 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | ||
734 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | ||
735 | std::string class_type = relation_data[5]; | ||
736 | |||
737 | std::string table_name; | ||
738 | if (class_type == "t") | ||
739 | { | ||
740 | table_name += "topicality"; | ||
741 | } else if (class_type == "u") | ||
742 | { | ||
743 | table_name += "usage"; | ||
744 | } else if (class_type == "r") | ||
745 | { | ||
746 | table_name += "regionality"; | ||
747 | } | ||
748 | |||
749 | std::list<int> leftJoin; | ||
750 | std::list<int> rightJoin; | ||
751 | |||
752 | if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) | ||
1807 | { | 753 | { |
1808 | sqlite3_stmt* ppstmt; | 754 | std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { |
1809 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 755 | return w->getId(); |
756 | }); | ||
757 | } else if (wordByWnidAndWnum_.count(lookup1)) { | ||
758 | leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); | ||
759 | } | ||
760 | |||
761 | if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) | ||
762 | { | ||
763 | std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { | ||
764 | return w->getId(); | ||
765 | }); | ||
766 | } else if (wordByWnidAndWnum_.count(lookup2)) { | ||
767 | rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); | ||
768 | } | ||
769 | |||
770 | for (int word1 : leftJoin) | ||
771 | { | ||
772 | for (int word2 : rightJoin) | ||
1810 | { | 773 | { |
1811 | db_error(ppdb, query); | 774 | std::list<field> fields; |
1812 | } | 775 | fields.emplace_back("term_id", word1); |
776 | fields.emplace_back("domain_id", word2); | ||
1813 | 777 | ||
1814 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 778 | db_.insertIntoTable(table_name, std::move(fields)); |
1815 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1816 | |||
1817 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1818 | { | ||
1819 | db_error(ppdb, query); | ||
1820 | } | 779 | } |
1821 | |||
1822 | sqlite3_finalize(ppstmt); | ||
1823 | } | 780 | } |
1824 | } | 781 | } |
1825 | } | 782 | } |
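One subtlety in the cls handling above: a wnum of 0 stands for the whole synset, so a single fact can fan out to the cross product of the words on each side. Reduced to plain ids:

    #include <iostream>
    #include <list>

    int main()
    {
      // Stand-ins for the word ids gathered into leftJoin and rightJoin.
      std::list<int> leftJoin {11, 12};
      std::list<int> rightJoin {20};

      // Two rows come out of one fact: 11 -> 20 and 12 -> 20.
      for (int word1 : leftJoin)
      {
        for (int word2 : rightJoin)
        {
          std::cout << "term " << word1 << " -> domain " << word2 << std::endl;
        }
      }
    }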
1826 | } | ||
1827 | |||
1828 | // mm table | ||
1829 | { | ||
1830 | std::ifstream wnmmfile(wnpref + "wn_mm.pl"); | ||
1831 | if (!wnmmfile.is_open()) | ||
1832 | { | ||
1833 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1834 | print_usage(); | ||
1835 | } | ||
1836 | |||
1837 | std::list<std::string> lines; | ||
1838 | for (;;) | ||
1839 | { | ||
1840 | std::string line; | ||
1841 | if (!getline(wnmmfile, line)) | ||
1842 | { | ||
1843 | break; | ||
1844 | } | ||
1845 | 783 | ||
1846 | if (line.back() == '\r') | 784 | void generator::readWordNetCausality() |
785 | { | ||
786 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl")); | ||
787 | progress ppgs("Writing causality...", lines.size()); | ||
788 | for (auto line : lines) | ||
1847 | { | 789 | { |
1848 | line.pop_back(); | 790 | ppgs.update(); |
1849 | } | ||
1850 | 791 | ||
1851 | lines.push_back(line); | 792 | std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\."); |
793 | std::smatch relation_data; | ||
794 | if (!std::regex_search(line, relation_data, relation)) | ||
795 | { | ||
796 | continue; | ||
797 | } | ||
798 | |||
799 | int lookup1 = std::stoi(relation_data[1]); | ||
800 | int lookup2 = std::stoi(relation_data[2]); | ||
801 | |||
802 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
803 | { | ||
804 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
805 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
806 | |||
807 | std::list<field> fields; | ||
808 | fields.emplace_back("effect_id", notion1.getId()); | ||
809 | fields.emplace_back("cause_id", notion2.getId()); | ||
810 | |||
811 | db_.insertIntoTable("causality", std::move(fields)); | ||
812 | } | ||
813 | } | ||
1852 | } | 814 | } |
1853 | 815 | ||
1854 | progress ppgs("Writing member meronyms...", lines.size()); | 816 | void generator::readWordNetEntailment() |
1855 | for (auto line : lines) | ||
1856 | { | 817 | { |
1857 | ppgs.update(); | 818 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl")); |
1858 | 819 | progress ppgs("Writing entailment...", lines.size()); | |
1859 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); | 820 | for (auto line : lines) |
1860 | std::smatch relation_data; | ||
1861 | if (!std::regex_search(line, relation_data, relation)) | ||
1862 | { | 821 | { |
1863 | continue; | 822 | ppgs.update(); |
1864 | } | ||
1865 | 823 | ||
1866 | int synset_id_1 = stoi(relation_data[1]); | 824 | std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\."); |
1867 | int synset_id_2 = stoi(relation_data[2]); | 825 | std::smatch relation_data; |
1868 | std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | 826 | if (!std::regex_search(line, relation_data, relation)) |
1869 | |||
1870 | for (auto mapping1 : wn[synset_id_1]) | ||
1871 | { | ||
1872 | for (auto mapping2 : wn[synset_id_2]) | ||
1873 | { | 827 | { |
1874 | sqlite3_stmt* ppstmt; | 828 | continue; |
1875 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 829 | } |
1876 | { | 830 | |
1877 | db_error(ppdb, query); | 831 | int lookup1 = std::stoi(relation_data[1]); |
1878 | } | 832 | int lookup2 = std::stoi(relation_data[2]); |
1879 | 833 | ||
1880 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 834 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) |
1881 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 835 | { |
836 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
837 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
1882 | 838 | ||
1883 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 839 | std::list<field> fields; |
1884 | { | 840 | fields.emplace_back("given_id", notion1.getId()); |
1885 | db_error(ppdb, query); | 841 | fields.emplace_back("entailment_id", notion2.getId()); |
1886 | } | ||
1887 | 842 | ||
1888 | sqlite3_finalize(ppstmt); | 843 | db_.insertIntoTable("entailment", std::move(fields)); |
1889 | } | 844 | } |
1890 | } | 845 | } |
1891 | } | 846 | } |
1892 | } | 847 | |
1893 | 848 | void generator::readWordNetHypernymy() | |
1894 | // ms table | ||
1895 | { | ||
1896 | std::ifstream wnmsfile(wnpref + "wn_ms.pl"); | ||
1897 | if (!wnmsfile.is_open()) | ||
1898 | { | ||
1899 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1900 | print_usage(); | ||
1901 | } | ||
1902 | |||
1903 | std::list<std::string> lines; | ||
1904 | for (;;) | ||
1905 | { | 849 | { |
1906 | std::string line; | 850 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl")); |
1907 | if (!getline(wnmsfile, line)) | 851 | progress ppgs("Writing hypernymy...", lines.size()); |
852 | for (auto line : lines) | ||
1908 | { | 853 | { |
1909 | break; | 854 | ppgs.update(); |
855 | |||
856 | std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\."); | ||
857 | std::smatch relation_data; | ||
858 | if (!std::regex_search(line, relation_data, relation)) | ||
859 | { | ||
860 | continue; | ||
861 | } | ||
862 | |||
863 | int lookup1 = std::stoi(relation_data[1]); | ||
864 | int lookup2 = std::stoi(relation_data[2]); | ||
865 | |||
866 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
867 | { | ||
868 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
869 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
870 | |||
871 | std::list<field> fields; | ||
872 | fields.emplace_back("hyponym_id", notion1.getId()); | ||
873 | fields.emplace_back("hypernym_id", notion2.getId()); | ||
874 | |||
875 | db_.insertIntoTable("hypernymy", std::move(fields)); | ||
876 | } | ||
1910 | } | 877 | } |
878 | } | ||
1911 | 879 | ||
1912 | if (line.back() == '\r') | 880 | void generator::readWordNetInstantiation() |
881 | { | ||
882 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl")); | ||
883 | progress ppgs("Writing instantiation...", lines.size()); | ||
884 | for (auto line : lines) | ||
1913 | { | 885 | { |
1914 | line.pop_back(); | 886 | ppgs.update(); |
1915 | } | ||
1916 | 887 | ||
1917 | lines.push_back(line); | 888 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); |
889 | std::smatch relation_data; | ||
890 | if (!std::regex_search(line, relation_data, relation)) | ||
891 | { | ||
892 | continue; | ||
893 | } | ||
894 | |||
895 | int lookup1 = std::stoi(relation_data[1]); | ||
896 | int lookup2 = std::stoi(relation_data[2]); | ||
897 | |||
898 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
899 | { | ||
900 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
901 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
902 | |||
903 | std::list<field> fields; | ||
904 | fields.emplace_back("instance_id", notion1.getId()); | ||
905 | fields.emplace_back("class_id", notion2.getId()); | ||
906 | |||
907 | db_.insertIntoTable("instantiation", std::move(fields)); | ||
908 | } | ||
909 | } | ||
1918 | } | 910 | } |
1919 | 911 | ||
1920 | progress ppgs("Writing substance meronyms...", lines.size()); | 912 | void generator::readWordNetMemberMeronymy() |
1921 | for (auto line : lines) | ||
1922 | { | 913 | { |
1923 | ppgs.update(); | 914 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl")); |
1924 | 915 | progress ppgs("Writing member meronymy...", lines.size()); | |
1925 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); | 916 | for (auto line : lines) |
1926 | std::smatch relation_data; | ||
1927 | if (!std::regex_search(line, relation_data, relation)) | ||
1928 | { | 917 | { |
1929 | continue; | 918 | ppgs.update(); |
1930 | } | ||
1931 | |||
1932 | int synset_id_1 = stoi(relation_data[1]); | ||
1933 | int synset_id_2 = stoi(relation_data[2]); | ||
1934 | std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
1935 | 919 | ||
1936 | for (auto mapping1 : wn[synset_id_1]) | 920 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); |
1937 | { | 921 | std::smatch relation_data; |
1938 | for (auto mapping2 : wn[synset_id_2]) | 922 | if (!std::regex_search(line, relation_data, relation)) |
1939 | { | 923 | { |
1940 | sqlite3_stmt* ppstmt; | 924 | continue; |
1941 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 925 | } |
1942 | { | 926 | |
1943 | db_error(ppdb, query); | 927 | int lookup1 = std::stoi(relation_data[1]); |
1944 | } | 928 | int lookup2 = std::stoi(relation_data[2]); |
929 | |||
930 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
931 | { | ||
932 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
933 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
1945 | 934 | ||
1946 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 935 | std::list<field> fields; |
1947 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 936 | fields.emplace_back("holonym_id", notion1.getId()); |
937 | fields.emplace_back("meronym_id", notion2.getId()); | ||
1948 | 938 | ||
1949 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 939 | db_.insertIntoTable("member_meronymy", std::move(fields)); |
1950 | { | ||
1951 | db_error(ppdb, query); | ||
1952 | } | ||
1953 | |||
1954 | sqlite3_finalize(ppstmt); | ||
1955 | } | 940 | } |
1956 | } | 941 | } |
1957 | } | 942 | } |
1958 | } | 943 | |
1959 | 944 | void generator::readWordNetPartMeronymy() | |
1960 | // mm table | ||
1961 | { | ||
1962 | std::ifstream wnmpfile(wnpref + "wn_mp.pl"); | ||
1963 | if (!wnmpfile.is_open()) | ||
1964 | { | ||
1965 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1966 | print_usage(); | ||
1967 | } | ||
1968 | |||
1969 | std::list<std::string> lines; | ||
1970 | for (;;) | ||
1971 | { | 945 | { |
1972 | std::string line; | 946 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl")); |
1973 | if (!getline(wnmpfile, line)) | 947 | progress ppgs("Writing part meronymy...", lines.size()); |
948 | for (auto line : lines) | ||
1974 | { | 949 | { |
1975 | break; | 950 | ppgs.update(); |
951 | |||
952 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | ||
953 | std::smatch relation_data; | ||
954 | if (!std::regex_search(line, relation_data, relation)) | ||
955 | { | ||
956 | continue; | ||
957 | } | ||
958 | |||
959 | int lookup1 = std::stoi(relation_data[1]); | ||
960 | int lookup2 = std::stoi(relation_data[2]); | ||
961 | |||
962 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
963 | { | ||
964 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
965 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
966 | |||
967 | std::list<field> fields; | ||
968 | fields.emplace_back("holonym_id", notion1.getId()); | ||
969 | fields.emplace_back("meronym_id", notion2.getId()); | ||
970 | |||
971 | db_.insertIntoTable("part_meronymy", std::move(fields)); | ||
972 | } | ||
1976 | } | 973 | } |
974 | } | ||
1977 | 975 | ||
1978 | if (line.back() == '\r') | 976 | void generator::readWordNetSubstanceMeronymy() |
977 | { | ||
978 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl")); | ||
979 | progress ppgs("Writing substance meronymy...", lines.size()); | ||
980 | for (auto line : lines) | ||
1979 | { | 981 | { |
1980 | line.pop_back(); | 982 | ppgs.update(); |
1981 | } | ||
1982 | 983 | ||
1983 | lines.push_back(line); | 984 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); |
985 | std::smatch relation_data; | ||
986 | if (!std::regex_search(line, relation_data, relation)) | ||
987 | { | ||
988 | continue; | ||
989 | } | ||
990 | |||
991 | int lookup1 = std::stoi(relation_data[1]); | ||
992 | int lookup2 = std::stoi(relation_data[2]); | ||
993 | |||
994 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
995 | { | ||
996 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
997 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
998 | |||
999 | std::list<field> fields; | ||
1000 | fields.emplace_back("holonym_id", notion1.getId()); | ||
1001 | fields.emplace_back("meronym_id", notion2.getId()); | ||
1002 | |||
1003 | db_.insertIntoTable("substance_meronymy", std::move(fields)); | ||
1004 | } | ||
1005 | } | ||
1984 | } | 1006 | } |
1985 | 1007 | ||
1986 | progress ppgs("Writing part meronyms...", lines.size()); | 1008 | void generator::readWordNetPertainymy() |
1987 | for (auto line : lines) | ||
1988 | { | 1009 | { |
1989 | ppgs.update(); | 1010 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); |
1990 | 1011 | progress ppgs("Writing pertainymy and mannernymy...", lines.size()); | |
1991 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | 1012 | for (auto line : lines) |
1992 | std::smatch relation_data; | ||
1993 | if (!std::regex_search(line, relation_data, relation)) | ||
1994 | { | 1013 | { |
1995 | continue; | 1014 | ppgs.update(); |
1996 | } | ||
1997 | |||
1998 | int synset_id_1 = stoi(relation_data[1]); | ||
1999 | int synset_id_2 = stoi(relation_data[2]); | ||
2000 | std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
2001 | 1015 | ||
2002 | for (auto mapping1 : wn[synset_id_1]) | 1016 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); |
2003 | { | 1017 | std::smatch relation_data; |
2004 | for (auto mapping2 : wn[synset_id_2]) | 1018 | if (!std::regex_search(line, relation_data, relation)) |
2005 | { | 1019 | { |
2006 | sqlite3_stmt* ppstmt; | 1020 | continue; |
2007 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 1021 | } |
2008 | { | 1022 | |
2009 | db_error(ppdb, query); | 1023 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); |
2010 | } | 1024 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); |
1025 | |||
1026 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | ||
1027 | { | ||
1028 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
1029 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
2011 | 1030 | ||
2012 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 1031 | if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) |
2013 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 1032 | { |
1033 | std::list<field> fields; | ||
1034 | fields.emplace_back("pertainym_id", word1.getId()); | ||
1035 | fields.emplace_back("noun_id", word2.getId()); | ||
2014 | 1036 | ||
2015 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1037 | db_.insertIntoTable("pertainymy", std::move(fields)); |
1038 | } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) | ||
2016 | { | 1039 | { |
2017 | db_error(ppdb, query); | 1040 | std::list<field> fields; |
2018 | } | 1041 | fields.emplace_back("mannernym_id", word1.getId()); |
1042 | fields.emplace_back("adjective_id", word2.getId()); | ||
2019 | 1043 | ||
2020 | sqlite3_finalize(ppstmt); | 1044 | db_.insertIntoTable("mannernymy", std::move(fields)); |
1045 | } | ||
2021 | } | 1046 | } |
2022 | } | 1047 | } |
2023 | } | 1048 | } |
2024 | } | ||
2025 | |||
2026 | // per table | ||
2027 | { | ||
2028 | std::ifstream wnperfile(wnpref + "wn_per.pl"); | ||
2029 | if (!wnperfile.is_open()) | ||
2030 | { | ||
2031 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2032 | print_usage(); | ||
2033 | } | ||
2034 | |||
2035 | std::list<std::string> lines; | ||
2036 | for (;;) | ||
2037 | { | ||
2038 | std::string line; | ||
2039 | if (!getline(wnperfile, line)) | ||
2040 | { | ||
2041 | break; | ||
2042 | } | ||
2043 | 1049 | ||
2044 | if (line.back() == '\r') | 1050 | void generator::readWordNetSpecification() |
1051 | { | ||
1052 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl")); | ||
1053 | progress ppgs("Writing specifications...", lines.size()); | ||
1054 | for (auto line : lines) | ||
2045 | { | 1055 | { |
2046 | line.pop_back(); | 1056 | ppgs.update(); |
1057 | |||
1058 | std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\."); | ||
1059 | std::smatch relation_data; | ||
1060 | if (!std::regex_search(line, relation_data, relation)) | ||
1061 | { | ||
1062 | continue; | ||
1063 | } | ||
1064 | |||
1065 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | ||
1066 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | ||
1067 | |||
1068 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | ||
1069 | { | ||
1070 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
1071 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
1072 | |||
1073 | std::list<field> fields; | ||
1074 | fields.emplace_back("general_id", word1.getId()); | ||
1075 | fields.emplace_back("specific_id", word2.getId()); | ||
1076 | |||
1077 | db_.insertIntoTable("specification", std::move(fields)); | ||
1078 | } | ||
2047 | } | 1079 | } |
2048 | |||
2049 | lines.push_back(line); | ||
2050 | } | 1080 | } |
2051 | 1081 | ||
2052 | progress ppgs("Writing pertainyms and mannernyms...", lines.size()); | 1082 | void generator::readWordNetSimilarity() |
2053 | for (auto line : lines) | ||
2054 | { | 1083 | { |
2055 | ppgs.update(); | 1084 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl")); |
2056 | 1085 | progress ppgs("Writing adjective similarity...", lines.size()); | |
2057 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); | 1086 | for (auto line : lines) |
2058 | std::smatch relation_data; | ||
2059 | if (!std::regex_search(line, relation_data, relation)) | ||
2060 | { | 1087 | { |
2061 | continue; | 1088 | ppgs.update(); |
2062 | } | ||
2063 | 1089 | ||
2064 | int synset_id_1 = stoi(relation_data[1]); | 1090 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); |
2065 | int wnum_1 = stoi(relation_data[2]); | 1091 | std::smatch relation_data; |
2066 | int synset_id_2 = stoi(relation_data[3]); | 1092 | if (!std::regex_search(line, relation_data, relation)) |
2067 | int wnum_2 = stoi(relation_data[4]); | ||
2068 | std::string query; | ||
2069 | switch (synset_id_1 / 100000000) | ||
2070 | { | ||
2071 | case 3: // Adjective | ||
2072 | { | 1093 | { |
2073 | // This is a pertainym, the second word should be a noun | 1094 | continue; |
2074 | // Technically it can be an adjective but we're ignoring that | ||
2075 | if (synset_id_2 / 100000000 != 1) | ||
2076 | { | ||
2077 | continue; | ||
2078 | } | ||
2079 | |||
2080 | query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; | ||
2081 | |||
2082 | break; | ||
2083 | } | 1095 | } |
1096 | |||
1097 | int lookup1 = std::stoi(relation_data[1]); | ||
1098 | int lookup2 = std::stoi(relation_data[2]); | ||
2084 | 1099 | ||
2085 | case 4: // Adverb | 1100 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) |
2086 | { | 1101 | { |
2087 | // This is a mannernym, the second word should be an adjective | 1102 | notion& notion1 = *notionByWnid_.at(lookup1); |
2088 | if (synset_id_2 / 100000000 != 3) | 1103 | notion& notion2 = *notionByWnid_.at(lookup2); |
2089 | { | ||
2090 | continue; | ||
2091 | } | ||
2092 | 1104 | ||
2093 | query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; | 1105 | std::list<field> fields; |
1106 | fields.emplace_back("adjective_1_id", notion1.getId()); | ||
1107 | fields.emplace_back("adjective_2_id", notion2.getId()); | ||
2094 | 1108 | ||
2095 | break; | 1109 | db_.insertIntoTable("similarity", std::move(fields)); |
2096 | } | 1110 | } |
2097 | } | 1111 | } |
2098 | 1112 | } | |
2099 | sqlite3_stmt* ppstmt; | ||
2100 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
2101 | { | ||
2102 | db_error(ppdb, query); | ||
2103 | } | ||
2104 | |||
2105 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
2106 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
2107 | 1113 | ||
2108 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1114 | std::list<std::string> generator::readFile(std::string path) |
1115 | { | ||
1116 | std::ifstream file(path); | ||
1117 | if (!file) | ||
2109 | { | 1118 | { |
2110 | db_error(ppdb, query); | 1119 | throw std::invalid_argument("Could not find file " + path); |
2111 | } | 1120 | } |
2112 | |||
2113 | sqlite3_finalize(ppstmt); | ||
2114 | } | ||
2115 | } | ||
2116 | 1121 | ||
2117 | // sa table | 1122 | std::list<std::string> lines; |
2118 | { | ||
2119 | std::ifstream wnsafile(wnpref + "wn_sa.pl"); | ||
2120 | if (!wnsafile.is_open()) | ||
2121 | { | ||
2122 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2123 | print_usage(); | ||
2124 | } | ||
2125 | |||
2126 | std::list<std::string> lines; | ||
2127 | for (;;) | ||
2128 | { | ||
2129 | std::string line; | 1123 | std::string line; |
2130 | if (!getline(wnsafile, line)) | 1124 | while (std::getline(file, line)) |
2131 | { | ||
2132 | break; | ||
2133 | } | ||
2134 | |||
2135 | if (line.back() == '\r') | ||
2136 | { | 1125 | { |
2137 | line.pop_back(); | 1126 | if (!line.empty() && line.back() == '\r')
1127 | { | ||
1128 | line.pop_back(); | ||
1129 | } | ||
1130 | |||
1131 | lines.push_back(line); | ||
2138 | } | 1132 | } |
2139 | 1133 | ||
2140 | lines.push_back(line); | 1134 | return lines; |
2141 | } | 1135 | } |
2142 | 1136 | ||
2143 | progress ppgs("Writing specifications...", lines.size()); | 1137 | part_of_speech generator::partOfSpeechByWnid(int wnid) |
2144 | for (auto line : lines) | ||
2145 | { | 1138 | { |
2146 | ppgs.update(); | 1139 | switch (wnid / 100000000) |
2147 | |||
2148 | std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); | ||
2149 | std::smatch relation_data; | ||
2150 | if (!std::regex_search(line, relation_data, relation)) | ||
2151 | { | ||
2152 | continue; | ||
2153 | } | ||
2154 | |||
2155 | int synset_id_1 = stoi(relation_data[1]); | ||
2156 | int wnum_1 = stoi(relation_data[2]); | ||
2157 | int synset_id_2 = stoi(relation_data[3]); | ||
2158 | int wnum_2 = stoi(relation_data[4]); | ||
2159 | std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); | ||
2160 | |||
2161 | sqlite3_stmt* ppstmt; | ||
2162 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
2163 | { | 1140 | { |
2164 | db_error(ppdb, query); | 1141 | case 1: return part_of_speech::noun; |
1142 | case 2: return part_of_speech::verb; | ||
1143 | case 3: return part_of_speech::adjective; | ||
1144 | case 4: return part_of_speech::adverb; | ||
1145 | default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); | ||
2165 | } | 1146 | } |
1147 | } | ||
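partOfSpeechByWnid() encodes the WordNet convention that the leading digit of a nine-digit synset id is the part of speech. A two-assertion sanity check:

    #include <cassert>

    int main()
    {
      // 1 = noun, 2 = verb, 3 = adjective, 4 = adverb.
      assert(112345678 / 100000000 == 1);
      assert(301234567 / 100000000 == 3);
    }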
2166 | 1148 | ||
2167 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | 1149 | notion& generator::createNotion(part_of_speech partOfSpeech) |
2168 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | 1150 | { |
1151 | notions_.emplace_back(partOfSpeech); | ||
1152 | |||
1153 | return notions_.back(); | ||
1154 | } | ||
2169 | 1155 | ||
2170 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1156 | notion& generator::lookupOrCreateNotion(int wnid) |
1157 | { | ||
1158 | if (!notionByWnid_.count(wnid)) | ||
2171 | { | 1159 | { |
2172 | db_error(ppdb, query); | 1160 | notions_.emplace_back(partOfSpeechByWnid(wnid), wnid); |
1161 | notionByWnid_[wnid] = ¬ions_.back(); | ||
2173 | } | 1162 | } |
2174 | 1163 | ||
2175 | sqlite3_finalize(ppstmt); | 1164 | return *notionByWnid_.at(wnid); |
2176 | } | ||
2177 | } | ||
2178 | |||
2179 | // sim table | ||
2180 | { | ||
2181 | std::ifstream wnsimfile(wnpref + "wn_sim.pl"); | ||
2182 | if (!wnsimfile.is_open()) | ||
2183 | { | ||
2184 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2185 | print_usage(); | ||
2186 | } | 1165 | } |
2187 | 1166 | ||
2188 | std::list<std::string> lines; | 1167 | lemma& generator::lookupOrCreateLemma(std::string base_form) |
2189 | for (;;) | ||
2190 | { | 1168 | { |
2191 | std::string line; | 1169 | if (!lemmaByBaseForm_.count(base_form)) |
2192 | if (!getline(wnsimfile, line)) | ||
2193 | { | 1170 | { |
2194 | break; | 1171 | lemmas_.emplace_back(lookupOrCreateForm(base_form)); |
1172 | lemmaByBaseForm_[base_form] = &lemmas_.back(); | ||
2195 | } | 1173 | } |
1174 | |||
1175 | return *lemmaByBaseForm_.at(base_form); | ||
1176 | } | ||
2196 | 1177 | ||
2197 | if (line.back() == '\r') | 1178 | form& generator::lookupOrCreateForm(std::string text) |
1179 | { | ||
1180 | if (!formByText_.count(text)) | ||
2198 | { | 1181 | { |
2199 | line.pop_back(); | 1182 | forms_.emplace_back(text); |
1183 | formByText_[text] = &forms_.back(); | ||
2200 | } | 1184 | } |
2201 | 1185 | ||
2202 | lines.push_back(line); | 1186 | return *formByText_[text]; |
2203 | } | 1187 | } |
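lookupOrCreateNotion(), lookupOrCreateLemma(), and lookupOrCreateForm() all share one pattern: the objects live in a std::list, whose elements keep stable addresses as the list grows, and a map indexes them by key. The same pattern reduced to strings:

    #include <iostream>
    #include <list>
    #include <map>
    #include <string>

    std::list<std::string> storage;
    std::map<std::string, std::string*> byKey;

    std::string& lookupOrCreate(const std::string& key)
    {
      if (!byKey.count(key))
      {
        // std::list never relocates elements, so this pointer stays valid.
        storage.emplace_back(key);
        byKey[key] = &storage.back();
      }

      return *byKey.at(key);
    }

    int main()
    {
      std::string& a = lookupOrCreate("walk");
      std::string& b = lookupOrCreate("walk");

      std::cout << (&a == &b) << std::endl; // prints 1: both lookups hit one object
    }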
2204 | 1188 | ||
2205 | progress ppgs("Writing sense synonyms...", lines.size()); | 1189 | template <typename... Args> word& generator::createWord(Args&&... args) |
2206 | for (auto line : lines) | ||
2207 | { | 1190 | { |
2208 | ppgs.update(); | 1191 | words_.emplace_back(std::forward<Args>(args)...); |
1192 | word& w = words_.back(); | ||
2209 | 1193 | ||
2210 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); | 1194 | wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w); |
2211 | std::smatch relation_data; | 1195 | |
2212 | if (!std::regex_search(line, relation_data, relation)) | 1196 | if (w.getNotion().hasWnid()) |
2213 | { | 1197 | { |
2214 | continue; | 1198 | wordsByWnid_[w.getNotion().getWnid()].insert(&w); |
2215 | } | 1199 | } |
2216 | 1200 | ||
2217 | int synset_id_1 = stoi(relation_data[1]); | 1201 | return w; |
2218 | int synset_id_2 = stoi(relation_data[2]); | 1202 | } |
2219 | std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); | 1203 | |
1204 | group& generator::createGroup(xmlNodePtr top) | ||
1205 | { | ||
1206 | groups_.emplace_back(); | ||
1207 | group& grp = groups_.back(); | ||
2220 | 1208 | ||
2221 | for (auto mapping1 : wn[synset_id_1]) | 1209 | xmlChar* key; |
1210 | |||
1211 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
2222 | { | 1212 | { |
2223 | for (auto mapping2 : wn[synset_id_2]) | 1213 | if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES"))) |
2224 | { | 1214 | { |
2225 | sqlite3_stmt* ppstmt; | 1215 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) |
2226 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
2227 | { | 1216 | { |
2228 | db_error(ppdb, query); | 1217 | if (!xmlStrcmp(subclass->name, reinterpret_cast<const xmlChar*>("VNSUBCLASS"))) |
1218 | { | ||
1219 | try | ||
1220 | { | ||
1221 | group& subgrp = createGroup(subclass); | ||
1222 | subgrp.setParent(grp); | ||
1223 | } catch (const std::exception& e) | ||
1224 | { | ||
1225 | key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID")); | ||
1226 | |||
1227 | if (key == nullptr) | ||
1228 | { | ||
1229 | std::throw_with_nested(std::logic_error("Error parsing IDless subgroup")); | ||
1230 | } else { | ||
1231 | std::string subgroupId(reinterpret_cast<const char*>(key)); | ||
1232 | xmlFree(key); | ||
1233 | |||
1234 | std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId)); | ||
1235 | } | ||
1236 | } | ||
1237 | } | ||
2229 | } | 1238 | } |
2230 | 1239 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("MEMBERS"))) | |
2231 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 1240 | { |
2232 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 1241 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) |
2233 | |||
2234 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
2235 | { | 1242 | { |
2236 | db_error(ppdb, query); | 1243 | if (!xmlStrcmp(member->name, reinterpret_cast<const xmlChar*>("MEMBER"))) |
1244 | { | ||
1245 | key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn")); | ||
1246 | std::string wnSenses(reinterpret_cast<const char*>(key)); | ||
1247 | xmlFree(key); | ||
1248 | |||
1249 | auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " "); | ||
1250 | if (!wnSenseKeys.empty()) | ||
1251 | { | ||
1252 | std::list<std::string> tempKeys; | ||
1253 | |||
1254 | std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { | ||
1255 | return sense + "::"; | ||
1256 | }); | ||
1257 | |||
1258 | std::list<std::string> filteredKeys; | ||
1259 | |||
1260 | std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { | ||
1261 | return !wnSenseKeys_.count(sense); | ||
1262 | }); | ||
1263 | |||
1264 | wnSenseKeys = std::move(filteredKeys); | ||
1265 | } | ||
1266 | |||
1267 | if (!wnSenseKeys.empty()) | ||
1268 | { | ||
1269 | for (std::string sense : wnSenseKeys) | ||
1270 | { | ||
1271 | word& wordSense = *wnSenseKeys_[sense]; | ||
1272 | wordSense.setVerbGroup(grp); | ||
1273 | } | ||
1274 | } else { | ||
1275 | key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name")); | ||
1276 | std::string memberName(reinterpret_cast<const char*>(key)); | ||
1277 | xmlFree(key); | ||
1278 | |||
1279 | notion& n = createNotion(part_of_speech::verb); | ||
1280 | lemma& l = lookupOrCreateLemma(memberName); | ||
1281 | word& w = createWord(n, l); | ||
1282 | |||
1283 | w.setVerbGroup(grp); | ||
1284 | } | ||
1285 | } | ||
2237 | } | 1286 | } |
2238 | 1287 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("THEMROLES"))) | |
2239 | sqlite3_reset(ppstmt); | 1288 | { |
2240 | sqlite3_clear_bindings(ppstmt); | 1289 | for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next) |
2241 | |||
2242 | sqlite3_bind_int(ppstmt, 1, mapping2.second); | ||
2243 | sqlite3_bind_int(ppstmt, 2, mapping1.second); | ||
2244 | |||
2245 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
2246 | { | 1290 | { |
2247 | db_error(ppdb, query); | 1291 | if (!xmlStrcmp(roletopnode->name, reinterpret_cast<const xmlChar*>("THEMROLE"))) |
1292 | { | ||
1293 | role r; | ||
1294 | |||
1295 | key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type")); | ||
1296 | std::string roleName = reinterpret_cast<const char*>(key); | ||
1297 | xmlFree(key); | ||
1298 | |||
1299 | for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
1300 | { | ||
1301 | if (!xmlStrcmp(rolenode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1302 | { | ||
1303 | r.setSelrestrs(parseSelrestr(rolenode)); | ||
1304 | } | ||
1305 | } | ||
1306 | |||
1307 | grp.addRole(roleName, std::move(r)); | ||
1308 | } | ||
2248 | } | 1309 | } |
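Each THEMROLE contributes one named thematic role to the group: the "type" attribute supplies the name, and the SELRESTRS child, when present, supplies the role's selectional restrictions. A rough sketch of the bookkeeping this implies, assuming the group keys its roles by name (role_sketch and group_sketch are hypothetical stand-ins for the classes declared in role.h):

    #include <map>
    #include <string>
    #include <utility>

    struct role_sketch
    {
      // selectional restrictions omitted for brevity
    };

    struct group_sketch
    {
      std::map<std::string, role_sketch> roles;

      // Mirrors grp.addRole(roleName, std::move(r)) above.
      void addRole(const std::string& name, role_sketch r)
      {
        roles[name] = std::move(r);
      }
    };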
1310 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("FRAMES"))) | ||
1311 | { | ||
1312 | for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next) | ||
1313 | { | ||
1314 | if (!xmlStrcmp(frametopnode->name, reinterpret_cast<const xmlChar*>("FRAME"))) | ||
1315 | { | ||
1316 | frames_.emplace_back(); | ||
1317 | frame& fr = frames_.back(); | ||
2249 | 1318 | ||
2250 | sqlite3_finalize(ppstmt); | 1319 | for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) |
1320 | { | ||
1321 | if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX"))) | ||
1322 | { | ||
1323 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
1324 | { | ||
1325 | if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP"))) | ||
1326 | { | ||
1327 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1328 | std::string partRole = reinterpret_cast<const char*>(key); | ||
1329 | xmlFree(key); | ||
1330 | |||
1331 | selrestr partSelrestrs; | ||
1332 | std::set<std::string> partSynrestrs; | ||
1333 | |||
1334 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
1335 | { | ||
1336 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SYNRESTRS"))) | ||
1337 | { | ||
1338 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
1339 | { | ||
1340 | if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SYNRESTR"))) | ||
1341 | { | ||
1342 | key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type")); | ||
1343 | partSynrestrs.insert(reinterpret_cast<const char*>(key)); | ||
1344 | xmlFree(key); | ||
1345 | } | ||
1346 | } | ||
1347 | } | ||
1348 | |||
1349 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1350 | { | ||
1351 | partSelrestrs = parseSelrestr(npnode); | ||
1352 | } | ||
1353 | } | ||
1354 | |||
1355 | fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs))); | ||
1356 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB"))) | ||
1357 | { | ||
1358 | fr.push_back(part::createVerb()); | ||
1359 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("PREP"))) | ||
1360 | { | ||
1361 | std::set<std::string> partChoices; | ||
1362 | bool partLiteral; | ||
1363 | |||
1364 | if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"))) | ||
1365 | { | ||
1366 | partLiteral = true; | ||
1367 | |||
1368 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1369 | std::string choicesStr = reinterpret_cast<const char*>(key); | ||
1370 | xmlFree(key); | ||
1371 | |||
1372 | split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices))); | ||
1373 | } else { | ||
1374 | partLiteral = false; | ||
1375 | |||
1376 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
1377 | { | ||
1378 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1379 | { | ||
1380 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
1381 | { | ||
1382 | if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
1383 | { | ||
1384 | key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type")); | ||
1385 | partChoices.insert(reinterpret_cast<const char*>(key)); | ||
1386 | xmlFree(key); | ||
1387 | } | ||
1388 | } | ||
1389 | } | ||
1390 | } | ||
1391 | } | ||
1392 | |||
1393 | fr.push_back(part::createPreposition(std::move(partChoices), partLiteral)); | ||
1394 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ"))) | ||
1395 | { | ||
1396 | fr.push_back(part::createAdjective()); | ||
1397 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADV"))) | ||
1398 | { | ||
1399 | fr.push_back(part::createAdverb()); | ||
1400 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("LEX"))) | ||
1401 | { | ||
1402 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1403 | std::string literalValue = reinterpret_cast<const char*>(key); | ||
1404 | xmlFree(key); | ||
1405 | |||
1406 | fr.push_back(part::createLiteral(literalValue)); | ||
1407 | } else { | ||
1408 | continue; | ||
1409 | } | ||
1410 | } | ||
1411 | |||
1412 | grp.addFrame(fr); | ||
1413 | } | ||
1414 | } | ||
1415 | } | ||
1416 | } | ||
2251 | } | 1417 | } |
2252 | } | 1418 | } |
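The FRAMES branch above reduces every SYNTAX child to a part value built through a named factory (createNounPhrase, createVerb, createPreposition, createAdjective, createAdverb, createLiteral), which lets one frame hold heterogeneous syntax elements in a single sequence. A condensed sketch of that factory pattern, assuming a tagged value type with a private constructor (part_sketch is a hypothetical stand-in for the class in part.h):

    #include <set>
    #include <string>
    #include <utility>

    class part_sketch
    {
    public:
      enum class type { noun_phrase, verb, preposition, adjective, adverb, literal };

      // Named factories are the only way to construct a value, so every
      // instance carries data consistent with its tag.
      static part_sketch createVerb()
      {
        return part_sketch(type::verb);
      }

      static part_sketch createPreposition(std::set<std::string> choices, bool literal)
      {
        part_sketch result(type::preposition);
        result.choices_ = std::move(choices);
        result.literal_ = literal;
        return result;
      }

      type getType() const { return type_; }

    private:
      explicit part_sketch(type t) : type_(t) {}

      type type_;
      std::set<std::string> choices_;
      bool literal_ = false;
    };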
2253 | } | ||
2254 | } | ||
2255 | |||
2256 | // syntax table | ||
2257 | { | ||
2258 | std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); | ||
2259 | if (!wnsyntaxfile.is_open()) | ||
2260 | { | ||
2261 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2262 | print_usage(); | ||
2263 | } | ||
2264 | 1419 | ||
2265 | std::list<std::string> lines; | 1420 | return grp; |
2266 | for (;;) | ||
2267 | { | ||
2268 | std::string line; | ||
2269 | if (!getline(wnsyntaxfile, line)) | ||
2270 | { | ||
2271 | break; | ||
2272 | } | ||
2273 | |||
2274 | if (line.back() == '\r') | ||
2275 | { | ||
2276 | line.pop_back(); | ||
2277 | } | ||
2278 | |||
2279 | lines.push_back(line); | ||
2280 | } | 1421 | } |
2281 | 1422 | ||
2282 | progress ppgs("Writing adjective syntax markers...", lines.size()); | 1423 | selrestr generator::parseSelrestr(xmlNodePtr top) |
2283 | for (auto line : lines) | ||
2284 | { | 1424 | { |
2285 | ppgs.update(); | 1425 | xmlChar* key; |
2286 | 1426 | ||
2287 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); | 1427 | if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) |
2288 | std::smatch relation_data; | ||
2289 | if (!std::regex_search(line, relation_data, relation)) | ||
2290 | { | ||
2291 | continue; | ||
2292 | } | ||
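For reference, each line of wn_syntax.pl consumed by the removed code above is a Prolog fact such as syntax(301234567,2,ip). (the synset id here is invented): the leading 3 restricts the match to adjective synsets, and the position marker is a (attributive), p (predicative), or ip (immediately postnominal), of which the regex keeps only the first letter. A self-contained demonstration:

    #include <iostream>
    #include <regex>
    #include <string>

    int main()
    {
      // Same pattern as above: adjective synset id, word number, position.
      std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\.");
      std::string line = "syntax(301234567,2,ip).";

      std::smatch m;
      if (std::regex_search(line, m, relation))
      {
        // Prints: synset 301234567, word 2, position i
        std::cout << "synset " << m[1] << ", word " << m[2]
                  << ", position " << m[3] << std::endl;
      }

      return 0;
    }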
2293 | |||
2294 | int synset_id = stoi(relation_data[1]); | ||
2295 | int wnum = stoi(relation_data[2]); | ||
2296 | std::string syn = relation_data[3]; | ||
2297 | std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); | ||
2298 | |||
2299 | sqlite3_stmt* ppstmt; | ||
2300 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
2301 | { | 1428 | { |
2302 | db_error(ppdb, query); | 1429 | if (xmlChildElementCount(top) == 0) |
2303 | } | 1430 | { |
2304 | 1431 | return {}; | |
2305 | sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); | 1432 | } else if (xmlChildElementCount(top) == 1) |
2306 | sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); | 1433 | { |
2307 | 1434 | return parseSelrestr(xmlFirstElementChild(top)); | |
2308 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1435 | } else { |
1436 | bool orlogic = false; | ||
1437 | if (xmlHasProp(top, reinterpret_cast<const xmlChar*>("logic"))) | ||
1438 | { | ||
1439 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("logic")); | ||
1440 | if (!xmlStrcmp(key, reinterpret_cast<const xmlChar*>("or"))) | ||
1441 | { | ||
1442 | orlogic = true; | ||
1443 | } | ||
1444 | |||
1445 | xmlFree(key); | ||
1446 | } | ||
1447 | |||
1448 | std::list<selrestr> children; | ||
1449 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | ||
1450 | { | ||
1451 | if (!xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTRS")) | ||
1452 | || !xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
1453 | { | ||
1454 | children.push_back(parseSelrestr(selrestr)); | ||
1455 | } | ||
1456 | } | ||
1457 | |||
1458 | return selrestr(children, orlogic); | ||
1459 | } | ||
1460 | } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
2309 | { | 1461 | { |
2310 | db_error(ppdb, query); | 1462 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("Value")); |
1463 | bool selPos = (std::string(reinterpret_cast<const char*>(key)) == "+"); | ||
1464 | xmlFree(key); | ||
1465 | |||
1466 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type")); | ||
1467 | std::string selRestriction = reinterpret_cast<const char*>(key); | ||
1468 | xmlFree(key); | ||
1469 | |||
1470 | return selrestr(selRestriction, selPos); | ||
1471 | } else { | ||
1472 | throw std::logic_error("Badly formatted selrestr"); | ||
2311 | } | 1473 | } |
2312 | |||
2313 | sqlite3_finalize(ppstmt); | ||
2314 | } | 1474 | } |
2315 | } | 1475 | |
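parseSelrestr above maps the XML onto three shapes: an empty SELRESTRS becomes an empty restriction, a single-child SELRESTRS collapses into its child, and anything larger becomes an and/or group, while each SELRESTR leaf carries a (type, +/-) pair. One plausible way to evaluate such a tree against the restrictions a concept is known to satisfy is sketched below; the restr type and the empty-group semantics are assumptions, not the behavior of the real selrestr class in selrestr.h:

    #include <list>
    #include <set>
    #include <string>

    struct restr
    {
      bool leaf = false;
      std::string restriction;  // leaf only: restriction type
      bool pos = true;          // leaf only: true for "+", false for "-"
      bool orlogic = false;     // group only: true when logic="or"
      std::list<restr> children;
    };

    bool matches(const restr& r, const std::set<std::string>& satisfied)
    {
      if (r.leaf)
      {
        // A "+" leaf requires the restriction; a "-" leaf forbids it.
        return r.pos == (satisfied.count(r.restriction) > 0);
      }

      for (const restr& child : r.children)
      {
        bool m = matches(child, satisfied);
        if (r.orlogic && m) return true;
        if (!r.orlogic && !m) return false;
      }

      // Assumed here: an empty "and" group is vacuously true.
      return !r.orlogic;
    }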
2316 | 1476 | }; | |
2317 | sqlite3_close_v2(ppdb); | 1477 | }; |
2318 | |||
2319 | std::cout << "Done." << std::endl; | ||
2320 | } | ||
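Finally, the removed SQLite code in this hunk repeats one idiom throughout: prepare a statement, bind its parameters, expect SQLITE_DONE from a single step, and finalize. A condensed round trip using the adjective-position update from the old code, with error handling reduced to a single throw (the helper name updatePosition is hypothetical):

    #include <sqlite3.h>
    #include <stdexcept>
    #include <string>

    void updatePosition(sqlite3* db, const std::string& position, int adjectiveId)
    {
      const std::string query =
        "UPDATE adjectives SET position = ? WHERE adjective_id = ?";

      sqlite3_stmt* ppstmt;
      if (sqlite3_prepare_v2(db, query.c_str(), query.size(), &ppstmt, nullptr) != SQLITE_OK)
      {
        throw std::runtime_error(sqlite3_errmsg(db));
      }

      // SQLITE_TRANSIENT makes SQLite take its own copy of the bound string.
      sqlite3_bind_text(ppstmt, 1, position.c_str(), position.size(), SQLITE_TRANSIENT);
      sqlite3_bind_int(ppstmt, 2, adjectiveId);

      if (sqlite3_step(ppstmt) != SQLITE_DONE)
      {
        sqlite3_finalize(ppstmt);
        throw std::runtime_error(sqlite3_errmsg(db));
      }

      sqlite3_finalize(ppstmt);
    }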