about summary refs log tree commit diff stats
path: root/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator.cpp')
-rw-r--r--generator.cpp451
1 files changed, 451 insertions, 0 deletions
diff --git a/generator.cpp b/generator.cpp new file mode 100644 index 0000000..c389963 --- /dev/null +++ b/generator.cpp
@@ -0,0 +1,451 @@
1#include <libxml/parser.h>
2#include <iostream>
3#include <dirent.h>
4#include <set>
5#include <map>
6#include <string>
7#include <vector>
8#include <fstream>
9#include <sqlite3.h>
10#include <sstream>
11#include <regex>
12
// One English verb together with its principal inflected forms.
// Plain aggregate: members stay in this order so brace-initialisation
// (infinitive, past tense, past participle, -ing form, -s form) keeps working.
struct verb
{
  std::string infinitive;       // dictionary form, used as the lookup key
  std::string past_tense;       // simple past form
  std::string past_participle;  // past participle form
  std::string ing_form;         // form ending in -ing
  std::string s_form;           // form ending in -s
};
20
// A named group of words: an identifier plus the set of member words.
// The set keeps members unique and sorted.
struct group
{
  std::string id;                 // identifier of the group
  std::set<std::string> members;  // distinct member words
};
25
26std::map<std::string, group> groups;
27std::map<std::string, verb> verbs;
28std::map<int, std::map<int, int>> wn;
29
// Prints the command-line help to standard output and terminates the process
// with exit status 1. This function never returns, so callers use it as the
// final step of every fatal-error path.
//
// The argument count in the text must agree with the check in main(), which
// requires five arguments after the program name.
void print_usage()
{
  std::cout << "Verbly Datafile Generator" << std::endl;
  std::cout << "-------------------------" << std::endl;
  std::cout << "Requires exactly five arguments." << std::endl;
  std::cout << "1. The path to a VerbNet data directory." << std::endl;
  std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl;
  std::cout << "3. The path to an AGID infl.txt file." << std::endl;
  std::cout << "4. The path to a WordNet prolog data directory." << std::endl;
  std::cout << "5. Datafile output path." << std::endl;

  exit(1);
}
43/*
44void parse_group(xmlNodePtr top, std::string filename)
45{
46 xmlChar* key = xmlGetProp(top, (xmlChar*) "ID");
47 if (key == 0)
48 {
49 std::cout << "Bad VerbNet file format: " << filename << std::endl;
50 print_usage();
51 }
52 std::string vnid = key;
53 vnid = vnid.substr(vnid.find_first_of("-")+1);
54 xmlFree(key);
55
56 group g;
57 g.id = vnid;
58
59 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
60 {
61 if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS"))
62 {
63 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next)
64 {
65 if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER"))
66 {
67 key = xmlGetProp(member, (xmlChar*) "name");
68 g.members.insert(key);
69 xmlFree(key);
70 }
71 }
72 } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES"))
73 {
74 for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next)
75 {
76 if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME"))
77 {
78 for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
79 {
80
81 }
82 }
83 }
84 }
85 }
86}*/
87
88int main(int argc, char** argv)
89{
90 if (argc != 6)
91 {
92 print_usage();
93 }
94
95 /*DIR* dir;
96 if ((dir = opendir(argv[1])) == nullptr)
97 {
98 std::cout << "Invalid VerbNet data directory." << std::endl;
99
100 print_usage();
101 }
102
103 struct dirent* ent;
104 while ((ent = readdir(dir)) != nullptr)
105 {
106 std::string filename(argv[1]);
107 if (filename.back() != '/')
108 {
109 filename += '/';
110 }
111
112 filename += ent->d_name;
113 //std::cout << ent->d_name << std::endl;
114
115 if (filename.rfind(".xml") != filename.size() - 4)
116 {
117 continue;
118 }
119
120 xmlDocPtr doc = xmlParseFile(filename.c_str());
121 if (doc == nullptr)
122 {
123 std::cout << "Error opening " << filename << std::endl;
124 print_usage();
125 }
126
127 xmlNodePtr top = xmlDocGetRootElement(doc);
128 if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS")))
129 {
130 std::cout << "Bad VerbNet file format: " << filename << std::endl;
131 print_usage();
132 }
133
134 parse_group(top, filename);
135 }
136
137 closedir(dir);*/
138
139 // Get verbs from AGID
140 std::cout << "Reading verb inflection..." << std::endl;
141
142 std::ifstream agidfile(argv[3]);
143 if (!agidfile.is_open())
144 {
145 std::cout << "Could not open AGID file: " << argv[3] << std::endl;
146 print_usage();
147 }
148
149 for (;;)
150 {
151 std::string line;
152 if (!getline(agidfile, line))
153 {
154 break;
155 }
156
157 if (line.back() == '\r')
158 {
159 line.pop_back();
160 }
161
162 int divider = line.find_first_of(" ");
163 std::string word = line.substr(0, divider);
164 line = line.substr(divider+1);
165
166 if (line[0] != 'V')
167 {
168 continue;
169 }
170
171 if (line[1] == '?')
172 {
173 line.erase(0, 4);
174 } else {
175 line.erase(0, 3);
176 }
177
178 std::vector<std::string> forms;
179 while (!line.empty())
180 {
181 std::string inflection;
182 if ((divider = line.find(" | ")) != std::string::npos)
183 {
184 inflection = line.substr(0, divider);
185 line = line.substr(divider + 3);
186 } else {
187 inflection = line;
188 line = "";
189 }
190
191 if ((divider = inflection.find_first_of(",?")) != std::string::npos)
192 {
193 inflection = inflection.substr(0, divider);
194 }
195
196 forms.push_back(inflection);
197 }
198
199 verb v;
200 v.infinitive = word;
201 if (forms.size() == 4)
202 {
203 v.past_tense = forms[0];
204 v.past_participle = forms[1];
205 v.ing_form = forms[2];
206 v.s_form = forms[3];
207 } else if (forms.size() == 3)
208 {
209 v.past_tense = forms[0];
210 v.past_participle = forms[0];
211 v.ing_form = forms[1];
212 v.s_form = forms[2];
213 } else if (forms.size() == 8)
214 {
215 // As of AGID 2014.08.11, this is only "to be"
216 v.past_tense = forms[0];
217 v.past_participle = forms[2];
218 v.ing_form = forms[3];
219 v.s_form = forms[4];
220 } else {
221 // Words that don't fit the cases above as of AGID 2014.08.11:
222 // - may and shall do not conjugate the way we want them to
223 // - methinks only has a past tense and is an outlier
224 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
225 std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl;
226 }
227
228 verbs[word] = v;
229 }
230
231 // Start writing output
232 std::cout << "Writing output..." << std::endl;
233
234 sqlite3* ppdb;
235 if (sqlite3_open_v2(argv[5], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK)
236 {
237 std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl;
238 print_usage();
239 }
240
241 std::ifstream schemafile("schema.sql");
242 if (!schemafile.is_open())
243 {
244 std::cout << "Could not find schema file" << std::endl;
245 print_usage();
246 }
247
248 std::stringstream schemabuilder;
249 for (;;)
250 {
251 std::string line;
252 if (!getline(schemafile, line))
253 {
254 break;
255 }
256
257 if (line.back() == '\r')
258 {
259 line.pop_back();
260 }
261
262 schemabuilder << line << std::endl;
263 }
264
265 std::string schema = schemabuilder.str();
266 while (!schema.empty())
267 {
268 std::string query;
269 int divider = schema.find(";");
270 if (divider != std::string::npos)
271 {
272 query = schema.substr(0, divider+1);
273 schema = schema.substr(divider+2);
274 } else {
275 break;
276 }
277
278 sqlite3_stmt* schmstmt;
279 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK)
280 {
281 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
282 sqlite3_close_v2(ppdb);
283 print_usage();
284 }
285
286 if (sqlite3_step(schmstmt) != SQLITE_DONE)
287 {
288 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
289 sqlite3_close_v2(ppdb);
290 print_usage();
291 }
292
293 sqlite3_finalize(schmstmt);
294 }
295
296 std::cout << "Writing verbs..." << std::endl;
297 for (auto& mapping : verbs)
298 {
299 sqlite3_stmt* ppstmt;
300 std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)");
301 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
302 {
303 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
304 sqlite3_close_v2(ppdb);
305 print_usage();
306 }
307
308 sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC);
309 sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC);
310 sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC);
311 sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC);
312 sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC);
313
314 if (sqlite3_step(ppstmt) != SQLITE_DONE)
315 {
316 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
317 sqlite3_close_v2(ppdb);
318 print_usage();
319 }
320
321 sqlite3_finalize(ppstmt);
322 }
323
324 // Get nouns/adjectives/adverbs from WordNet
325 // Useful relations:
326 // - s: master list
327 // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness)
328 // - at: variation (e.g. a measurement can be standard or nonstandard)
329 // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue)
330 // - ins: instantiation (do we need this? let's see)
331 // - mm: member meronymy/holonymy (e.g. family/mother, family/child)
332 // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire)
333 // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber)
334 // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska)
335 // mannernymy (e.g. something done quickly is done in a manner that is quick)
336 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
337 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
338 // - syntax: positioning flags for some adjectives
339 std::string wnpref {argv[4]};
340 if (wnpref.back() != '/')
341 {
342 wnpref += '/';
343 }
344
345 std::cout << "Reading words from WordNet..." << std::endl;
346 std::ifstream wnsfile(wnpref + "wn_s.pl");
347 if (!wnsfile.is_open())
348 {
349 std::cout << "Invalid WordNet data directory." << std::endl;
350 print_usage();
351 }
352
353 for (;;)
354 {
355 std::string line;
356 if (!getline(wnsfile, line))
357 {
358 break;
359 }
360
361 if (line.back() == '\r')
362 {
363 line.pop_back();
364 }
365
366 std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',");
367 std::smatch relation_data;
368 if (!std::regex_search(line, relation_data, relation))
369 {
370 continue;
371 }
372
373 int synset_id = stoi(relation_data[1]);
374 int wnum = stoi(relation_data[2]);
375 std::string word = relation_data[3];
376
377 std::string query;
378 switch (synset_id / 100000000)
379 {
380 case 1: // Noun
381 {
382 query = "INSERT INTO nouns (form) VALUES (?)";
383
384 break;
385 }
386
387 case 2: // Verb
388 {
389 // Ignore
390
391 break;
392 }
393
394 case 3: // Adjective
395 {
396 query = "INSERT INTO adjectives (form) VALUES (?)";
397
398 break;
399 }
400
401 case 4: // Adverb
402 {
403 query = "INSERT INTO adverbs (form) VALUES (?)";
404
405 break;
406 }
407 }
408
409 sqlite3_stmt* ppstmt;
410 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
411 {
412 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
413 sqlite3_close_v2(ppdb);
414 print_usage();
415 }
416
417 sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC);
418
419 if (sqlite3_step(ppstmt) != SQLITE_DONE)
420 {
421 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
422 sqlite3_close_v2(ppdb);
423 print_usage();
424 }
425
426 sqlite3_finalize(ppstmt);
427
428 query = "SELECT last_insert_rowid()";
429 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
430 {
431 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
432 sqlite3_close_v2(ppdb);
433 print_usage();
434 }
435
436 if (sqlite3_step(ppstmt) != SQLITE_ROW)
437 {
438 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
439 sqlite3_close_v2(ppdb);
440 print_usage();
441 }
442
443 wn[synset_id][wnum] = sqlite3_column_int(ppstmt, 0);
444
445 sqlite3_finalize(ppstmt);
446 }
447
448 sqlite3_close_v2(ppdb);
449
450 std::cout << "Done." << std::endl;
451} \ No newline at end of file