diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-03-10 21:34:55 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-03-10 21:34:55 -0500 |
commit | e1be2716746e75cf6ed37e86461a7f580a964564 (patch) | |
tree | 38a69a8cbd690f27f1ee8c2ce43eeb0333753d52 /generator.cpp | |
parent | 41decb9a671e4d0fbbe12533372435ec6ede2246 (diff) | |
download | furries-e1be2716746e75cf6ed37e86461a7f580a964564.tar.gz furries-e1be2716746e75cf6ed37e86461a7f580a964564.tar.bz2 furries-e1be2716746e75cf6ed37e86461a7f580a964564.zip |
Started implementing verbly data generator
Currently, the generator: - Uses AGID to create entries for verb words and their inflections - Uses WordNet to create entries for adjective, adverb, and noun senses
Diffstat (limited to 'generator.cpp')
-rw-r--r-- | generator.cpp | 451 |
1 files changed, 451 insertions, 0 deletions
diff --git a/generator.cpp b/generator.cpp new file mode 100644 index 0000000..c389963 --- /dev/null +++ b/generator.cpp | |||
@@ -0,0 +1,451 @@ | |||
1 | #include <libxml/parser.h> | ||
2 | #include <iostream> | ||
3 | #include <dirent.h> | ||
4 | #include <set> | ||
5 | #include <map> | ||
6 | #include <string> | ||
7 | #include <vector> | ||
8 | #include <fstream> | ||
9 | #include <sqlite3.h> | ||
10 | #include <sstream> | ||
11 | #include <regex> | ||
12 | |||
// One verb together with the inflected forms that are written to the
// "verbs" table of the output datafile.
struct verb {
  std::string infinitive;      // base form; also the key used in the verbs map
  std::string past_tense;      // simple past form
  std::string past_participle; // past participle form
  std::string ing_form;        // "-ing" form
  std::string s_form;          // "-s" form
};
20 | |||
// A named group of member words. The only code that fills one in is the
// currently commented-out VerbNet parser (parse_group).
struct group {
  std::string id;                // group identifier
  std::set<std::string> members; // distinct member names
};
25 | |||
26 | std::map<std::string, group> groups; | ||
27 | std::map<std::string, verb> verbs; | ||
28 | std::map<int, std::map<int, int>> wn; | ||
29 | |||
// Prints the command-line help text to standard output and then terminates
// the process with exit status 1. This function never returns.
//
// main() requires argc == 6, i.e. the program name plus the five arguments
// listed below, so the summary line states five.
void print_usage()
{
  std::cout << "Verbly Datafile Generator" << std::endl;
  std::cout << "-------------------------" << std::endl;
  std::cout << "Requires exactly five arguments." << std::endl;
  std::cout << "1. The path to a VerbNet data directory." << std::endl;
  std::cout << "2. The path to a SemLink vnpbMappings file." << std::endl;
  std::cout << "3. The path to an AGID infl.txt file." << std::endl;
  std::cout << "4. The path to a WordNet prolog data directory." << std::endl;
  std::cout << "5. Datafile output path." << std::endl;

  exit(1);
}
43 | /* | ||
44 | void parse_group(xmlNodePtr top, std::string filename) | ||
45 | { | ||
46 | xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); | ||
47 | if (key == 0) | ||
48 | { | ||
49 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
50 | print_usage(); | ||
51 | } | ||
52 | std::string vnid = key; | ||
53 | vnid = vnid.substr(vnid.find_first_of("-")+1); | ||
54 | xmlFree(key); | ||
55 | |||
56 | group g; | ||
57 | g.id = vnid; | ||
58 | |||
59 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
60 | { | ||
61 | if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) | ||
62 | { | ||
63 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) | ||
64 | { | ||
65 | if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) | ||
66 | { | ||
67 | key = xmlGetProp(member, (xmlChar*) "name"); | ||
68 | g.members.insert(key); | ||
69 | xmlFree(key); | ||
70 | } | ||
71 | } | ||
72 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) | ||
73 | { | ||
74 | for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) | ||
75 | { | ||
76 | if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) | ||
77 | { | ||
78 | for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) | ||
79 | { | ||
80 | |||
81 | } | ||
82 | } | ||
83 | } | ||
84 | } | ||
85 | } | ||
86 | }*/ | ||
87 | |||
88 | int main(int argc, char** argv) | ||
89 | { | ||
90 | if (argc != 6) | ||
91 | { | ||
92 | print_usage(); | ||
93 | } | ||
94 | |||
95 | /*DIR* dir; | ||
96 | if ((dir = opendir(argv[1])) == nullptr) | ||
97 | { | ||
98 | std::cout << "Invalid VerbNet data directory." << std::endl; | ||
99 | |||
100 | print_usage(); | ||
101 | } | ||
102 | |||
103 | struct dirent* ent; | ||
104 | while ((ent = readdir(dir)) != nullptr) | ||
105 | { | ||
106 | std::string filename(argv[1]); | ||
107 | if (filename.back() != '/') | ||
108 | { | ||
109 | filename += '/'; | ||
110 | } | ||
111 | |||
112 | filename += ent->d_name; | ||
113 | //std::cout << ent->d_name << std::endl; | ||
114 | |||
115 | if (filename.rfind(".xml") != filename.size() - 4) | ||
116 | { | ||
117 | continue; | ||
118 | } | ||
119 | |||
120 | xmlDocPtr doc = xmlParseFile(filename.c_str()); | ||
121 | if (doc == nullptr) | ||
122 | { | ||
123 | std::cout << "Error opening " << filename << std::endl; | ||
124 | print_usage(); | ||
125 | } | ||
126 | |||
127 | xmlNodePtr top = xmlDocGetRootElement(doc); | ||
128 | if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) | ||
129 | { | ||
130 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
131 | print_usage(); | ||
132 | } | ||
133 | |||
134 | parse_group(top, filename); | ||
135 | } | ||
136 | |||
137 | closedir(dir);*/ | ||
138 | |||
139 | // Get verbs from AGID | ||
140 | std::cout << "Reading verb inflection..." << std::endl; | ||
141 | |||
142 | std::ifstream agidfile(argv[3]); | ||
143 | if (!agidfile.is_open()) | ||
144 | { | ||
145 | std::cout << "Could not open AGID file: " << argv[3] << std::endl; | ||
146 | print_usage(); | ||
147 | } | ||
148 | |||
149 | for (;;) | ||
150 | { | ||
151 | std::string line; | ||
152 | if (!getline(agidfile, line)) | ||
153 | { | ||
154 | break; | ||
155 | } | ||
156 | |||
157 | if (line.back() == '\r') | ||
158 | { | ||
159 | line.pop_back(); | ||
160 | } | ||
161 | |||
162 | int divider = line.find_first_of(" "); | ||
163 | std::string word = line.substr(0, divider); | ||
164 | line = line.substr(divider+1); | ||
165 | |||
166 | if (line[0] != 'V') | ||
167 | { | ||
168 | continue; | ||
169 | } | ||
170 | |||
171 | if (line[1] == '?') | ||
172 | { | ||
173 | line.erase(0, 4); | ||
174 | } else { | ||
175 | line.erase(0, 3); | ||
176 | } | ||
177 | |||
178 | std::vector<std::string> forms; | ||
179 | while (!line.empty()) | ||
180 | { | ||
181 | std::string inflection; | ||
182 | if ((divider = line.find(" | ")) != std::string::npos) | ||
183 | { | ||
184 | inflection = line.substr(0, divider); | ||
185 | line = line.substr(divider + 3); | ||
186 | } else { | ||
187 | inflection = line; | ||
188 | line = ""; | ||
189 | } | ||
190 | |||
191 | if ((divider = inflection.find_first_of(",?")) != std::string::npos) | ||
192 | { | ||
193 | inflection = inflection.substr(0, divider); | ||
194 | } | ||
195 | |||
196 | forms.push_back(inflection); | ||
197 | } | ||
198 | |||
199 | verb v; | ||
200 | v.infinitive = word; | ||
201 | if (forms.size() == 4) | ||
202 | { | ||
203 | v.past_tense = forms[0]; | ||
204 | v.past_participle = forms[1]; | ||
205 | v.ing_form = forms[2]; | ||
206 | v.s_form = forms[3]; | ||
207 | } else if (forms.size() == 3) | ||
208 | { | ||
209 | v.past_tense = forms[0]; | ||
210 | v.past_participle = forms[0]; | ||
211 | v.ing_form = forms[1]; | ||
212 | v.s_form = forms[2]; | ||
213 | } else if (forms.size() == 8) | ||
214 | { | ||
215 | // As of AGID 2014.08.11, this is only "to be" | ||
216 | v.past_tense = forms[0]; | ||
217 | v.past_participle = forms[2]; | ||
218 | v.ing_form = forms[3]; | ||
219 | v.s_form = forms[4]; | ||
220 | } else { | ||
221 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
222 | // - may and shall do not conjugate the way we want them to | ||
223 | // - methinks only has a past tense and is an outlier | ||
224 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
225 | std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
226 | } | ||
227 | |||
228 | verbs[word] = v; | ||
229 | } | ||
230 | |||
231 | // Start writing output | ||
232 | std::cout << "Writing output..." << std::endl; | ||
233 | |||
234 | sqlite3* ppdb; | ||
235 | if (sqlite3_open_v2(argv[5], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | ||
236 | { | ||
237 | std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; | ||
238 | print_usage(); | ||
239 | } | ||
240 | |||
241 | std::ifstream schemafile("schema.sql"); | ||
242 | if (!schemafile.is_open()) | ||
243 | { | ||
244 | std::cout << "Could not find schema file" << std::endl; | ||
245 | print_usage(); | ||
246 | } | ||
247 | |||
248 | std::stringstream schemabuilder; | ||
249 | for (;;) | ||
250 | { | ||
251 | std::string line; | ||
252 | if (!getline(schemafile, line)) | ||
253 | { | ||
254 | break; | ||
255 | } | ||
256 | |||
257 | if (line.back() == '\r') | ||
258 | { | ||
259 | line.pop_back(); | ||
260 | } | ||
261 | |||
262 | schemabuilder << line << std::endl; | ||
263 | } | ||
264 | |||
265 | std::string schema = schemabuilder.str(); | ||
266 | while (!schema.empty()) | ||
267 | { | ||
268 | std::string query; | ||
269 | int divider = schema.find(";"); | ||
270 | if (divider != std::string::npos) | ||
271 | { | ||
272 | query = schema.substr(0, divider+1); | ||
273 | schema = schema.substr(divider+2); | ||
274 | } else { | ||
275 | break; | ||
276 | } | ||
277 | |||
278 | sqlite3_stmt* schmstmt; | ||
279 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) | ||
280 | { | ||
281 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
282 | sqlite3_close_v2(ppdb); | ||
283 | print_usage(); | ||
284 | } | ||
285 | |||
286 | if (sqlite3_step(schmstmt) != SQLITE_DONE) | ||
287 | { | ||
288 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
289 | sqlite3_close_v2(ppdb); | ||
290 | print_usage(); | ||
291 | } | ||
292 | |||
293 | sqlite3_finalize(schmstmt); | ||
294 | } | ||
295 | |||
296 | std::cout << "Writing verbs..." << std::endl; | ||
297 | for (auto& mapping : verbs) | ||
298 | { | ||
299 | sqlite3_stmt* ppstmt; | ||
300 | std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); | ||
301 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
302 | { | ||
303 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
304 | sqlite3_close_v2(ppdb); | ||
305 | print_usage(); | ||
306 | } | ||
307 | |||
308 | sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_STATIC); | ||
309 | sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_STATIC); | ||
310 | sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_STATIC); | ||
311 | sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_STATIC); | ||
312 | sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_STATIC); | ||
313 | |||
314 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
315 | { | ||
316 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
317 | sqlite3_close_v2(ppdb); | ||
318 | print_usage(); | ||
319 | } | ||
320 | |||
321 | sqlite3_finalize(ppstmt); | ||
322 | } | ||
323 | |||
324 | // Get nouns/adjectives/adverbs from WordNet | ||
325 | // Useful relations: | ||
326 | // - s: master list | ||
327 | // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) | ||
328 | // - at: variation (e.g. a measurement can be standard or nonstandard) | ||
329 | // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) | ||
330 | // - ins: instantiation (do we need this? let's see) | ||
331 | // - mm: member meronymy/holonymy (e.g. family/mother, family/child) | ||
332 | // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) | ||
333 | // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) | ||
334 | // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) | ||
335 | // mannernymy (e.g. something done quickly is done in a manner that is quick) | ||
336 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) | ||
337 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) | ||
338 | // - syntax: positioning flags for some adjectives | ||
339 | std::string wnpref {argv[4]}; | ||
340 | if (wnpref.back() != '/') | ||
341 | { | ||
342 | wnpref += '/'; | ||
343 | } | ||
344 | |||
345 | std::cout << "Reading words from WordNet..." << std::endl; | ||
346 | std::ifstream wnsfile(wnpref + "wn_s.pl"); | ||
347 | if (!wnsfile.is_open()) | ||
348 | { | ||
349 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
350 | print_usage(); | ||
351 | } | ||
352 | |||
353 | for (;;) | ||
354 | { | ||
355 | std::string line; | ||
356 | if (!getline(wnsfile, line)) | ||
357 | { | ||
358 | break; | ||
359 | } | ||
360 | |||
361 | if (line.back() == '\r') | ||
362 | { | ||
363 | line.pop_back(); | ||
364 | } | ||
365 | |||
366 | std::regex relation("^s\\(([134]\\d{8}),(\\d+),'([\\w ]+)',"); | ||
367 | std::smatch relation_data; | ||
368 | if (!std::regex_search(line, relation_data, relation)) | ||
369 | { | ||
370 | continue; | ||
371 | } | ||
372 | |||
373 | int synset_id = stoi(relation_data[1]); | ||
374 | int wnum = stoi(relation_data[2]); | ||
375 | std::string word = relation_data[3]; | ||
376 | |||
377 | std::string query; | ||
378 | switch (synset_id / 100000000) | ||
379 | { | ||
380 | case 1: // Noun | ||
381 | { | ||
382 | query = "INSERT INTO nouns (form) VALUES (?)"; | ||
383 | |||
384 | break; | ||
385 | } | ||
386 | |||
387 | case 2: // Verb | ||
388 | { | ||
389 | // Ignore | ||
390 | |||
391 | break; | ||
392 | } | ||
393 | |||
394 | case 3: // Adjective | ||
395 | { | ||
396 | query = "INSERT INTO adjectives (form) VALUES (?)"; | ||
397 | |||
398 | break; | ||
399 | } | ||
400 | |||
401 | case 4: // Adverb | ||
402 | { | ||
403 | query = "INSERT INTO adverbs (form) VALUES (?)"; | ||
404 | |||
405 | break; | ||
406 | } | ||
407 | } | ||
408 | |||
409 | sqlite3_stmt* ppstmt; | ||
410 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
411 | { | ||
412 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
413 | sqlite3_close_v2(ppdb); | ||
414 | print_usage(); | ||
415 | } | ||
416 | |||
417 | sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_STATIC); | ||
418 | |||
419 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
420 | { | ||
421 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
422 | sqlite3_close_v2(ppdb); | ||
423 | print_usage(); | ||
424 | } | ||
425 | |||
426 | sqlite3_finalize(ppstmt); | ||
427 | |||
428 | query = "SELECT last_insert_rowid()"; | ||
429 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
430 | { | ||
431 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
432 | sqlite3_close_v2(ppdb); | ||
433 | print_usage(); | ||
434 | } | ||
435 | |||
436 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
437 | { | ||
438 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
439 | sqlite3_close_v2(ppdb); | ||
440 | print_usage(); | ||
441 | } | ||
442 | |||
443 | wn[synset_id][wnum] = sqlite3_column_int(ppstmt, 0); | ||
444 | |||
445 | sqlite3_finalize(ppstmt); | ||
446 | } | ||
447 | |||
448 | sqlite3_close_v2(ppdb); | ||
449 | |||
450 | std::cout << "Done." << std::endl; | ||
451 | } \ No newline at end of file | ||