diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-03-24 23:16:07 -0400 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-03-24 23:16:07 -0400 |
| commit | eef5de613c75661e5d94baa086f6f2ddc26c7ed0 (patch) | |
| tree | 180230f6a245c5bca94d894273f5d2b93ded3f04 /generator/generator.cpp | |
| parent | d5ee4e39e5b5b3b8daa85cd972802195ad35e965 (diff) | |
| download | verbly-eef5de613c75661e5d94baa086f6f2ddc26c7ed0.tar.gz verbly-eef5de613c75661e5d94baa086f6f2ddc26c7ed0.tar.bz2 verbly-eef5de613c75661e5d94baa086f6f2ddc26c7ed0.zip | |
Added verb frames
In addition: - Added prepositions. - Rewrote much of the query interface. For many relationships, it now supports nested AND, OR, and NOT logic. - Rewrote the token class. It is now a union-like class instead of being polymorphic, which means smart pointers are no longer necessary. - Querying with regard to word derivation has been temporarily removed. - Sentinel values are now supported for all word types. - The VerbNet data retrieved from http://verbs.colorado.edu/~mpalmer/projects/verbnet/downloads.html was found to be not entirely satisfactory in some respects, especially regarding adjective phrases. A patch file is now included in the repository describing the changes made to the VerbNet v3.2 download for the canonical verbly datafile.
Diffstat (limited to 'generator/generator.cpp')
| -rw-r--r-- | generator/generator.cpp | 545 |
1 files changed, 523 insertions, 22 deletions
| diff --git a/generator/generator.cpp b/generator/generator.cpp index 7ec94df..aea750c 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
| @@ -11,36 +11,75 @@ | |||
| 11 | #include <regex> | 11 | #include <regex> |
| 12 | #include <list> | 12 | #include <list> |
| 13 | #include <algorithm> | 13 | #include <algorithm> |
| 14 | #include <json.hpp> | ||
| 14 | #include "progress.h" | 15 | #include "progress.h" |
| 16 | #include "../lib/util.h" | ||
| 15 | 17 | ||
| 16 | struct verb { | 18 | using json = nlohmann::json; |
| 19 | |||
| 20 | struct verb_t { | ||
| 17 | std::string infinitive; | 21 | std::string infinitive; |
| 18 | std::string past_tense; | 22 | std::string past_tense; |
| 19 | std::string past_participle; | 23 | std::string past_participle; |
| 20 | std::string ing_form; | 24 | std::string ing_form; |
| 21 | std::string s_form; | 25 | std::string s_form; |
| 26 | int id; | ||
| 22 | }; | 27 | }; |
| 23 | 28 | ||
| 24 | struct adjective { | 29 | struct adjective_t { |
| 25 | std::string base; | 30 | std::string base; |
| 26 | std::string comparative; | 31 | std::string comparative; |
| 27 | std::string superlative; | 32 | std::string superlative; |
| 28 | }; | 33 | }; |
| 29 | 34 | ||
| 30 | struct noun { | 35 | struct noun_t { |
| 31 | std::string singular; | 36 | std::string singular; |
| 32 | std::string plural; | 37 | std::string plural; |
| 33 | }; | 38 | }; |
| 34 | 39 | ||
| 35 | struct group { | 40 | struct selrestr_t { |
| 41 | enum class type_t { | ||
| 42 | singleton, | ||
| 43 | andlogic, | ||
| 44 | orlogic, | ||
| 45 | empty | ||
| 46 | }; | ||
| 47 | type_t type; | ||
| 48 | std::string restriction; | ||
| 49 | bool pos; | ||
| 50 | std::list<selrestr_t> subordinates; | ||
| 51 | }; | ||
| 52 | |||
| 53 | struct framepart_t { | ||
| 54 | enum class type_t { | ||
| 55 | np, | ||
| 56 | v, | ||
| 57 | pp, | ||
| 58 | adj, | ||
| 59 | adv, | ||
| 60 | lex | ||
| 61 | }; | ||
| 62 | type_t type; | ||
| 63 | std::string role; | ||
| 64 | selrestr_t selrestrs; | ||
| 65 | std::set<std::string> preprestrs; | ||
| 66 | std::set<std::string> synrestrs; | ||
| 67 | std::list<std::string> choices; | ||
| 68 | std::string lexval; | ||
| 69 | }; | ||
| 70 | |||
| 71 | struct group_t { | ||
| 36 | std::string id; | 72 | std::string id; |
| 73 | std::string parent; | ||
| 37 | std::set<std::string> members; | 74 | std::set<std::string> members; |
| 75 | std::map<std::string, selrestr_t> roles; | ||
| 76 | std::list<std::list<framepart_t>> frames; | ||
| 38 | }; | 77 | }; |
| 39 | 78 | ||
| 40 | std::map<std::string, group> groups; | 79 | std::map<std::string, group_t> groups; |
| 41 | std::map<std::string, verb> verbs; | 80 | std::map<std::string, verb_t> verbs; |
| 42 | std::map<std::string, adjective> adjectives; | 81 | std::map<std::string, adjective_t> adjectives; |
| 43 | std::map<std::string, noun> nouns; | 82 | std::map<std::string, noun_t> nouns; |
| 44 | std::map<int, std::map<int, int>> wn; | 83 | std::map<int, std::map<int, int>> wn; |
| 45 | std::map<std::string, std::set<std::string>> pronunciations; | 84 | std::map<std::string, std::set<std::string>> pronunciations; |
| 46 | 85 | ||
| @@ -59,15 +98,97 @@ void print_usage() | |||
| 59 | exit(1); | 98 | exit(1); |
| 60 | } | 99 | } |
| 61 | 100 | ||
| 62 | void db_error(sqlite3* ppdb, std::string) | 101 | void db_error(sqlite3* ppdb, std::string query) |
| 63 | { | 102 | { |
| 64 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | 103 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; |
| 104 | std::cout << query << std::endl; | ||
| 65 | sqlite3_close_v2(ppdb); | 105 | sqlite3_close_v2(ppdb); |
| 66 | print_usage(); | 106 | print_usage(); |
| 67 | } | 107 | } |
| 68 | 108 | ||
| 69 | /* | 109 | json export_selrestrs(selrestr_t r) |
| 70 | void parse_group(xmlNodePtr top, std::string filename) | 110 | { |
| 111 | if (r.type == selrestr_t::type_t::empty) | ||
| 112 | { | ||
| 113 | return {}; | ||
| 114 | } else if (r.type == selrestr_t::type_t::singleton) | ||
| 115 | { | ||
| 116 | json result; | ||
| 117 | result["type"] = r.restriction; | ||
| 118 | result["pos"] = r.pos; | ||
| 119 | return result; | ||
| 120 | } else { | ||
| 121 | json result; | ||
| 122 | if (r.type == selrestr_t::type_t::andlogic) | ||
| 123 | { | ||
| 124 | result["logic"] = "and"; | ||
| 125 | } else { | ||
| 126 | result["logic"] = "or"; | ||
| 127 | } | ||
| 128 | |||
| 129 | std::list<json> outlist; | ||
| 130 | std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs); | ||
| 131 | result["children"] = outlist; | ||
| 132 | |||
| 133 | return result; | ||
| 134 | } | ||
| 135 | } | ||
| 136 | |||
| 137 | selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) | ||
| 138 | { | ||
| 139 | selrestr_t r; | ||
| 140 | xmlChar* key; | ||
| 141 | |||
| 142 | if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) | ||
| 143 | { | ||
| 144 | if (xmlChildElementCount(top) == 0) | ||
| 145 | { | ||
| 146 | r.type = selrestr_t::type_t::empty; | ||
| 147 | } else if (xmlChildElementCount(top) == 1) | ||
| 148 | { | ||
| 149 | r = parse_selrestrs(xmlFirstElementChild(top), filename); | ||
| 150 | } else { | ||
| 151 | r.type = selrestr_t::type_t::andlogic; | ||
| 152 | |||
| 153 | if (xmlHasProp(top, (const xmlChar*) "logic")) | ||
| 154 | { | ||
| 155 | key = xmlGetProp(top, (const xmlChar*) "logic"); | ||
| 156 | if (!xmlStrcmp(key, (const xmlChar*) "or")) | ||
| 157 | { | ||
| 158 | r.type = selrestr_t::type_t::orlogic; | ||
| 159 | } | ||
| 160 | xmlFree(key); | ||
| 161 | } | ||
| 162 | |||
| 163 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | ||
| 164 | { | ||
| 165 | if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) | ||
| 166 | { | ||
| 167 | r.subordinates.push_back(parse_selrestrs(selrestr, filename)); | ||
| 168 | } | ||
| 169 | } | ||
| 170 | } | ||
| 171 | } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) | ||
| 172 | { | ||
| 173 | r.type = selrestr_t::type_t::singleton; | ||
| 174 | |||
| 175 | key = xmlGetProp(top, (xmlChar*) "Value"); | ||
| 176 | r.pos = (std::string((const char*)key) == "+"); | ||
| 177 | xmlFree(key); | ||
| 178 | |||
| 179 | key = xmlGetProp(top, (xmlChar*) "type"); | ||
| 180 | r.restriction = (const char*) key; | ||
| 181 | xmlFree(key); | ||
| 182 | } else { | ||
| 183 | // Invalid | ||
| 184 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
| 185 | print_usage(); | ||
| 186 | } | ||
| 187 | |||
| 188 | return r; | ||
| 189 | } | ||
| 190 | |||
| 191 | group_t& parse_group(xmlNodePtr top, std::string filename) | ||
| 71 | { | 192 | { |
| 72 | xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); | 193 | xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); |
| 73 | if (key == 0) | 194 | if (key == 0) |
| @@ -75,41 +196,183 @@ void parse_group(xmlNodePtr top, std::string filename) | |||
| 75 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | 196 | std::cout << "Bad VerbNet file format: " << filename << std::endl; |
| 76 | print_usage(); | 197 | print_usage(); |
| 77 | } | 198 | } |
| 78 | std::string vnid = key; | 199 | std::string vnid = (const char*)key; |
| 79 | vnid = vnid.substr(vnid.find_first_of("-")+1); | 200 | vnid = vnid.substr(vnid.find_first_of("-")+1); |
| 80 | xmlFree(key); | 201 | xmlFree(key); |
| 81 | 202 | ||
| 82 | group g; | 203 | group_t g; |
| 83 | g.id = vnid; | 204 | g.id = vnid; |
| 84 | 205 | ||
| 85 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | 206 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) |
| 86 | { | 207 | { |
| 87 | if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) | 208 | if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES")) |
| 209 | { | ||
| 210 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) | ||
| 211 | { | ||
| 212 | if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) | ||
| 213 | { | ||
| 214 | auto& sg = parse_group(subclass, filename); | ||
| 215 | sg.parent = vnid; | ||
| 216 | |||
| 217 | for (auto member : sg.members) | ||
| 218 | { | ||
| 219 | g.members.insert(member); | ||
| 220 | } | ||
| 221 | |||
| 222 | // The schema requires that subclasses appear after role definitions, so we can do this now | ||
| 223 | for (auto role : g.roles) | ||
| 224 | { | ||
| 225 | if (sg.roles.count(role.first) == 0) | ||
| 226 | { | ||
| 227 | sg.roles[role.first] = role.second; | ||
| 228 | } | ||
| 229 | } | ||
| 230 | } | ||
| 231 | } | ||
| 232 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) | ||
| 88 | { | 233 | { |
| 89 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) | 234 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) |
| 90 | { | 235 | { |
| 91 | if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) | 236 | if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) |
| 92 | { | 237 | { |
| 93 | key = xmlGetProp(member, (xmlChar*) "name"); | 238 | key = xmlGetProp(member, (xmlChar*) "name"); |
| 94 | g.members.insert(key); | 239 | g.members.insert((const char*)key); |
| 95 | xmlFree(key); | 240 | xmlFree(key); |
| 96 | } | 241 | } |
| 97 | } | 242 | } |
| 243 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) | ||
| 244 | { | ||
| 245 | for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) | ||
| 246 | { | ||
| 247 | if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) | ||
| 248 | { | ||
| 249 | selrestr_t r; | ||
| 250 | r.type = selrestr_t::type_t::empty; | ||
| 251 | |||
| 252 | key = xmlGetProp(role, (const xmlChar*) "type"); | ||
| 253 | std::string type = (const char*)key; | ||
| 254 | xmlFree(key); | ||
| 255 | |||
| 256 | for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
| 257 | { | ||
| 258 | if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS")) | ||
| 259 | { | ||
| 260 | r = parse_selrestrs(rolenode, filename); | ||
| 261 | } | ||
| 262 | } | ||
| 263 | |||
| 264 | g.roles[type] = r; | ||
| 265 | } | ||
| 266 | } | ||
| 98 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) | 267 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) |
| 99 | { | 268 | { |
| 100 | for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) | 269 | for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) |
| 101 | { | 270 | { |
| 102 | if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) | 271 | if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) |
| 103 | { | 272 | { |
| 273 | std::list<framepart_t> f; | ||
| 274 | |||
| 104 | for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) | 275 | for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) |
| 105 | { | 276 | { |
| 106 | 277 | if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX")) | |
| 278 | { | ||
| 279 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
| 280 | { | ||
| 281 | framepart_t fp; | ||
| 282 | |||
| 283 | if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP")) | ||
| 284 | { | ||
| 285 | fp.type = framepart_t::type_t::np; | ||
| 286 | |||
| 287 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
| 288 | fp.role = (const char*)key; | ||
| 289 | xmlFree(key); | ||
| 290 | |||
| 291 | fp.selrestrs.type = selrestr_t::type_t::empty; | ||
| 292 | |||
| 293 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
| 294 | { | ||
| 295 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS")) | ||
| 296 | { | ||
| 297 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
| 298 | { | ||
| 299 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR")) | ||
| 300 | { | ||
| 301 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
| 302 | fp.synrestrs.insert(std::string((const char*)key)); | ||
| 303 | xmlFree(key); | ||
| 304 | } | ||
| 305 | } | ||
| 306 | } | ||
| 307 | |||
| 308 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
| 309 | { | ||
| 310 | fp.selrestrs = parse_selrestrs(npnode, filename); | ||
| 311 | } | ||
| 312 | } | ||
| 313 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB")) | ||
| 314 | { | ||
| 315 | fp.type = framepart_t::type_t::v; | ||
| 316 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP")) | ||
| 317 | { | ||
| 318 | fp.type = framepart_t::type_t::pp; | ||
| 319 | |||
| 320 | if (xmlHasProp(syntaxnode, (xmlChar*) "value")) | ||
| 321 | { | ||
| 322 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
| 323 | std::string choices = (const char*)key; | ||
| 324 | xmlFree(key); | ||
| 325 | |||
| 326 | fp.choices = verbly::split<std::list<std::string>>(choices, " "); | ||
| 327 | } | ||
| 328 | |||
| 329 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
| 330 | { | ||
| 331 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
| 332 | { | ||
| 333 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
| 334 | { | ||
| 335 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR")) | ||
| 336 | { | ||
| 337 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
| 338 | fp.preprestrs.insert(std::string((const char*)key)); | ||
| 339 | xmlFree(key); | ||
| 340 | } | ||
| 341 | } | ||
| 342 | } | ||
| 343 | } | ||
| 344 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ")) | ||
| 345 | { | ||
| 346 | fp.type = framepart_t::type_t::adj; | ||
| 347 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV")) | ||
| 348 | { | ||
| 349 | fp.type = framepart_t::type_t::adv; | ||
| 350 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX")) | ||
| 351 | { | ||
| 352 | fp.type = framepart_t::type_t::lex; | ||
| 353 | |||
| 354 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
| 355 | fp.lexval = (const char*)key; | ||
| 356 | xmlFree(key); | ||
| 357 | } else { | ||
| 358 | continue; | ||
| 359 | } | ||
| 360 | |||
| 361 | f.push_back(fp); | ||
| 362 | } | ||
| 363 | |||
| 364 | g.frames.push_back(f); | ||
| 365 | } | ||
| 107 | } | 366 | } |
| 108 | } | 367 | } |
| 109 | } | 368 | } |
| 110 | } | 369 | } |
| 111 | } | 370 | } |
| 112 | }*/ | 371 | |
| 372 | groups[vnid] = g; | ||
| 373 | |||
| 374 | return groups[vnid]; | ||
| 375 | } | ||
| 113 | 376 | ||
| 114 | int main(int argc, char** argv) | 377 | int main(int argc, char** argv) |
| 115 | { | 378 | { |
| @@ -118,7 +381,10 @@ int main(int argc, char** argv) | |||
| 118 | print_usage(); | 381 | print_usage(); |
| 119 | } | 382 | } |
| 120 | 383 | ||
| 121 | /*DIR* dir; | 384 | // VerbNet data |
| 385 | std::cout << "Reading verb frames..." << std::endl; | ||
| 386 | |||
| 387 | DIR* dir; | ||
| 122 | if ((dir = opendir(argv[1])) == nullptr) | 388 | if ((dir = opendir(argv[1])) == nullptr) |
| 123 | { | 389 | { |
| 124 | std::cout << "Invalid VerbNet data directory." << std::endl; | 390 | std::cout << "Invalid VerbNet data directory." << std::endl; |
| @@ -160,7 +426,7 @@ int main(int argc, char** argv) | |||
| 160 | parse_group(top, filename); | 426 | parse_group(top, filename); |
| 161 | } | 427 | } |
| 162 | 428 | ||
| 163 | closedir(dir);*/ | 429 | closedir(dir); |
| 164 | 430 | ||
| 165 | // Get verbs from AGID | 431 | // Get verbs from AGID |
| 166 | std::cout << "Reading inflections..." << std::endl; | 432 | std::cout << "Reading inflections..." << std::endl; |
| @@ -222,7 +488,7 @@ int main(int argc, char** argv) | |||
| 222 | { | 488 | { |
| 223 | case 'V': | 489 | case 'V': |
| 224 | { | 490 | { |
| 225 | verb v; | 491 | verb_t v; |
| 226 | v.infinitive = word; | 492 | v.infinitive = word; |
| 227 | if (forms.size() == 4) | 493 | if (forms.size() == 4) |
| 228 | { | 494 | { |
| @@ -258,7 +524,7 @@ int main(int argc, char** argv) | |||
| 258 | 524 | ||
| 259 | case 'A': | 525 | case 'A': |
| 260 | { | 526 | { |
| 261 | adjective adj; | 527 | adjective_t adj; |
| 262 | adj.base = word; | 528 | adj.base = word; |
| 263 | if (forms.size() == 2) | 529 | if (forms.size() == 2) |
| 264 | { | 530 | { |
| @@ -276,7 +542,7 @@ int main(int argc, char** argv) | |||
| 276 | 542 | ||
| 277 | case 'N': | 543 | case 'N': |
| 278 | { | 544 | { |
| 279 | noun n; | 545 | noun_t n; |
| 280 | n.singular = word; | 546 | n.singular = word; |
| 281 | if (forms.size() == 1) | 547 | if (forms.size() == 1) |
| 282 | { | 548 | { |
| @@ -388,6 +654,85 @@ int main(int argc, char** argv) | |||
| 388 | sqlite3_finalize(schmstmt); | 654 | sqlite3_finalize(schmstmt); |
| 389 | } | 655 | } |
| 390 | 656 | ||
| 657 | std::cout << "Writing prepositions..." << std::endl; | ||
| 658 | std::ifstream prepfile("prepositions.txt"); | ||
| 659 | if (!prepfile.is_open()) | ||
| 660 | { | ||
| 661 | std::cout << "Could not find prepositions file" << std::endl; | ||
| 662 | print_usage(); | ||
| 663 | } | ||
| 664 | |||
| 665 | for (;;) | ||
| 666 | { | ||
| 667 | std::string line; | ||
| 668 | if (!getline(prepfile, line)) | ||
| 669 | { | ||
| 670 | break; | ||
| 671 | } | ||
| 672 | |||
| 673 | if (line.back() == '\r') | ||
| 674 | { | ||
| 675 | line.pop_back(); | ||
| 676 | } | ||
| 677 | |||
| 678 | std::regex relation("^([^:]+): (.+)"); | ||
| 679 | std::smatch relation_data; | ||
| 680 | std::regex_search(line, relation_data, relation); | ||
| 681 | std::string prep = relation_data[1]; | ||
| 682 | std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", "); | ||
| 683 | |||
| 684 | std::string query("INSERT INTO prepositions (form) VALUES (?)"); | ||
| 685 | sqlite3_stmt* ppstmt; | ||
| 686 | |||
| 687 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 688 | { | ||
| 689 | db_error(ppdb, query); | ||
| 690 | } | ||
| 691 | |||
| 692 | sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_STATIC); | ||
| 693 | |||
| 694 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 695 | { | ||
| 696 | db_error(ppdb, query); | ||
| 697 | } | ||
| 698 | |||
| 699 | sqlite3_finalize(ppstmt); | ||
| 700 | |||
| 701 | query = "SELECT last_insert_rowid()"; | ||
| 702 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 703 | { | ||
| 704 | db_error(ppdb, query); | ||
| 705 | } | ||
| 706 | |||
| 707 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
| 708 | { | ||
| 709 | db_error(ppdb, query); | ||
| 710 | } | ||
| 711 | |||
| 712 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
| 713 | sqlite3_finalize(ppstmt); | ||
| 714 | |||
| 715 | for (auto group : groups) | ||
| 716 | { | ||
| 717 | query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)"; | ||
| 718 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 719 | { | ||
| 720 | db_error(ppdb, query); | ||
| 721 | } | ||
| 722 | |||
| 723 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
| 724 | sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_STATIC); | ||
| 725 | |||
| 726 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 727 | { | ||
| 728 | db_error(ppdb, query); | ||
| 729 | } | ||
| 730 | |||
| 731 | sqlite3_finalize(ppstmt); | ||
| 732 | } | ||
| 733 | } | ||
| 734 | |||
| 735 | |||
| 391 | { | 736 | { |
| 392 | progress ppgs("Writing verbs...", verbs.size()); | 737 | progress ppgs("Writing verbs...", verbs.size()); |
| 393 | for (auto& mapping : verbs) | 738 | for (auto& mapping : verbs) |
| @@ -431,6 +776,8 @@ int main(int argc, char** argv) | |||
| 431 | 776 | ||
| 432 | sqlite3_finalize(ppstmt); | 777 | sqlite3_finalize(ppstmt); |
| 433 | 778 | ||
| 779 | mapping.second.id = rowid; | ||
| 780 | |||
| 434 | for (auto pronunciation : pronunciations[canonical]) | 781 | for (auto pronunciation : pronunciations[canonical]) |
| 435 | { | 782 | { |
| 436 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation) VALUES (?, ?)"; | 783 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation) VALUES (?, ?)"; |
| @@ -455,6 +802,160 @@ int main(int argc, char** argv) | |||
| 455 | } | 802 | } |
| 456 | } | 803 | } |
| 457 | 804 | ||
| 805 | { | ||
| 806 | progress ppgs("Writing verb frames...", groups.size()); | ||
| 807 | for (auto& mapping : groups) | ||
| 808 | { | ||
| 809 | std::list<json> roledatal; | ||
| 810 | std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { | ||
| 811 | json role; | ||
| 812 | role["type"] = r.first; | ||
| 813 | role["selrestrs"] = export_selrestrs(r.second); | ||
| 814 | |||
| 815 | return role; | ||
| 816 | }); | ||
| 817 | |||
| 818 | json roledata(roledatal); | ||
| 819 | std::string rdm = roledata.dump(); | ||
| 820 | |||
| 821 | sqlite3_stmt* ppstmt; | ||
| 822 | std::string query("INSERT INTO groups (data) VALUES (?)"); | ||
| 823 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 824 | { | ||
| 825 | db_error(ppdb, query); | ||
| 826 | } | ||
| 827 | |||
| 828 | sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_STATIC); | ||
| 829 | |||
| 830 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 831 | { | ||
| 832 | db_error(ppdb, query); | ||
| 833 | } | ||
| 834 | |||
| 835 | sqlite3_finalize(ppstmt); | ||
| 836 | |||
| 837 | query = "SELECT last_insert_rowid()"; | ||
| 838 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 839 | { | ||
| 840 | db_error(ppdb, query); | ||
| 841 | } | ||
| 842 | |||
| 843 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
| 844 | { | ||
| 845 | db_error(ppdb, query); | ||
| 846 | } | ||
| 847 | |||
| 848 | int gid = sqlite3_column_int(ppstmt, 0); | ||
| 849 | sqlite3_finalize(ppstmt); | ||
| 850 | |||
| 851 | for (auto frame : mapping.second.frames) | ||
| 852 | { | ||
| 853 | std::list<json> fdatap; | ||
| 854 | std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) { | ||
| 855 | json part; | ||
| 856 | |||
| 857 | switch (fp.type) | ||
| 858 | { | ||
| 859 | case framepart_t::type_t::np: | ||
| 860 | { | ||
| 861 | part["type"] = "np"; | ||
| 862 | part["role"] = fp.role; | ||
| 863 | part["selrestrs"] = export_selrestrs(fp.selrestrs); | ||
| 864 | part["synrestrs"] = fp.synrestrs; | ||
| 865 | |||
| 866 | break; | ||
| 867 | } | ||
| 868 | |||
| 869 | case framepart_t::type_t::pp: | ||
| 870 | { | ||
| 871 | part["type"] = "pp"; | ||
| 872 | part["values"] = fp.choices; | ||
| 873 | part["preprestrs"] = fp.preprestrs; | ||
| 874 | |||
| 875 | break; | ||
| 876 | } | ||
| 877 | |||
| 878 | case framepart_t::type_t::v: | ||
| 879 | { | ||
| 880 | part["type"] = "v"; | ||
| 881 | |||
| 882 | break; | ||
| 883 | } | ||
| 884 | |||
| 885 | case framepart_t::type_t::adj: | ||
| 886 | { | ||
| 887 | part["type"] = "adj"; | ||
| 888 | |||
| 889 | break; | ||
| 890 | } | ||
| 891 | |||
| 892 | case framepart_t::type_t::adv: | ||
| 893 | { | ||
| 894 | part["type"] = "adv"; | ||
| 895 | |||
| 896 | break; | ||
| 897 | } | ||
| 898 | |||
| 899 | case framepart_t::type_t::lex: | ||
| 900 | { | ||
| 901 | part["type"] = "lex"; | ||
| 902 | part["value"] = fp.lexval; | ||
| 903 | |||
| 904 | break; | ||
| 905 | } | ||
| 906 | } | ||
| 907 | |||
| 908 | return part; | ||
| 909 | }); | ||
| 910 | |||
| 911 | json fdata(fdatap); | ||
| 912 | std::string marshall = fdata.dump(); | ||
| 913 | |||
| 914 | query = "INSERT INTO frames (group_id, data) VALUES (?, ?)"; | ||
| 915 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 916 | { | ||
| 917 | db_error(ppdb, query); | ||
| 918 | } | ||
| 919 | |||
| 920 | sqlite3_bind_int(ppstmt, 1, gid); | ||
| 921 | sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_STATIC); | ||
| 922 | |||
| 923 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 924 | { | ||
| 925 | db_error(ppdb, query); | ||
| 926 | } | ||
| 927 | |||
| 928 | sqlite3_finalize(ppstmt); | ||
| 929 | } | ||
| 930 | |||
| 931 | for (auto member : mapping.second.members) | ||
| 932 | { | ||
| 933 | if (verbs.count(member) == 1) | ||
| 934 | { | ||
| 935 | auto& v = verbs[member]; | ||
| 936 | |||
| 937 | query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)"; | ||
| 938 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 939 | { | ||
| 940 | db_error(ppdb, query); | ||
| 941 | } | ||
| 942 | |||
| 943 | sqlite3_bind_int(ppstmt, 1, v.id); | ||
| 944 | sqlite3_bind_int(ppstmt, 2, gid); | ||
| 945 | |||
| 946 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 947 | { | ||
| 948 | db_error(ppdb, query); | ||
| 949 | } | ||
| 950 | |||
| 951 | sqlite3_finalize(ppstmt); | ||
| 952 | } | ||
| 953 | } | ||
| 954 | |||
| 955 | ppgs.update(); | ||
| 956 | } | ||
| 957 | } | ||
| 958 | |||
| 458 | // Get nouns/adjectives/adverbs from WordNet | 959 | // Get nouns/adjectives/adverbs from WordNet |
| 459 | // Useful relations: | 960 | // Useful relations: |
| 460 | // - s: master list | 961 | // - s: master list |
