From 6746da6edd7d9d50efe374eabbb79a3cac882d81 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Mon, 16 Jan 2017 18:02:50 -0500 Subject: Started structural rewrite The new object structure was designed to build on the existing WordNet structure, while also adding in all of the data that we get from other sources. More information about this can be found on the project wiki. The generator has already been completely rewritten to generate a datafile that uses the new structure. In addition, a number of indexes are created, which does double the size of the datafile, but also allows for much faster lookups. Finally, the new generator is written modularly and is a lot more readable than the old one. The verbly interface to the new object structure has mostly been completed, but has not been tested fully. There is a completely new search API which utilizes a lot of operator overloading; documentation on how to use it should go up at some point. Token processing and verb frames are currently unimplemented. Source for these has been left in the repository for now. 
--- generator/CMakeLists.txt | 6 +- generator/database.cpp | 173 +++ generator/database.h | 73 + generator/field.cpp | 193 +++ generator/field.h | 76 + generator/form.cpp | 53 + generator/form.h | 71 + generator/frame.cpp | 83 ++ generator/frame.h | 59 + generator/generator.cpp | 3227 ++++++++++++++++--------------------------- generator/generator.h | 151 ++ generator/group.cpp | 119 ++ generator/group.h | 80 ++ generator/lemma.cpp | 65 + generator/lemma.h | 58 + generator/main.cpp | 40 + generator/notion.cpp | 85 ++ generator/notion.h | 91 ++ generator/part.cpp | 336 +++++ generator/part.h | 114 ++ generator/progress.h | 78 +- generator/pronunciation.cpp | 87 ++ generator/pronunciation.h | 82 ++ generator/role.h | 35 + generator/schema.sql | 352 ++--- generator/selrestr.cpp | 288 ++++ generator/selrestr.h | 88 ++ generator/word.cpp | 77 ++ generator/word.h | 110 ++ 29 files changed, 4059 insertions(+), 2291 deletions(-) create mode 100644 generator/database.cpp create mode 100644 generator/database.h create mode 100644 generator/field.cpp create mode 100644 generator/field.h create mode 100644 generator/form.cpp create mode 100644 generator/form.h create mode 100644 generator/frame.cpp create mode 100644 generator/frame.h create mode 100644 generator/generator.h create mode 100644 generator/group.cpp create mode 100644 generator/group.h create mode 100644 generator/lemma.cpp create mode 100644 generator/lemma.h create mode 100644 generator/main.cpp create mode 100644 generator/notion.cpp create mode 100644 generator/notion.h create mode 100644 generator/part.cpp create mode 100644 generator/part.h create mode 100644 generator/pronunciation.cpp create mode 100644 generator/pronunciation.h create mode 100644 generator/role.h create mode 100644 generator/selrestr.cpp create mode 100644 generator/selrestr.h create mode 100644 generator/word.cpp create mode 100644 generator/word.h (limited to 'generator') diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt 
index 552526d..4f78eb8 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt @@ -1,12 +1,12 @@ -cmake_minimum_required (VERSION 2.6) +cmake_minimum_required (VERSION 3.1) project (generator) find_package(PkgConfig) pkg_check_modules(sqlite3 sqlite3 REQUIRED) find_package(libxml2 REQUIRED) -include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json/src) -add_executable(generator generator.cpp) +include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json) +add_executable(generator notion.cpp word.cpp lemma.cpp form.cpp pronunciation.cpp group.cpp frame.cpp part.cpp selrestr.cpp database.cpp field.cpp generator.cpp main.cpp) set_property(TARGET generator PROPERTY CXX_STANDARD 11) set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) diff --git a/generator/database.cpp b/generator/database.cpp new file mode 100644 index 0000000..c7e4cfa --- /dev/null +++ b/generator/database.cpp @@ -0,0 +1,173 @@ +#include "database.h" +#include +#include +#include +#include +#include +#include +#include "field.h" +#include "../lib/util.h" + +namespace verbly { + namespace generator { + + sqlite3_error::sqlite3_error( + const std::string& what, + const std::string& db_err) : + what_(what + " (" + db_err + ")"), + db_err_(db_err) + { + } + + const char* sqlite3_error::what() const noexcept + { + return what_.c_str(); + } + + const char* sqlite3_error::db_err() const noexcept + { + return db_err_.c_str(); + } + + database::database(std::string path) + { + // If there is already a file at this path, overwrite it. + if (std::ifstream(path)) + { + if (std::remove(path.c_str())) + { + throw std::logic_error("Could not overwrite file at path"); + } + } + + if (sqlite3_open_v2(path.c_str(), &ppdb_, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) + { + // We still have to free the resources allocated. 
In the event that + // allocation failed, ppdb will be null and sqlite3_close_v2 will just + // ignore it. + std::string errmsg(sqlite3_errmsg(ppdb_)); + sqlite3_close_v2(ppdb_); + + throw sqlite3_error("Could not create output datafile", errmsg); + } + } + + database::database(database&& other) : database() + { + swap(*this, other); + } + + database& database::operator=(database&& other) + { + swap(*this, other); + + return *this; + } + + void swap(database& first, database& second) + { + std::swap(first.ppdb_, second.ppdb_); + } + + database::~database() + { + sqlite3_close_v2(ppdb_); + } + + void database::runQuery(std::string query) + { + // This can only happen when doing bad things with move semantics. + assert(ppdb_ != nullptr); + + sqlite3_stmt* ppstmt; + + if (sqlite3_prepare_v2(ppdb_, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + { + throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); + } + + int result = sqlite3_step(ppstmt); + sqlite3_finalize(ppstmt); + + if (result != SQLITE_DONE) + { + throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); + } + } + + void database::insertIntoTable(std::string table, std::list fields) + { + // This can only happen when doing bad things with move semantics. + assert(ppdb_ != nullptr); + + // This shouldn't happen. 
+ assert(!fields.empty()); + + std::list fieldNames; + std::list qs; + for (field& f : fields) + { + fieldNames.push_back(f.getName()); + qs.push_back("?"); + } + + std::ostringstream query; + query << "INSERT INTO "; + query << table; + query << " ("; + query << implode(std::begin(fieldNames), std::end(fieldNames), ", "); + query << ") VALUES ("; + query << implode(std::begin(qs), std::end(qs), ", "); + query << ")"; + + std::string query_str = query.str(); + + sqlite3_stmt* ppstmt; + + if (sqlite3_prepare_v2(ppdb_, query_str.c_str(), query_str.length(), &ppstmt, NULL) != SQLITE_OK) + { + throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); + } + + int i = 1; + for (field& f : fields) + { + switch (f.getType()) + { + case field::type::integer: + { + sqlite3_bind_int(ppstmt, i, f.getInteger()); + + break; + } + + case field::type::string: + { + sqlite3_bind_text(ppstmt, i, f.getString().c_str(), f.getString().length(), SQLITE_TRANSIENT); + + break; + } + + case field::type::invalid: + { + // Fields can only be invalid when doing bad things with move semantics. 
+ assert(false); + + break; + } + } + + i++; + } + + int result = sqlite3_step(ppstmt); + sqlite3_finalize(ppstmt); + + if (result != SQLITE_DONE) + { + throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); + } + } + + }; +}; diff --git a/generator/database.h b/generator/database.h new file mode 100644 index 0000000..15cdff5 --- /dev/null +++ b/generator/database.h @@ -0,0 +1,73 @@ +#ifndef DATABASE_H_0B0A47D2 +#define DATABASE_H_0B0A47D2 + +#include +#include +#include + +struct sqlite3; + +namespace verbly { + namespace generator { + + class field; + + class sqlite3_error : public std::exception { + public: + + sqlite3_error(const std::string& what, const std::string& db_err); + + const char* what() const noexcept override; + const char* db_err() const noexcept; + + private: + std::string what_; + std::string db_err_; + + }; + + class database { + public: + + // Constructor + + explicit database(std::string path); + + // Disable copying + + database(const database& other) = delete; + database& operator=(const database& other) = delete; + + // Move constructor and move assignment + + database(database&& other); + database& operator=(database&& other); + + // Swap + + friend void swap(database& first, database& second); + + // Destructor + + ~database(); + + // Actions + + void runQuery(std::string query); + + void insertIntoTable(std::string table, std::list fields); + + private: + + database() + { + } + + sqlite3* ppdb_ = nullptr; + + }; + + }; +}; + +#endif /* end of include guard: DATABASE_H_0B0A47D2 */ diff --git a/generator/field.cpp b/generator/field.cpp new file mode 100644 index 0000000..84b2f91 --- /dev/null +++ b/generator/field.cpp @@ -0,0 +1,193 @@ +#include "field.h" +#include +#include + +namespace verbly { + namespace generator { + + field::field(const field& other) + { + type_ = other.type_; + name_ = other.name_; + + switch (type_) + { + case type::integer: + { + integer_ = other.integer_; + + break; + } + + case type::string: + 
{ + new(&string_) std::string(other.string_); + + break; + } + + case type::invalid: + { + break; + } + } + } + + field::field(field&& other) : field() + { + swap(*this, other); + } + + field& field::operator=(field other) + { + swap(*this, other); + + return *this; + } + + void swap(field& first, field& second) + { + using type = field::type; + + type tempType = first.type_; + std::string tempName = std::move(first.name_); + int tempInteger; + std::string tempString; + + switch (first.type_) + { + case type::integer: + { + tempInteger = first.integer_; + + break; + } + + case type::string: + { + tempString = std::move(tempString); + + break; + } + + case type::invalid: + { + break; + } + } + + first.~field(); + + first.type_ = second.type_; + first.name_ = std::move(second.name_); + + switch (second.type_) + { + case type::integer: + { + first.integer_ = second.integer_; + + break; + } + + case type::string: + { + new(&first.string_) std::string(std::move(second.string_)); + + break; + } + + case type::invalid: + { + break; + } + } + + second.~field(); + + second.type_ = tempType; + second.name_ = std::move(tempName); + + switch (tempType) + { + case type::integer: + { + second.integer_ = tempInteger; + + break; + } + + case type::string: + { + new(&second.string_) std::string(std::move(tempString)); + + break; + } + + case type::invalid: + { + break; + } + } + } + + field::~field() + { + switch (type_) + { + case type::string: + { + using string_type = std::string; + string_.~string_type(); + + break; + } + + case type::integer: + case type::invalid: + { + break; + } + } + } + + field::field( + std::string name, + int arg) : + type_(type::integer), + name_(name), + integer_(arg) + { + } + + int field::getInteger() const + { + if (type_ != type::integer) + { + throw std::domain_error("field::getInteger called on non-integer field"); + } + + return integer_; + } + + field::field( + std::string name, + std::string arg) : + type_(type::string), + name_(name) + { + 
new(&string_) std::string(arg); + } + + std::string field::getString() const + { + if (type_ != type::string) + { + throw std::domain_error("field::getString called on non-string field"); + } + + return string_; + } + + }; +}; diff --git a/generator/field.h b/generator/field.h new file mode 100644 index 0000000..1fbabfc --- /dev/null +++ b/generator/field.h @@ -0,0 +1,76 @@ +#ifndef BINDING_H_CAE0B18E +#define BINDING_H_CAE0B18E + +#include + +namespace verbly { + namespace generator { + + class field { + public: + enum class type { + invalid, + integer, + string + }; + + // Copy and move constructors + + field(const field& other); + field(field&& other); + + // Assignment + + field& operator=(field other); + + // Swap + + friend void swap(field& first, field& second); + + // Destructor + + ~field(); + + // Generic accessors + + type getType() const + { + return type_; + } + + std::string getName() const + { + return name_; + } + + // Integer + + field(std::string name, int arg); + + int getInteger() const; + + // String + + field(std::string name, std::string arg); + + std::string getString() const; + + private: + + field() + { + } + + union { + int integer_; + std::string string_; + }; + + type type_ = type::invalid; + std::string name_; + }; + + }; +}; + +#endif /* end of include guard: BINDING_H_CAE0B18E */ diff --git a/generator/form.cpp b/generator/form.cpp new file mode 100644 index 0000000..6be9d47 --- /dev/null +++ b/generator/form.cpp @@ -0,0 +1,53 @@ +#include "form.h" +#include +#include +#include "database.h" +#include "field.h" +#include "pronunciation.h" + +namespace verbly { + namespace generator { + + int form::nextId_ = 0; + + form::form(std::string text) : + id_(nextId_++), + text_(text), + complexity_(std::count(std::begin(text), std::end(text), ' ') + 1), + proper_(std::any_of(std::begin(text), std::end(text), std::isupper)) + { + } + + void form::addPronunciation(const pronunciation& p) + { + pronunciations_.insert(&p); + } + + database& 
operator<<(database& db, const form& arg) + { + // Serialize the form first. + { + std::list fields; + fields.emplace_back("form_id", arg.getId()); + fields.emplace_back("form", arg.getText()); + fields.emplace_back("complexity", arg.getComplexity()); + fields.emplace_back("proper", arg.isProper()); + + db.insertIntoTable("forms", std::move(fields)); + } + + // Then, serialize the form/pronunciation relationship. + for (const pronunciation* p : arg.getPronunciations()) + { + std::list fields; + fields.emplace_back("form_id", arg.getId()); + fields.emplace_back("pronunciation_id", p->getId()); + + db.insertIntoTable("forms_pronunciations", std::move(fields)); + } + + return db; + } + + }; +}; diff --git a/generator/form.h b/generator/form.h new file mode 100644 index 0000000..5576035 --- /dev/null +++ b/generator/form.h @@ -0,0 +1,71 @@ +#ifndef FORM_H_7EFBC970 +#define FORM_H_7EFBC970 + +#include +#include + +namespace verbly { + namespace generator { + + class pronunciation; + class database; + + class form { + public: + + // Constructor + + explicit form(std::string text); + + // Mutators + + void addPronunciation(const pronunciation& p); + + // Accessors + + int getId() const + { + return id_; + } + + std::string getText() const + { + return text_; + } + + int getComplexity() const + { + return complexity_; + } + + bool isProper() const + { + return proper_; + } + + std::set getPronunciations() const + { + return pronunciations_; + } + + private: + + static int nextId_; + + const int id_; + const std::string text_; + const int complexity_; + const bool proper_; + + std::set pronunciations_; + + }; + + // Serializer + + database& operator<<(database& db, const form& arg); + + }; +}; + +#endif /* end of include guard: FORM_H_7EFBC970 */ diff --git a/generator/frame.cpp b/generator/frame.cpp new file mode 100644 index 0000000..9f0653f --- /dev/null +++ b/generator/frame.cpp @@ -0,0 +1,83 @@ +#include "frame.h" +#include "database.h" +#include "field.h" + +namespace 
verbly { + namespace generator { + + int frame::nextId_ = 0; + + frame::frame() : id_(nextId_++) + { + } + + void frame::push_back(part fp) + { + parts_.push_back(std::move(fp)); + } + + database& operator<<(database& db, const frame& arg) + { + std::list fields; + fields.emplace_back("frame_id", arg.getId()); + + nlohmann::json jsonParts; + for (const part& p : arg) + { + nlohmann::json jsonPart; + jsonPart["type"] = static_cast(p.getType()); + + switch (p.getType()) + { + case part::type::noun_phrase: + { + jsonPart["role"] = p.getNounRole(); + jsonPart["selrestrs"] = p.getNounSelrestrs().toJson(); + jsonPart["synrestrs"] = p.getNounSynrestrs(); + + break; + } + + case part::type::preposition: + { + jsonPart["choices"] = p.getPrepositionChoices(); + jsonPart["literal"] = p.isPrepositionLiteral(); + + break; + } + + case part::type::literal: + { + jsonPart["value"] = p.getLiteralValue(); + + break; + } + + case part::type::verb: + case part::type::adjective: + case part::type::adverb: + { + break; + } + + case part::type::invalid: + { + // Invalid parts should not be serialized. 
+ assert(false); + + break; + } + } + + jsonParts.emplace_back(std::move(jsonPart)); + } + + fields.emplace_back("data", jsonParts.dump()); + + db.insertIntoTable("frames", std::move(fields)); + + return db; + } + + }; +}; diff --git a/generator/frame.h b/generator/frame.h new file mode 100644 index 0000000..411ce6c --- /dev/null +++ b/generator/frame.h @@ -0,0 +1,59 @@ +#ifndef FRAME_H_26770FF1 +#define FRAME_H_26770FF1 + +#include +#include "part.h" + +namespace verbly { + namespace generator { + + class database; + + class frame { + public: + + // Aliases + + using const_iterator = std::list::const_iterator; + + // Constructor + + frame(); + + // Mutators + + void push_back(part fp); + + // Accessors + + int getId() const + { + return id_; + } + + const_iterator begin() const + { + return std::begin(parts_); + } + + const_iterator end() const + { + return std::end(parts_); + } + + private: + + static int nextId_; + + const int id_; + + std::list parts_; + + }; + + database& operator<<(database& db, const frame& arg); + + }; +}; + +#endif /* end of include guard: FRAME_H_26770FF1 */ diff --git a/generator/generator.cpp b/generator/generator.cpp index 6a16467..d88cb31 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -1,2320 +1,1477 @@ -#include +#include "generator.h" +#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "progress.h" -#include "../lib/util.h" - -using json = nlohmann::json; - -struct verb_t { - std::string infinitive; - std::string past_tense; - std::string past_participle; - std::string ing_form; - std::string s_form; - int id; -}; - -struct adjective_t { - std::string base; - std::string comparative; - std::string superlative; -}; - -struct noun_t { - std::string singular; - std::string plural; -}; - -struct selrestr_t { - enum class type_t { - singleton, - andlogic, - orlogic, - empty - }; - type_t type; - std::string 
restriction; - bool pos; - std::list subordinates; -}; - -struct framepart_t { - enum class type_t { - np, - v, - pp, - adj, - adv, - lex - }; - type_t type; - std::string role; - selrestr_t selrestrs; - std::set preprestrs; - std::set synrestrs; - std::list choices; - std::string lexval; -}; - -struct group_t { - std::string id; - std::string parent; - std::set members; - std::map roles; - std::list> frames; -}; - -struct pronunciation_t { - std::string phonemes; - std::string prerhyme; - std::string rhyme; - int syllables = 0; - std::string stress; - - bool operator<(const pronunciation_t& other) const - { - return phonemes < other.phonemes; - } -}; - -std::map groups; -std::map verbs; -std::map adjectives; -std::map nouns; -std::map> wn; -std::map images; -std::map> pronunciations; - -void print_usage() -{ - std::cout << "Verbly Datafile Generator" << std::endl; - std::cout << "-------------------------" << std::endl; - std::cout << "Requires exactly six arguments." << std::endl; - std::cout << "1. The path to a VerbNet data directory." << std::endl; - std::cout << "2. The path to an AGID infl.txt file." << std::endl; - std::cout << "3. The path to a WordNet prolog data directory." << std::endl; - std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl; - std::cout << "5. The path to an ImageNet urls.txt file." << std::endl; - std::cout << "6. Datafile output path." 
<< std::endl; - - exit(1); -} - -void db_error(sqlite3* ppdb, std::string query) -{ - std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; - std::cout << query << std::endl; - sqlite3_close_v2(ppdb); - print_usage(); -} - -json export_selrestrs(selrestr_t r) -{ - if (r.type == selrestr_t::type_t::empty) - { - return {}; - } else if (r.type == selrestr_t::type_t::singleton) - { - json result; - result["type"] = r.restriction; - result["pos"] = r.pos; - return result; - } else { - json result; - if (r.type == selrestr_t::type_t::andlogic) - { - result["logic"] = "and"; - } else { - result["logic"] = "or"; - } - - std::list outlist; - std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs); - result["children"] = outlist; - - return result; - } -} - -selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) -{ - selrestr_t r; - xmlChar* key; - - if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) - { - if (xmlChildElementCount(top) == 0) - { - r.type = selrestr_t::type_t::empty; - } else if (xmlChildElementCount(top) == 1) - { - r = parse_selrestrs(xmlFirstElementChild(top), filename); - } else { - r.type = selrestr_t::type_t::andlogic; - - if (xmlHasProp(top, (const xmlChar*) "logic")) - { - key = xmlGetProp(top, (const xmlChar*) "logic"); - if (!xmlStrcmp(key, (const xmlChar*) "or")) - { - r.type = selrestr_t::type_t::orlogic; - } - xmlFree(key); - } - - for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) - { - if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) - { - r.subordinates.push_back(parse_selrestrs(selrestr, filename)); - } - } - } - } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) - { - r.type = selrestr_t::type_t::singleton; - - key = xmlGetProp(top, (xmlChar*) "Value"); - r.pos = (std::string((const char*)key) == "+"); - 
xmlFree(key); - - key = xmlGetProp(top, (xmlChar*) "type"); - r.restriction = (const char*) key; - xmlFree(key); - } else { - // Invalid - std::cout << "Bad VerbNet file format: " << filename << std::endl; - print_usage(); - } - - return r; -} - -group_t& parse_group(xmlNodePtr top, std::string filename) -{ - xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); - if (key == 0) - { - std::cout << "Bad VerbNet file format: " << filename << std::endl; - print_usage(); - } - std::string vnid = (const char*)key; - vnid = vnid.substr(vnid.find_first_of("-")+1); - xmlFree(key); - - group_t g; - g.id = vnid; - - for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) - { - if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES")) - { - for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) - { - if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) - { - auto& sg = parse_group(subclass, filename); - sg.parent = vnid; - - for (auto member : sg.members) - { - g.members.insert(member); - } - - // The schema requires that subclasses appear after role definitions, so we can do this now - for (auto role : g.roles) - { - if (sg.roles.count(role.first) == 0) - { - sg.roles[role.first] = role.second; - } - } - } - } - } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) - { - for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) - { - if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) - { - key = xmlGetProp(member, (xmlChar*) "name"); - g.members.insert((const char*)key); - xmlFree(key); - } - } - } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) - { - for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) - { - if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) - { - selrestr_t r; - r.type = selrestr_t::type_t::empty; - - key = xmlGetProp(role, (const xmlChar*) "type"); - std::string type = (const char*)key; - 
xmlFree(key); - - for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) - { - if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS")) - { - r = parse_selrestrs(rolenode, filename); - } - } - - g.roles[type] = r; - } - } - } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) - { - for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) - { - if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) - { - std::list f; - - for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) - { - if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX")) - { - for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) - { - framepart_t fp; - - if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP")) - { - fp.type = framepart_t::type_t::np; - - key = xmlGetProp(syntaxnode, (xmlChar*) "value"); - fp.role = (const char*)key; - xmlFree(key); - - fp.selrestrs.type = selrestr_t::type_t::empty; - - for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) - { - if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS")) - { - for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) - { - if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR")) - { - key = xmlGetProp(synrestr, (xmlChar*) "type"); - fp.synrestrs.insert(std::string((const char*)key)); - xmlFree(key); - } - } - } - - if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) - { - fp.selrestrs = parse_selrestrs(npnode, filename); - } - } - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB")) - { - fp.type = framepart_t::type_t::v; - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP")) - { - fp.type = framepart_t::type_t::pp; - - if (xmlHasProp(syntaxnode, (xmlChar*) "value")) - { - key = xmlGetProp(syntaxnode, (xmlChar*) "value"); - std::string 
choices = (const char*)key; - xmlFree(key); - - fp.choices = verbly::split>(choices, " "); - } - - for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) - { - if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) - { - for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) - { - if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR")) - { - key = xmlGetProp(synrestr, (xmlChar*) "type"); - fp.preprestrs.insert(std::string((const char*)key)); - xmlFree(key); - } - } - } - } - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ")) - { - fp.type = framepart_t::type_t::adj; - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV")) - { - fp.type = framepart_t::type_t::adv; - } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX")) - { - fp.type = framepart_t::type_t::lex; - - key = xmlGetProp(syntaxnode, (xmlChar*) "value"); - fp.lexval = (const char*)key; - xmlFree(key); - } else { - continue; - } - - f.push_back(fp); - } - - g.frames.push_back(f); - } - } - } - } - } - } - - groups[vnid] = g; - - return groups[vnid]; -} - -int main(int argc, char** argv) -{ - if (argc != 7) - { - print_usage(); - } - - // VerbNet data - std::cout << "Reading verb frames..." << std::endl; - - DIR* dir; - if ((dir = opendir(argv[1])) == nullptr) - { - std::cout << "Invalid VerbNet data directory." 
<< std::endl; - - print_usage(); - } - - struct dirent* ent; - while ((ent = readdir(dir)) != nullptr) - { - std::string filename(argv[1]); - if (filename.back() != '/') - { - filename += '/'; - } - - filename += ent->d_name; - //std::cout << ent->d_name << std::endl; - - if (filename.rfind(".xml") != filename.size() - 4) - { - continue; - } - - xmlDocPtr doc = xmlParseFile(filename.c_str()); - if (doc == nullptr) - { - std::cout << "Error opening " << filename << std::endl; - print_usage(); - } - - xmlNodePtr top = xmlDocGetRootElement(doc); - if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) - { - std::cout << "Bad VerbNet file format: " << filename << std::endl; - print_usage(); - } - - parse_group(top, filename); - } - - closedir(dir); - - // Get verbs from AGID - std::cout << "Reading inflections..." << std::endl; - - std::ifstream agidfile(argv[2]); - if (!agidfile.is_open()) - { - std::cout << "Could not open AGID file: " << argv[2] << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(agidfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - int divider = line.find_first_of(" "); - std::string word = line.substr(0, divider); - line = line.substr(divider+1); - char type = line[0]; - - if (line[1] == '?') - { - line.erase(0, 4); - } else { - line.erase(0, 3); - } - - std::vector forms; - while (!line.empty()) - { - std::string inflection; - if ((divider = line.find(" | ")) != std::string::npos) - { - inflection = line.substr(0, divider); - line = line.substr(divider + 3); - } else { - inflection = line; - line = ""; - } - - if ((divider = inflection.find_first_of(",?")) != std::string::npos) - { - inflection = inflection.substr(0, divider); - } - - forms.push_back(inflection); - } - - switch (type) - { - case 'V': - { - verb_t v; - v.infinitive = word; - if (forms.size() == 4) - { - v.past_tense = forms[0]; - v.past_participle = forms[1]; - v.ing_form = forms[2]; - 
v.s_form = forms[3]; - } else if (forms.size() == 3) - { - v.past_tense = forms[0]; - v.past_participle = forms[0]; - v.ing_form = forms[1]; - v.s_form = forms[2]; - } else if (forms.size() == 8) - { - // As of AGID 2014.08.11, this is only "to be" - v.past_tense = forms[0]; - v.past_participle = forms[2]; - v.ing_form = forms[3]; - v.s_form = forms[4]; - } else { - // Words that don't fit the cases above as of AGID 2014.08.11: - // - may and shall do not conjugate the way we want them to - // - methinks only has a past tense and is an outlier - // - wit has five forms, and is archaic/obscure enough that we can ignore it for now - std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - verbs[word] = v; - - break; - } - - case 'A': - { - adjective_t adj; - adj.base = word; - if (forms.size() == 2) - { - adj.comparative = forms[0]; - adj.superlative = forms[1]; - } else { - // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" - std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - adjectives[word] = adj; - - break; - } - - case 'N': - { - noun_t n; - n.singular = word; - if (forms.size() == 1) - { - n.plural = forms[0]; - } else { - // As of AGID 2014.08.11, this is non-existent. - std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; - } - - nouns[word] = n; - - break; - } - } - } - - // Pronounciations - std::cout << "Reading pronunciations..." << std::endl; - - std::ifstream pronfile(argv[4]); - if (!pronfile.is_open()) - { - std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(pronfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? 
([A-Z 0-9]+)"); - std::smatch phoneme_data; - if (std::regex_search(line, phoneme_data, phoneme)) - { - std::string canonical(phoneme_data[1]); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - - std::string phonemes = phoneme_data[2]; - auto phoneme_set = verbly::split>(phonemes, " "); - auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) { - return phoneme.find("1") != std::string::npos; - }); - - pronunciation_t p; - p.phonemes = phonemes; - - // Rhyme detection - if (phemstrt != std::end(phoneme_set)) - { - std::stringstream rhymer; - for (auto it = phemstrt; it != std::end(phoneme_set); it++) - { - std::string naked; - std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) { - return isdigit(ch); - }); - - if (it != phemstrt) - { - rhymer << " "; - } - - rhymer << naked; - } - - p.rhyme = rhymer.str(); - - if (phemstrt != std::begin(phoneme_set)) - { - phemstrt--; - p.prerhyme = *phemstrt; - } else { - p.prerhyme = ""; - } - } else { - p.prerhyme = ""; - p.rhyme = ""; - } - - // Syllable/stress - for (auto phm : phoneme_set) - { - if (isdigit(phm.back())) - { - // It's a vowel! - p.syllables++; - - if (phm.back() == '1') - { - p.stress.push_back('1'); - } else { - p.stress.push_back('0'); - } - } - } - - pronunciations[canonical].insert(p); - } - } - - // Images - std::cout << "Reading images..." << std::endl; - - std::ifstream imagefile(argv[5]); - if (!imagefile.is_open()) - { - std::cout << "Could not open ImageNet file: " << argv[5] << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(imagefile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - std::string wnid_s = line.substr(1, 8); - int wnid = stoi(wnid_s) + 100000000; - images[wnid]++; - } - - imagefile.close(); - - // Start writing output - std::cout << "Writing schema..." 
<< std::endl; - - sqlite3* ppdb; - if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) - { - std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; - print_usage(); - } - - std::ifstream schemafile("schema.sql"); - if (!schemafile.is_open()) - { - std::cout << "Could not find schema file" << std::endl; - print_usage(); - } - - std::stringstream schemabuilder; - for (;;) - { - std::string line; - if (!getline(schemafile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - schemabuilder << line << std::endl; - } - - std::string schema = schemabuilder.str(); - while (!schema.empty()) - { - std::string query; - int divider = schema.find(";"); - if (divider != std::string::npos) - { - query = schema.substr(0, divider+1); - schema = schema.substr(divider+2); - } else { - break; - } - - sqlite3_stmt* schmstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(schmstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(schmstmt); - } - - std::cout << "Writing prepositions..." 
<< std::endl; - std::ifstream prepfile("prepositions.txt"); - if (!prepfile.is_open()) - { - std::cout << "Could not find prepositions file" << std::endl; - print_usage(); - } - - for (;;) - { - std::string line; - if (!getline(prepfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - std::regex relation("^([^:]+): (.+)"); - std::smatch relation_data; - std::regex_search(line, relation_data, relation); - std::string prep = relation_data[1]; - std::list groups = verbly::split>(relation_data[2], ", "); - - std::string query("INSERT INTO prepositions (form) VALUES (?)"); - sqlite3_stmt* ppstmt; - - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(ppstmt) != SQLITE_ROW) - { - db_error(ppdb, query); - } - - int rowid = sqlite3_column_int(ppstmt, 0); - sqlite3_finalize(ppstmt); - - for (auto group : groups) - { - query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, rowid); - sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - - - { - progress ppgs("Writing verbs...", verbs.size()); - for (auto& mapping : verbs) - { - sqlite3_stmt* ppstmt; - std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); - if 
(sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - - std::string canonical(mapping.second.infinitive); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - if (pronunciations.count(canonical) == 1) - { - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - if (sqlite3_step(ppstmt) != SQLITE_ROW) - { - db_error(ppdb, query); - } - - int rowid = sqlite3_column_int(ppstmt, 0); - - sqlite3_finalize(ppstmt); - - mapping.second.id = rowid; - - for (auto pronunciation : pronunciations[canonical]) - { - if (!pronunciation.rhyme.empty()) - { - query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; - } - - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, rowid); - sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), 
SQLITE_TRANSIENT); - sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); - sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); - - if (!pronunciation.rhyme.empty()) - { - sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); - } - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } - } - - ppgs.update(); - } - } - - { - progress ppgs("Writing verb frames...", groups.size()); - for (auto& mapping : groups) +#include +#include +#include +#include "enums.h" +#include "progress.h" +#include "selrestr.h" +#include "role.h" +#include "part.h" +#include "field.h" +#include "../lib/util.h" + +namespace verbly { + namespace generator { + + generator::generator( + std::string verbNetPath, + std::string agidPath, + std::string wordNetPath, + std::string cmudictPath, + std::string imageNetPath, + std::string outputPath) : + verbNetPath_(verbNetPath), + agidPath_(agidPath), + wordNetPath_(wordNetPath), + cmudictPath_(cmudictPath), + imageNetPath_(imageNetPath), + db_(outputPath) { - std::list roledatal; - std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair r) { - json role; - role["type"] = r.first; - role["selrestrs"] = export_selrestrs(r.second); - - return role; - }); - - json roledata(roledatal); - std::string rdm = roledata.dump(); - - sqlite3_stmt* ppstmt; - std::string query("INSERT INTO groups (data) VALUES (?)"); - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + // Ensure VerbNet directory exists + DIR* dir; + if ((dir = 
opendir(verbNetPath_.c_str())) == nullptr) { - db_error(ppdb, query); + throw std::invalid_argument("Invalid VerbNet data directory"); } - sqlite3_finalize(ppstmt); + closedir(dir); - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + // Ensure AGID infl.txt exists + if (!std::ifstream(agidPath_)) { - db_error(ppdb, query); + throw std::invalid_argument("AGID infl.txt file not found"); } - if (sqlite3_step(ppstmt) != SQLITE_ROW) + // Add directory separator to WordNet path + if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) { - db_error(ppdb, query); + wordNetPath_ += '/'; } - int gid = sqlite3_column_int(ppstmt, 0); - sqlite3_finalize(ppstmt); - - for (auto frame : mapping.second.frames) + // Ensure WordNet tables exist + for (std::string table : { + "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" + }) { - std::list fdatap; - std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) { - json part; - - switch (fp.type) - { - case framepart_t::type_t::np: - { - part["type"] = "np"; - part["role"] = fp.role; - part["selrestrs"] = export_selrestrs(fp.selrestrs); - part["synrestrs"] = fp.synrestrs; - - break; - } - - case framepart_t::type_t::pp: - { - part["type"] = "pp"; - part["values"] = fp.choices; - part["preprestrs"] = fp.preprestrs; - - break; - } - - case framepart_t::type_t::v: - { - part["type"] = "v"; - - break; - } - - case framepart_t::type_t::adj: - { - part["type"] = "adj"; - - break; - } - - case framepart_t::type_t::adv: - { - part["type"] = "adv"; - - break; - } - - case framepart_t::type_t::lex: - { - part["type"] = "lex"; - part["value"] = fp.lexval; - - break; - } - } - - return part; - }); - - json fdata(fdatap); - std::string marshall = fdata.dump(); - - query = "INSERT INTO frames (group_id, data) VALUES (?, ?)"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), 
query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, gid); - sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) { - db_error(ppdb, query); + throw std::invalid_argument("WordNet " + table + " table not found"); } - - sqlite3_finalize(ppstmt); } - for (auto member : mapping.second.members) + // Ensure CMUDICT file exists + if (!std::ifstream(cmudictPath_)) { - if (verbs.count(member) == 1) - { - auto& v = verbs[member]; - - query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, v.id); - sqlite3_bind_int(ppstmt, 2, gid); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - } + throw std::invalid_argument("CMUDICT file not found"); } - ppgs.update(); - } - } - - // Get nouns/adjectives/adverbs from WordNet - // Useful relations: - // - s: master list - // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) - // - at: variation (e.g. a measurement can be standard or nonstandard) - // - der: derivation (e.g. happy/happily, happily/happy) - // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) - // - ins: instantiation (do we need this? let's see) - // - mm: member meronymy/holonymy (e.g. family/mother, family/child) - // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) - // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) - // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) - // mannernymy (e.g. something done quickly is done in a manner that is quick) - // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) - // - sim: synonymy (e.g. 
cheerful/happy, happy/cheerful) - // - syntax: positioning flags for some adjectives - std::string wnpref {argv[3]}; - if (wnpref.back() != '/') - { - wnpref += '/'; - } - - // s table - { - std::ifstream wnsfile(wnpref + "wn_s.pl"); - if (!wnsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnsfile, line)) + // Ensure ImageNet urls.txt exists + if (!std::ifstream(imageNetPath_)) { - break; + throw std::invalid_argument("ImageNet urls.txt file not found"); } + } - if (line.back() == '\r') - { - line.pop_back(); - } + void generator::run() + { + // Create notions, words, lemmas, and forms from WordNet synsets + readWordNetSynsets(); + + // Reads adjective positioning WordNet data + readAdjectivePositioning(); + + // Counts the number of URLs ImageNet has per notion + readImageNetUrls(); + + // Creates a word by WordNet sense key lookup table + readWordNetSenseKeys(); + + // Creates groups and frames from VerbNet data + readVerbNet(); + + // Creates forms and inflections from AGID. To reduce the amount of forms + // created, we do this after most lemmas that need inflecting have been + // created through other means, and then only generate forms for + // inflections of already-existing lemmas. The exception to this regards + // verb lemmas. If a verb lemma in AGID either does not exist yet, or does + // exist but is not related to any words that are related to verb notions, + // then a notion and a word is generated and the form generation proceeds + // as usual. + readAgidInflections(); + + // Reads in prepositions and the is_a relationship + readPrepositions(); + + // Creates pronunciations from CMUDICT. To reduce the amount of + // pronunciations created, we do this after all forms have been created, + // and then only generate pronunciations for already-exisiting forms. 
+ readCmudictPronunciations(); + + // Writes the database schema + writeSchema(); + + // Dumps data to the database + dumpObjects(); + + // Populates the antonymy relationship from WordNet + readWordNetAntonymy(); + + // Populates the variation relationship from WordNet + readWordNetVariation(); + + // Populates the usage, topicality, and regionality relationships from + // WordNet + readWordNetClasses(); + + // Populates the causality relationship from WordNet + readWordNetCausality(); + + // Populates the entailment relationship from WordNet + readWordNetEntailment(); + + // Populates the hypernymy relationship from WordNet + readWordNetHypernymy(); + + // Populates the instantiation relationship from WordNet + readWordNetInstantiation(); + + // Populates the member meronymy relationship from WordNet + readWordNetMemberMeronymy(); + + // Populates the part meronymy relationship from WordNet + readWordNetPartMeronymy(); + + // Populates the substance meronymy relationship from WordNet + readWordNetSubstanceMeronymy(); + + // Populates the pertainymy and mannernymy relationships from WordNet + readWordNetPertainymy(); + + // Populates the specification relationship from WordNet + readWordNetSpecification(); + + // Populates the adjective similarity relationship from WordNet + readWordNetSimilarity(); + + + + + + + - lines.push_back(line); } - progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); - for (auto line : lines) + void generator::readWordNetSynsets() { - ppgs.update(); + std::list lines(readFile(wordNetPath_ + "wn_s.pl")); + progress ppgs("Reading synsets from WordNet...", lines.size()); - std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$"); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id = stoi(relation_data[1]); - int wnum = stoi(relation_data[2]); - std::string word = relation_data[3]; - size_t word_it; - while ((word_it = 
word.find("''")) != std::string::npos) - { - word.erase(word_it, 1); - } - - std::string query; - switch (synset_id / 100000000) + for (std::string line : lines) { - case 1: // Noun - { - if (nouns.count(word) == 1) - { - query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)"; - } + ppgs.update(); - break; - } - - case 2: // Verb + std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - // Ignore - - break; + continue; } - - case 3: // Adjective + + int synset_id = std::stoi(relation_data[1]); + int wnum = std::stoi(relation_data[2]); + std::string text = relation_data[3]; + int tag_count = std::stoi(relation_data[4]); + size_t word_it; + while ((word_it = text.find("''")) != std::string::npos) { - if (adjectives.count(word) == 1) - { - query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; - } else { - query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)"; - } - - break; + text.erase(word_it, 1); } - - case 4: // Adverb - { - if (adjectives.count(word) == 1) - { - query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; - } else { - query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)"; - } - break; + // The WordNet data does contain duplicates, so we need to check that we + // haven't already created this word. 
+ std::pair lookup(synset_id, wnum); + if (!wordByWnidAndWnum_.count(lookup)) + { + notion& synset = lookupOrCreateNotion(synset_id); + lemma& lex = lookupOrCreateLemma(text); + word& entry = createWord(synset, lex, tag_count); + + wordByWnidAndWnum_[lookup] = &entry; } } + } - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); - switch (synset_id / 100000000) + void generator::readAdjectivePositioning() + { + std::list lines(readFile(wordNetPath_ + "wn_syntax.pl")); + progress ppgs("Reading adjective positionings from WordNet...", lines.size()); + + for (std::string line : lines) { - case 1: // Noun + ppgs.update(); + + std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { - return isupper(ch); - }) ? 
1 : 0)); - - sqlite3_bind_int(ppstmt, 3, verbly::split>(word, " ").size()); - sqlite3_bind_int(ppstmt, 4, images[synset_id]); - sqlite3_bind_int(ppstmt, 5, synset_id); - - if (nouns.count(word) == 1) - { - sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT); - } - - break; + continue; } + + int synset_id = stoi(relation_data[1]); + int wnum = stoi(relation_data[2]); + std::string adjpos_str = relation_data[3]; - case 3: // Adjective - case 4: // Adverb + std::pair lookup(synset_id, wnum); + if (wordByWnidAndWnum_.count(lookup)) { - sqlite3_bind_int(ppstmt, 2, verbly::split>(word, " ").size()); + word& adj = *wordByWnidAndWnum_.at(lookup); - if (adjectives.count(word) == 1) + if (adjpos_str == "p") + { + adj.setAdjectivePosition(positioning::predicate); + } else if (adjpos_str == "a") { - sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT); + adj.setAdjectivePosition(positioning::attributive); + } else if (adjpos_str == "i") + { + adj.setAdjectivePosition(positioning::postnominal); + } else { + // Can't happen because of how we specified the regex. + assert(false); } - - break; } } + } - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); - - query = "SELECT last_insert_rowid()"; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + void generator::readImageNetUrls() + { + // The ImageNet datafile is so large that it is unreasonable and + // unnecessary to read it into memory; instead, we will parse each line as + // we read it. This has the caveat that we cannot display a progress bar. + std::cout << "Reading image counts from ImageNet..." 
<< std::endl; + + std::ifstream file(imageNetPath_); + if (!file) { - db_error(ppdb, query); + throw std::invalid_argument("Could not find file " + imageNetPath_); } - - if (sqlite3_step(ppstmt) != SQLITE_ROW) + + std::string line; + while (std::getline(file, line)) { - db_error(ppdb, query); + if (line.back() == '\r') + { + line.pop_back(); + } + + std::string wnid_s = line.substr(1, 8); + int wnid = stoi(wnid_s) + 100000000; + if (notionByWnid_.count(wnid)) + { + // We know that this notion has a wnid and is a noun. + notionByWnid_.at(wnid)->incrementNumOfImages(); + } } + } - int rowid = sqlite3_column_int(ppstmt, 0); - wn[synset_id][wnum] = rowid; - - sqlite3_finalize(ppstmt); + void generator::readWordNetSenseKeys() + { + std::list lines(readFile(wordNetPath_ + "wn_sk.pl")); + progress ppgs("Reading sense keys from WordNet...", lines.size()); - std::string canonical(word); - std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - if (pronunciations.count(canonical) == 1) + for (std::string line : lines) { - for (auto pronunciation : pronunciations[canonical]) - { - switch (synset_id / 100000000) - { - case 1: // Noun - { - if (!pronunciation.rhyme.empty()) - { - query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; - } - - break; - } - - case 3: // Adjective - { - if (!pronunciation.rhyme.empty()) - { - query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; - } - - break; - } - - case 4: // Adverb - { - if (!pronunciation.rhyme.empty()) - { - query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, 
syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; - } else { - query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; - } - - break; - } - } - - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } + ppgs.update(); - sqlite3_bind_int(ppstmt, 1, rowid); - sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); - sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); - sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); - - if (!pronunciation.rhyme.empty()) - { - sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); - sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); - } + // We only actually need to lookup verbs by sense key so we'll just + // ignore everything that isn't a verb. + std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$"); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int synset_id = stoi(relation_data[1]); + int wnum = stoi(relation_data[2]); + std::string sense_key = relation_data[3]; - if (sqlite3_step(ppstmt) != SQLITE_DONE) + // We are treating this mapping as injective, which is not entirely + // accurate. First, the WordNet table contains duplicate rows, so those + // need to be ignored. More importantly, a small number of sense keys + // (one for each letter of the Latin alphabet, plus 9 other words) each + // map to two different words in the same synset which differ only by + // capitalization. Luckily, none of these exceptions are verbs, so we + // can pretend that the mapping is injective. 
+ if (!wnSenseKeys_.count(sense_key)) + { + std::pair lookup(synset_id, wnum); + if (wordByWnidAndWnum_.count(lookup)) { - db_error(ppdb, query); + wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup); } - - sqlite3_finalize(ppstmt); } } } - } - - // While we're working on s - { - progress ppgs("Writing word synonyms...", wn.size()); - for (auto sense : wn) + + void generator::readVerbNet() { - ppgs.update(); - - for (auto word1 : sense.second) + std::cout << "Reading frames from VerbNet..." << std::endl; + + DIR* dir; + if ((dir = opendir(verbNetPath_.c_str())) == nullptr) + { + throw std::invalid_argument("Invalid VerbNet data directory"); + } + + struct dirent* ent; + while ((ent = readdir(dir)) != nullptr) { - for (auto word2 : sense.second) + std::string filename(verbNetPath_); + + if (filename.back() != '/') { - if (word1 != word2) - { - std::string query; - switch (sense.first / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; - - break; - } - - case 2: // Verb - { - // Ignore - - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; - - break; - } - - case 4: // Adverb - { - query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; - - break; - } - } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } + filename += '/'; + } - sqlite3_bind_int(ppstmt, 1, word1.second); - sqlite3_bind_int(ppstmt, 2, word2.second); + filename += ent->d_name; - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } + if (filename.rfind(".xml") != filename.size() - 4) + { + continue; + } - sqlite3_finalize(ppstmt); - } + xmlDocPtr doc = xmlParseFile(filename.c_str()); + if (doc == nullptr) + { + throw std::logic_error("Error opening " + filename); } - } - } - } - - // ant table - { - std::ifstream 
wnantfile(wnpref + "wn_ant.pl"); - if (!wnantfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnantfile, line)) - { - break; - } - if (line.back() == '\r') - { - line.pop_back(); + xmlNodePtr top = xmlDocGetRootElement(doc); + if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast("VNCLASS")))) + { + throw std::logic_error("Bad VerbNet file format: " + filename); + } + + try + { + createGroup(top); + } catch (const std::exception& e) + { + std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); + } } - - lines.push_back(line); + + closedir(dir); } - progress ppgs("Writing antonyms...", lines.size()); - for (auto line : lines) + void generator::readAgidInflections() { - ppgs.update(); + std::list lines(readFile(agidPath_)); + progress ppgs("Reading inflections from AGID...", lines.size()); - std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + for (std::string line : lines) { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); + ppgs.update(); + + int divider = line.find_first_of(" "); + std::string infinitive = line.substr(0, divider); + line = line.substr(divider+1); + char type = line[0]; - std::string query; - switch (synset_id_1 / 100000000) - { - case 1: // Noun + if (line[1] == '?') { - query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; - - break; + line.erase(0, 4); + } else { + line.erase(0, 3); } - - case 2: // Verb + + if (!lemmaByBaseForm_.count(infinitive) && (type != 'V')) { - // Ignore + continue; + } - break; + lemma& curLemma = lookupOrCreateLemma(infinitive); + + auto forms = split>(line, " | "); + for (std::string& inflForm 
: forms) + { + int sympos = inflForm.find_first_of(",?"); + if (sympos != std::string::npos) + { + inflForm = inflForm.substr(0, sympos); + } } - - case 3: // Adjective + + switch (type) { - query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; + case 'V': + { + if (forms.size() == 4) + { + curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1])); + curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2])); + curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3])); + } else if (forms.size() == 3) + { + curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1])); + curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2])); + } else if (forms.size() == 8) + { + // As of AGID 2014.08.11, this is only "to be" + curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2])); + curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3])); + curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4])); + } else { + // Words that don't fit the cases above as of AGID 2014.08.11: + // - may and shall do not conjugate the way we want them to + // - methinks only has a past tense and is an outlier + // - wit has five forms, and is archaic/obscure enough that we can ignore it for now + std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; + } + + // For verbs in particular, we sometimes create a notion and a word + // from inflection data. Specifically, if there are not yet any + // verbs existing that have the same infinitive form. 
"Yet" means + // that this verb appears in the AGID data but not in either WordNet + // or VerbNet. + if (!wordsByBaseForm_.count(infinitive) + || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) { + return w->getNotion().getPartOfSpeech() == part_of_speech::verb; + })) + { + notion& n = createNotion(part_of_speech::verb); + createWord(n, curLemma); + } - break; - } + break; + } - case 4: // Adverb - { - query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; + case 'A': + { + if (forms.size() == 2) + { + curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0])); + curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1])); + } else { + // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" + std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; + } + + break; + } + + case 'N': + { + if (forms.size() == 1) + { + curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0])); + } else { + // As of AGID 2014.08.11, this is non-existent. + std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; + } - break; + break; + } } } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } - - sqlite3_finalize(ppstmt); } - } - - // at table - { - std::ifstream wnatfile(wnpref + "wn_at.pl"); - if (!wnatfile.is_open()) - { - std::cout << "Invalid WordNet data directory." 
<< std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnatfile, line)) - { - break; - } - if (line.back() == '\r') + void generator::readPrepositions() + { + std::list lines(readFile("prepositions.txt")); + progress ppgs("Reading prepositions...", lines.size()); + + for (std::string line : lines) { - line.pop_back(); + ppgs.update(); + + std::regex relation("^([^:]+): (.+)"); + std::smatch relation_data; + std::regex_search(line, relation_data, relation); + std::string prep = relation_data[1]; + auto groups = split>(relation_data[2], ", "); + + notion& n = createNotion(part_of_speech::preposition); + lemma& l = lookupOrCreateLemma(prep); + word& w = createWord(n, l); + + n.setPrepositionGroups(groups); } - - lines.push_back(line); } - progress ppgs("Writing variations...", lines.size()); - for (auto line : lines) + void generator::readCmudictPronunciations() { - ppgs.update(); - - std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); + std::list lines(readFile(cmudictPath_)); + progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); - for (auto mapping1 : wn[synset_id_1]) + for (std::string line : lines) { - for (auto mapping2 : wn[synset_id_2]) + ppgs.update(); + + std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? 
([A-Z 0-9]+)"); + std::smatch phoneme_data; + if (std::regex_search(line, phoneme_data, phoneme)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + std::string canonical(phoneme_data[1]); + std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); - if (sqlite3_step(ppstmt) != SQLITE_DONE) + if (!formByText_.count(canonical)) { - db_error(ppdb, query); + continue; } - sqlite3_finalize(ppstmt); + std::string phonemes = phoneme_data[2]; + pronunciations_.emplace_back(phonemes); + pronunciation& p = pronunciations_.back(); + formByText_.at(canonical)->addPronunciation(p); } } } - } - - // der table - { - std::ifstream wnderfile(wnpref + "wn_der.pl"); - if (!wnderfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - std::list lines; - for (;;) + void generator::writeSchema() { - std::string line; - if (!getline(wnderfile, line)) + std::ifstream file("schema.sql"); + if (!file) { - break; + throw std::invalid_argument("Could not find database schema"); } - - if (line.back() == '\r') + + std::ostringstream schemaBuilder; + std::string line; + while (std::getline(file, line)) { - line.pop_back(); + if (line.back() == '\r') + { + line.pop_back(); + } + + schemaBuilder << line; } - lines.push_back(line); + std::string schema = schemaBuilder.str(); + auto queries = split>(schema, ";"); + progress ppgs("Writing database schema...", queries.size()); + for (std::string query : queries) + { + if (!queries.empty()) + { + db_.runQuery(query); + } + + ppgs.update(); + } } - progress ppgs("Writing morphological derivation...", lines.size()); - for (auto line : lines) + void generator::dumpObjects() { - ppgs.update(); - - std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); - 
std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) { - continue; + progress ppgs("Writing notions...", notions_.size()); + + for (notion& n : notions_) + { + db_ << n; + + ppgs.update(); + } } - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - std::string query; - switch (synset_id_1 / 100000000) { - case 1: // Noun + progress ppgs("Writing words...", words_.size()); + + for (word& w : words_) { - switch (synset_id_2 / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)"; - break; - } - - case 3: // Adjective - { - query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)"; - break; - } - - case 4: // Adverb - { - query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)"; - break; - } - } + db_ << w; - break; + ppgs.update(); } + } + + { + progress ppgs("Writing lemmas...", lemmas_.size()); - case 3: // Adjective + for (lemma& l : lemmas_) { - switch (synset_id_2 / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)"; - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)"; - break; - } - - case 4: // Adverb - { - query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)"; - break; - } - } + db_ << l; - break; + ppgs.update(); } + } + + { + progress ppgs("Writing forms...", forms_.size()); - case 4: // Adverb + for (form& f : forms_) { - switch (synset_id_2 / 100000000) - { - case 1: // Noun - { - query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)"; - break; - } - - case 3: // Adjective - { - query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)"; - break; - } 
- - case 4: // Adverb - { - query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)"; - break; - } - } + db_ << f; - break; + ppgs.update(); } } - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) { - db_error(ppdb, query); + progress ppgs("Writing pronunciations...", pronunciations_.size()); + + for (pronunciation& p : pronunciations_) + { + db_ << p; + + ppgs.update(); + } } - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) { - db_error(ppdb, query); + progress ppgs("Writing verb groups...", groups_.size()); + + for (group& g : groups_) + { + db_ << g; + + ppgs.update(); + } } - sqlite3_finalize(ppstmt); - } - } - - // hyp table - { - std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); - if (!wnhypfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnhypfile, line)) { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); + progress ppgs("Writing verb frames...", frames_.size()); + + for (frame& f : frames_) + { + db_ << f; + + ppgs.update(); + } } - - lines.push_back(line); } - progress ppgs("Writing hypernyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetAntonymy() { - ppgs.update(); - - std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_ant.pl")); + progress ppgs("Writing antonyms...", lines.size()); + for (auto line : lines) { - continue; - } + ppgs.update(); - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { 
- for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + continue; + } + + std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); + std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); + + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) + { + word& word1 = *wordByWnidAndWnum_.at(lookup1); + word& word2 = *wordByWnidAndWnum_.at(lookup2); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } + std::list fields; + fields.emplace_back("antonym_1_id", word1.getId()); + fields.emplace_back("antonym_2_id", word2.getId()); - sqlite3_finalize(ppstmt); + db_.insertIntoTable("antonymy", std::move(fields)); } } } - } - - // ins table - { - std::ifstream wninsfile(wnpref + "wn_ins.pl"); - if (!wninsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." 
<< std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wninsfile, line)) - { - break; - } - if (line.back() == '\r') + void generator::readWordNetVariation() + { + std::list lines(readFile(wordNetPath_ + "wn_at.pl")); + progress ppgs("Writing variation...", lines.size()); + for (auto line : lines) { - line.pop_back(); - } + ppgs.update(); - lines.push_back(line); + std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("noun_id", notion1.getId()); + fields.emplace_back("adjective_id", notion2.getId()); + + db_.insertIntoTable("variation", std::move(fields)); + } + } } - progress ppgs("Writing instantiations...", lines.size()); - for (auto line : lines) + void generator::readWordNetClasses() { - ppgs.update(); - - std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_cls.pl")); + progress ppgs("Writing usage, topicality, and regionality...", lines.size()); + for (auto line : lines) { - continue; - } + ppgs.update(); - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); - - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + std::pair 
lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); + std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); + std::string class_type = relation_data[5]; + + std::string table_name; + if (class_type == "t") + { + table_name += "topicality"; + } else if (class_type == "u") + { + table_name += "usage"; + } else if (class_type == "r") + { + table_name += "regionality"; + } + + std::list leftJoin; + std::list rightJoin; + + if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) + { + std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { + return w->getId(); + }); + } else if (wordByWnidAndWnum_.count(lookup1)) { + leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); + } + + if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) + { + std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { + return w->getId(); + }); + } else if (wordByWnidAndWnum_.count(lookup2)) { + rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); + } + + for (int word1 : leftJoin) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + for (int word2 : rightJoin) { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + std::list fields; + fields.emplace_back("term_id", word1); + fields.emplace_back("domain_id", word2); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); + db_.insertIntoTable(table_name, std::move(fields)); } - - sqlite3_finalize(ppstmt); } } } - } - - // mm table - { - std::ifstream wnmmfile(wnpref + "wn_mm.pl"); - if (!wnmmfile.is_open()) - { - std::cout << "Invalid WordNet data directory." 
<< std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnmmfile, line)) - { - break; - } - if (line.back() == '\r') + void generator::readWordNetCausality() + { + std::list lines(readFile(wordNetPath_ + "wn_cs.pl")); + progress ppgs("Writing causality...", lines.size()); + for (auto line : lines) { - line.pop_back(); - } + ppgs.update(); - lines.push_back(line); + std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("effect_id", notion1.getId()); + fields.emplace_back("cause_id", notion2.getId()); + + db_.insertIntoTable("causality", std::move(fields)); + } + } } - progress ppgs("Writing member meronyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetEntailment() { - ppgs.update(); - - std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_ent.pl")); + progress ppgs("Writing entailment...", lines.size()); + for (auto line : lines) { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + ppgs.update(); - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), 
&ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } + std::list fields; + fields.emplace_back("given_id", notion1.getId()); + fields.emplace_back("entailment_id", notion2.getId()); - sqlite3_finalize(ppstmt); + db_.insertIntoTable("entailment", std::move(fields)); } } } - } - - // ms table - { - std::ifstream wnmsfile(wnpref + "wn_ms.pl"); - if (!wnmsfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) + + void generator::readWordNetHypernymy() { - std::string line; - if (!getline(wnmsfile, line)) + std::list lines(readFile(wordNetPath_ + "wn_hyp.pl")); + progress ppgs("Writing hypernymy...", lines.size()); + for (auto line : lines) { - break; + ppgs.update(); + + std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("hyponym_id", notion1.getId()); + fields.emplace_back("hypernym_id", notion2.getId()); + + db_.insertIntoTable("hypernymy", std::move(fields)); + } } + } - if (line.back() == '\r') + void generator::readWordNetInstantiation() + { + std::list lines(readFile(wordNetPath_ + "wn_ins.pl")); + progress 
ppgs("Writing instantiation...", lines.size()); + for (auto line : lines) { - line.pop_back(); + ppgs.update(); + + std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("instance_id", notion1.getId()); + fields.emplace_back("class_id", notion2.getId()); + + db_.insertIntoTable("instantiation", std::move(fields)); + } } - - lines.push_back(line); } - progress ppgs("Writing substance meronyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetMemberMeronymy() { - ppgs.update(); - - std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_mm.pl")); + progress ppgs("Writing member meronymy...", lines.size()); + for (auto line : lines) { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + ppgs.update(); - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = 
std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); - if (sqlite3_step(ppstmt) != SQLITE_DONE) - { - db_error(ppdb, query); - } + std::list fields; + fields.emplace_back("holonym_id", notion1.getId()); + fields.emplace_back("meronym_id", notion2.getId()); - sqlite3_finalize(ppstmt); + db_.insertIntoTable("member_meronymy", std::move(fields)); } } } - } - - // mm table - { - std::ifstream wnmpfile(wnpref + "wn_mp.pl"); - if (!wnmpfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) + + void generator::readWordNetPartMeronymy() { - std::string line; - if (!getline(wnmpfile, line)) + std::list lines(readFile(wordNetPath_ + "wn_mp.pl")); + progress ppgs("Writing part meronymy...", lines.size()); + for (auto line : lines) { - break; + ppgs.update(); + + std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("holonym_id", notion1.getId()); + fields.emplace_back("meronym_id", notion2.getId()); + + db_.insertIntoTable("part_meronymy", std::move(fields)); + } } + } - if (line.back() == '\r') + void generator::readWordNetSubstanceMeronymy() + { + std::list lines(readFile(wordNetPath_ + "wn_ms.pl")); + progress ppgs("Writing substance meronymy...", lines.size()); + for (auto line : lines) { - line.pop_back(); - } + ppgs.update(); - lines.push_back(line); + std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); + std::smatch relation_data; + if 
(!std::regex_search(line, relation_data, relation)) + { + continue; + } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); + + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) + { + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); + + std::list fields; + fields.emplace_back("holonym_id", notion1.getId()); + fields.emplace_back("meronym_id", notion2.getId()); + + db_.insertIntoTable("substance_meronymy", std::move(fields)); + } + } } - progress ppgs("Writing part meronyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetPertainymy() { - ppgs.update(); - - std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_per.pl")); + progress ppgs("Writing pertainymy and mannernymy...", lines.size()); + for (auto line : lines) { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); + ppgs.update(); - for (auto mapping1 : wn[synset_id_1]) - { - for (auto mapping2 : wn[synset_id_2]) + std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) - { - db_error(ppdb, query); - } + continue; + } + + std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); + std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); + + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) + { + word& word1 = *wordByWnidAndWnum_.at(lookup1); + word& word2 = *wordByWnidAndWnum_.at(lookup2); - sqlite3_bind_int(ppstmt, 1, 
mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); + if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) + { + std::list fields; + fields.emplace_back("pertainym_id", word1.getId()); + fields.emplace_back("noun_id", word2.getId()); - if (sqlite3_step(ppstmt) != SQLITE_DONE) + db_.insertIntoTable("pertainymy", std::move(fields)); + } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) { - db_error(ppdb, query); - } + std::list fields; + fields.emplace_back("mannernym_id", word1.getId()); + fields.emplace_back("adjective_id", word2.getId()); - sqlite3_finalize(ppstmt); + db_.insertIntoTable("mannernymy", std::move(fields)); + } } } } - } - - // per table - { - std::ifstream wnperfile(wnpref + "wn_per.pl"); - if (!wnperfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnperfile, line)) - { - break; - } - if (line.back() == '\r') + void generator::readWordNetSpecification() + { + std::list lines(readFile(wordNetPath_ + "wn_sa.pl")); + progress ppgs("Writing specifications...", lines.size()); + for (auto line : lines) { - line.pop_back(); + ppgs.update(); + + std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) + { + continue; + } + + std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); + std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); + + if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) + { + word& word1 = *wordByWnidAndWnum_.at(lookup1); + word& word2 = *wordByWnidAndWnum_.at(lookup2); + + std::list fields; + fields.emplace_back("general_id", word1.getId()); + fields.emplace_back("specific_id", word2.getId()); + + db_.insertIntoTable("specification", std::move(fields)); + } } - - lines.push_back(line); } - 
progress ppgs("Writing pertainyms and mannernyms...", lines.size()); - for (auto line : lines) + void generator::readWordNetSimilarity() { - ppgs.update(); - - std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + std::list lines(readFile(wordNetPath_ + "wn_sim.pl")); + progress ppgs("Writing adjective similarity...", lines.size()); + for (auto line : lines) { - continue; - } + ppgs.update(); - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - std::string query; - switch (synset_id_1 / 100000000) - { - case 3: // Adjective + std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { - // This is a pertainym, the second word should be a noun - // Technically it can be an adjective but we're ignoring that - if (synset_id_2 / 100000000 != 1) - { - continue; - } - - query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; - - break; + continue; } + + int lookup1 = std::stoi(relation_data[1]); + int lookup2 = std::stoi(relation_data[2]); - case 4: // Adverb + if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) { - // This is a mannernym, the second word should be an adjective - if (synset_id_2 / 100000000 != 3) - { - continue; - } + notion& notion1 = *notionByWnid_.at(lookup1); + notion& notion2 = *notionByWnid_.at(lookup2); - query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; + std::list fields; + fields.emplace_back("adjective_1_id", notion1.getId()); + fields.emplace_back("adjective_2_id", notion2.getId()); - break; + db_.insertIntoTable("similarity", std::move(fields)); } } - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) - { - 
db_error(ppdb, query); - } - - sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); + } - if (sqlite3_step(ppstmt) != SQLITE_DONE) + std::list generator::readFile(std::string path) + { + std::ifstream file(path); + if (!file) { - db_error(ppdb, query); + throw std::invalid_argument("Could not find file " + path); } - - sqlite3_finalize(ppstmt); - } - } - // sa table - { - std::ifstream wnsafile(wnpref + "wn_sa.pl"); - if (!wnsafile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); - } - - std::list lines; - for (;;) - { + std::list lines; std::string line; - if (!getline(wnsafile, line)) - { - break; - } - - if (line.back() == '\r') + while (std::getline(file, line)) { - line.pop_back(); + if (line.back() == '\r') + { + line.pop_back(); + } + + lines.push_back(line); } - lines.push_back(line); + return lines; } - progress ppgs("Writing specifications...", lines.size()); - for (auto line : lines) + part_of_speech generator::partOfSpeechByWnid(int wnid) { - ppgs.update(); - - std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id_1 = stoi(relation_data[1]); - int wnum_1 = stoi(relation_data[2]); - int synset_id_2 = stoi(relation_data[3]); - int wnum_2 = stoi(relation_data[4]); - std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) + switch (wnid / 100000000) { - db_error(ppdb, query); + case 1: return part_of_speech::noun; + case 2: return part_of_speech::verb; + case 3: return part_of_speech::adjective; + case 4: return part_of_speech::adverb; + default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); } + } - sqlite3_bind_int(ppstmt, 1, 
wn[synset_id_1][wnum_1]); - sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); + notion& generator::createNotion(part_of_speech partOfSpeech) + { + notions_.emplace_back(partOfSpeech); + + return notions_.back(); + } - if (sqlite3_step(ppstmt) != SQLITE_DONE) + notion& generator::lookupOrCreateNotion(int wnid) + { + if (!notionByWnid_.count(wnid)) { - db_error(ppdb, query); + notions_.emplace_back(partOfSpeechByWnid(wnid), wnid); + notionByWnid_[wnid] = ¬ions_.back(); } - - sqlite3_finalize(ppstmt); - } - } - - // sim table - { - std::ifstream wnsimfile(wnpref + "wn_sim.pl"); - if (!wnsimfile.is_open()) - { - std::cout << "Invalid WordNet data directory." << std::endl; - print_usage(); + + return *notionByWnid_.at(wnid); } - - std::list lines; - for (;;) + + lemma& generator::lookupOrCreateLemma(std::string base_form) { - std::string line; - if (!getline(wnsimfile, line)) + if (!lemmaByBaseForm_.count(base_form)) { - break; + lemmas_.emplace_back(lookupOrCreateForm(base_form)); + lemmaByBaseForm_[base_form] = &lemmas_.back(); } + + return *lemmaByBaseForm_.at(base_form); + } - if (line.back() == '\r') + form& generator::lookupOrCreateForm(std::string text) + { + if (!formByText_.count(text)) { - line.pop_back(); + forms_.emplace_back(text); + formByText_[text] = &forms_.back(); } - lines.push_back(line); + return *formByText_[text]; } - progress ppgs("Writing sense synonyms...", lines.size()); - for (auto line : lines) + template word& generator::createWord(Args&&... 
args) { - ppgs.update(); + words_.emplace_back(std::forward(args)...); + word& w = words_.back(); - std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) + wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w); + + if (w.getNotion().hasWnid()) { - continue; + wordsByWnid_[w.getNotion().getWnid()].insert(&w); } - int synset_id_1 = stoi(relation_data[1]); - int synset_id_2 = stoi(relation_data[2]); - std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); + return w; + } + + group& generator::createGroup(xmlNodePtr top) + { + groups_.emplace_back(); + group& grp = groups_.back(); - for (auto mapping1 : wn[synset_id_1]) + xmlChar* key; + + for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) { - for (auto mapping2 : wn[synset_id_2]) + if (!xmlStrcmp(node->name, reinterpret_cast("SUBCLASSES"))) { - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) { - db_error(ppdb, query); + if (!xmlStrcmp(subclass->name, reinterpret_cast("VNSUBCLASS"))) + { + try + { + group& subgrp = createGroup(subclass); + subgrp.setParent(grp); + } catch (const std::exception& e) + { + key = xmlGetProp(subclass, reinterpret_cast("ID")); + + if (key == nullptr) + { + std::throw_with_nested(std::logic_error("Error parsing IDless subgroup")); + } else { + std::string subgroupId(reinterpret_cast(key)); + xmlFree(key); + + std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId)); + } + } + } } - - sqlite3_bind_int(ppstmt, 1, mapping1.second); - sqlite3_bind_int(ppstmt, 2, mapping2.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + } else if (!xmlStrcmp(node->name, reinterpret_cast("MEMBERS"))) + { + for (xmlNodePtr member = 
node->xmlChildrenNode; member != nullptr; member = member->next) { - db_error(ppdb, query); + if (!xmlStrcmp(member->name, reinterpret_cast("MEMBER"))) + { + key = xmlGetProp(member, reinterpret_cast("wn")); + std::string wnSenses(reinterpret_cast(key)); + xmlFree(key); + + auto wnSenseKeys = split>(wnSenses, " "); + if (!wnSenseKeys.empty()) + { + std::list tempKeys; + + std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { + return sense + "::"; + }); + + std::list filteredKeys; + + std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { + return !wnSenseKeys_.count(sense); + }); + + wnSenseKeys = std::move(filteredKeys); + } + + if (!wnSenseKeys.empty()) + { + for (std::string sense : wnSenseKeys) + { + word& wordSense = *wnSenseKeys_[sense]; + wordSense.setVerbGroup(grp); + } + } else { + key = xmlGetProp(member, reinterpret_cast("name")); + std::string memberName(reinterpret_cast(key)); + xmlFree(key); + + notion& n = createNotion(part_of_speech::verb); + lemma& l = lookupOrCreateLemma(memberName); + word& w = createWord(n, l); + + w.setVerbGroup(grp); + } + } } - - sqlite3_reset(ppstmt); - sqlite3_clear_bindings(ppstmt); - - sqlite3_bind_int(ppstmt, 1, mapping2.second); - sqlite3_bind_int(ppstmt, 2, mapping1.second); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + } else if (!xmlStrcmp(node->name, reinterpret_cast("THEMROLES"))) + { + for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next) { - db_error(ppdb, query); + if (!xmlStrcmp(roletopnode->name, reinterpret_cast("THEMROLE"))) + { + role r; + + key = xmlGetProp(roletopnode, reinterpret_cast("type")); + std::string roleName = reinterpret_cast(key); + xmlFree(key); + + for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) + { + if (!xmlStrcmp(rolenode->name, 
reinterpret_cast("SELRESTRS"))) + { + r.setSelrestrs(parseSelrestr(rolenode)); + } + } + + grp.addRole(roleName, std::move(r)); + } } + } else if (!xmlStrcmp(node->name, reinterpret_cast("FRAMES"))) + { + for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next) + { + if (!xmlStrcmp(frametopnode->name, reinterpret_cast("FRAME"))) + { + frames_.emplace_back(); + frame& fr = frames_.back(); - sqlite3_finalize(ppstmt); + for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) + { + if (!xmlStrcmp(framenode->name, reinterpret_cast("SYNTAX"))) + { + for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) + { + if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("NP"))) + { + key = xmlGetProp(syntaxnode, reinterpret_cast("value")); + std::string partRole = reinterpret_cast(key); + xmlFree(key); + + selrestr partSelrestrs; + std::set partSynrestrs; + + for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) + { + if (!xmlStrcmp(npnode->name, reinterpret_cast("SYNRESTRS"))) + { + for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) + { + if (!xmlStrcmp(synrestr->name, reinterpret_cast("SYNRESTR"))) + { + key = xmlGetProp(synrestr, reinterpret_cast("type")); + partSynrestrs.insert(reinterpret_cast(key)); + xmlFree(key); + } + } + } + + if (!xmlStrcmp(npnode->name, reinterpret_cast("SELRESTRS"))) + { + partSelrestrs = parseSelrestr(npnode); + } + } + + fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs))); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("VERB"))) + { + fr.push_back(part::createVerb()); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("PREP"))) + { + std::set partChoices; + bool partLiteral; + + if (xmlHasProp(syntaxnode, 
reinterpret_cast("value"))) + { + partLiteral = true; + + key = xmlGetProp(syntaxnode, reinterpret_cast("value")); + std::string choicesStr = reinterpret_cast(key); + xmlFree(key); + + split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices))); + } else { + partLiteral = false; + + for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) + { + if (!xmlStrcmp(npnode->name, reinterpret_cast("SELRESTRS"))) + { + for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) + { + if (!xmlStrcmp(synrestr->name, reinterpret_cast("SELRESTR"))) + { + key = xmlGetProp(synrestr, reinterpret_cast("type")); + partChoices.insert(reinterpret_cast(key)); + xmlFree(key); + } + } + } + } + } + + fr.push_back(part::createPreposition(std::move(partChoices), partLiteral)); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("ADJ"))) + { + fr.push_back(part::createAdjective()); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("ADV"))) + { + fr.push_back(part::createAdverb()); + } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast("LEX"))) + { + key = xmlGetProp(syntaxnode, reinterpret_cast("value")); + std::string literalValue = reinterpret_cast(key); + xmlFree(key); + + fr.push_back(part::createLiteral(literalValue)); + } else { + continue; + } + } + + grp.addFrame(fr); + } + } + } + } } } - } - } - - // syntax table - { - std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); - if (!wnsyntaxfile.is_open()) - { - std::cout << "Invalid WordNet data directory." 
<< std::endl; - print_usage(); - } - std::list lines; - for (;;) - { - std::string line; - if (!getline(wnsyntaxfile, line)) - { - break; - } - - if (line.back() == '\r') - { - line.pop_back(); - } - - lines.push_back(line); + return grp; } - progress ppgs("Writing adjective syntax markers...", lines.size()); - for (auto line : lines) + selrestr generator::parseSelrestr(xmlNodePtr top) { - ppgs.update(); - - std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); - std::smatch relation_data; - if (!std::regex_search(line, relation_data, relation)) - { - continue; - } - - int synset_id = stoi(relation_data[1]); - int wnum = stoi(relation_data[2]); - std::string syn = relation_data[3]; - std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); - - sqlite3_stmt* ppstmt; - if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) + xmlChar* key; + + if (!xmlStrcmp(top->name, reinterpret_cast("SELRESTRS"))) { - db_error(ppdb, query); - } - - sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); - sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); - - if (sqlite3_step(ppstmt) != SQLITE_DONE) + if (xmlChildElementCount(top) == 0) + { + return {}; + } else if (xmlChildElementCount(top) == 1) + { + return parseSelrestr(xmlFirstElementChild(top)); + } else { + bool orlogic = false; + if (xmlHasProp(top, reinterpret_cast("logic"))) + { + key = xmlGetProp(top, reinterpret_cast("logic")); + if (!xmlStrcmp(key, reinterpret_cast("or"))) + { + orlogic = true; + } + + xmlFree(key); + } + + std::list children; + for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) + { + if (!xmlStrcmp(selrestr->name, reinterpret_cast("SELRESTRS")) + || !xmlStrcmp(selrestr->name, reinterpret_cast("SELRESTR"))) + { + children.push_back(parseSelrestr(selrestr)); + } + } + + return selrestr(children, orlogic); + } + } else if (!xmlStrcmp(top->name, reinterpret_cast("SELRESTR"))) { - 
db_error(ppdb, query); + key = xmlGetProp(top, reinterpret_cast("Value")); + bool selPos = (std::string(reinterpret_cast(key)) == "+"); + xmlFree(key); + + key = xmlGetProp(top, reinterpret_cast("type")); + std::string selRestriction = reinterpret_cast(key); + xmlFree(key); + + return selrestr(selRestriction, selPos); + } else { + throw std::logic_error("Badly formatted selrestr"); } - - sqlite3_finalize(ppstmt); } - } - - sqlite3_close_v2(ppdb); - - std::cout << "Done." << std::endl; -} + + }; +}; diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..e2a7404 --- /dev/null +++ b/generator/generator.h @@ -0,0 +1,151 @@ +#ifndef GENERATOR_H_5B61CBC5 +#define GENERATOR_H_5B61CBC5 + +#include +#include +#include +#include +#include +#include "database.h" +#include "notion.h" +#include "word.h" +#include "lemma.h" +#include "form.h" +#include "pronunciation.h" +#include "group.h" +#include "frame.h" + +namespace verbly { + namespace generator { + + enum class part_of_speech; + class selrestr; + + class generator { + public: + + // Constructor + + generator( + std::string verbNetPath, + std::string agidPath, + std::string wordNetPath, + std::string cmudictPath, + std::string imageNetPath, + std::string outputPath); + + // Action + + void run(); + + private: + + // Subroutines + + void readWordNetSynsets(); + + void readAdjectivePositioning(); + + void readImageNetUrls(); + + void readWordNetSenseKeys(); + + void readVerbNet(); + + void readAgidInflections(); + + void readPrepositions(); + + void readCmudictPronunciations(); + + void writeSchema(); + + void dumpObjects(); + + void readWordNetAntonymy(); + + void readWordNetVariation(); + + void readWordNetClasses(); + + void readWordNetCausality(); + + void readWordNetEntailment(); + + void readWordNetHypernymy(); + + void readWordNetInstantiation(); + + void readWordNetMemberMeronymy(); + + void readWordNetPartMeronymy(); + + void readWordNetSubstanceMeronymy(); + + void 
readWordNetPertainymy(); + + void readWordNetSpecification(); + + void readWordNetSimilarity(); + + // Helpers + + std::list readFile(std::string path); + + inline part_of_speech partOfSpeechByWnid(int wnid); + + notion& createNotion(part_of_speech partOfSpeech); + + notion& lookupOrCreateNotion(int wnid); + + lemma& lookupOrCreateLemma(std::string base_form); + + form& lookupOrCreateForm(std::string text); + + template word& createWord(Args&&... args); + + group& createGroup(xmlNodePtr top); + + selrestr parseSelrestr(xmlNodePtr top); + + // Input + + std::string verbNetPath_; + std::string agidPath_; + std::string wordNetPath_; + std::string cmudictPath_; + std::string imageNetPath_; + + // Output + + database db_; + + // Data + + std::list notions_; + std::list words_; + std::list lemmas_; + std::list
forms_; + std::list pronunciations_; + std::list frames_; + std::list groups_; + + // Indexes + + std::map notionByWnid_; + std::map> wordsByWnid_; + std::map, word*> wordByWnidAndWnum_; + std::map> wordsByBaseForm_; + std::map lemmaByBaseForm_; + std::map formByText_; + + // Caches + + std::map wnSenseKeys_; + + }; + + }; +}; + +#endif /* end of include guard: GENERATOR_H_5B61CBC5 */ diff --git a/generator/group.cpp b/generator/group.cpp new file mode 100644 index 0000000..7cbd4c8 --- /dev/null +++ b/generator/group.cpp @@ -0,0 +1,119 @@ +#include "group.h" +#include +#include +#include +#include "database.h" +#include "field.h" +#include "frame.h" + +namespace verbly { + namespace generator { + + int group::nextId_ = 0; + + group::group() : id_(nextId_++) + { + } + + void group::setParent(const group& parent) + { + // Adding a group to itself is nonsensical. + assert(&parent != this); + + parent_ = &parent; + } + + void group::addRole(std::string name, role r) + { + roleNames_.insert(name); + roles_[name] = std::move(r); + } + + void group::addFrame(const frame& f) + { + frames_.insert(&f); + } + + std::set group::getRoles() const + { + std::set fullRoles = roleNames_; + + if (hasParent()) + { + for (std::string name : getParent().getRoles()) + { + fullRoles.insert(name); + } + } + + return fullRoles; + } + + const role& group::getRole(std::string name) const + { + if (roles_.count(name)) + { + return roles_.at(name); + } else if (hasParent()) + { + return getParent().getRole(name); + } else { + throw std::invalid_argument("Specified role not found in verb group"); + } + } + + std::set group::getFrames() const + { + std::set fullFrames = frames_; + + if (hasParent()) + { + for (const frame* f : getParent().getFrames()) + { + fullFrames.insert(f); + } + } + + return fullFrames; + } + + database& operator<<(database& db, const group& arg) + { + // Serialize the group first + { + std::list fields; + fields.emplace_back("group_id", arg.getId()); + + nlohmann::json 
jsonRoles; + for (std::string name : arg.getRoles()) + { + const role& r = arg.getRole(name); + + nlohmann::json jsonRole; + jsonRole["type"] = name; + jsonRole["selrestrs"] = r.getSelrestrs().toJson(); + + jsonRoles.emplace_back(std::move(jsonRole)); + } + + fields.emplace_back("data", jsonRoles.dump()); + + db.insertIntoTable("groups", std::move(fields)); + } + + // Then, serialize the group/frame relationship + for (const frame* f : arg.getFrames()) + { + std::list fields; + + fields.emplace_back("group_id", arg.getId()); + fields.emplace_back("frame_id", f->getId()); + + db.insertIntoTable("groups_frames", std::move(fields)); + } + + return db; + } + + }; +}; diff --git a/generator/group.h b/generator/group.h new file mode 100644 index 0000000..efb8c5d --- /dev/null +++ b/generator/group.h @@ -0,0 +1,80 @@ +#ifndef GROUP_H_EDAFB5DC +#define GROUP_H_EDAFB5DC + +#include +#include +#include +#include +#include "role.h" + +namespace verbly { + namespace generator { + + class frame; + class database; + + class group { + public: + + // Constructor + + group(); + + // Mutators + + void setParent(const group& parent); + + void addRole(std::string name, role r); + + void addFrame(const frame& f); + + // Accessors + + int getId() const + { + return id_; + } + + bool hasParent() const + { + return (parent_ != nullptr); + } + + const group& getParent() const + { + // Calling code should always call hasParent first + assert(parent_ != nullptr); + + return *parent_; + } + + std::set getRoles() const; + + const role& getRole(std::string name) const; + + std::set getFrames() const; + + private: + + static int nextId_; + + const int id_; + + const group* parent_ = nullptr; + std::map roles_; + std::set frames_; + + // Caches + + std::set roleNames_; + + }; + + // Serializer + + database& operator<<(database& db, const group& arg); + + }; +}; + +#endif /* end of include guard: GROUP_H_EDAFB5DC */ diff --git a/generator/lemma.cpp b/generator/lemma.cpp new file mode 100644 index 
0000000..e66b153 --- /dev/null +++ b/generator/lemma.cpp @@ -0,0 +1,65 @@ +#include "lemma.h" +#include +#include +#include "field.h" +#include "database.h" +#include "form.h" + +namespace verbly { + namespace generator { + + int lemma::nextId_ = 0; + + lemma::lemma(const form& baseForm) : + id_(nextId_++), + baseForm_(baseForm) + { + inflections_[inflection::base] = {&baseForm}; + } + + void lemma::addInflection(inflection type, const form& f) + { + // There can only be one base form. + assert(type != inflection::base); + + inflections_[type].insert(&f); + } + + std::set lemma::getInflections(inflection type) const + { + if (inflections_.count(type)) + { + return inflections_.at(type); + } else { + return {}; + } + } + + database& operator<<(database& db, const lemma& arg) + { + for (inflection type : { + inflection::base, + inflection::plural, + inflection::comparative, + inflection::superlative, + inflection::past_tense, + inflection::past_participle, + inflection::ing_form, + inflection::s_form}) + { + for (const form* f : arg.getInflections(type)) + { + std::list fields; + fields.emplace_back("lemma_id", arg.getId()); + fields.emplace_back("form_id", f->getId()); + fields.emplace_back("category", static_cast(type)); + + db.insertIntoTable("lemmas_forms", std::move(fields)); + } + } + + return db; + } + + }; +}; diff --git a/generator/lemma.h b/generator/lemma.h new file mode 100644 index 0000000..6452e08 --- /dev/null +++ b/generator/lemma.h @@ -0,0 +1,58 @@ +#ifndef LEMMA_H_D73105A7 +#define LEMMA_H_D73105A7 + +#include +#include +#include +#include "enums.h" + +namespace verbly { + namespace generator { + + class database; + class form; + + class lemma { + public: + + // Constructors + + explicit lemma(const form& baseForm); + + // Mutators + + void addInflection(inflection type, const form& f); + + // Accessors + + int getId() const + { + return id_; + } + + const form& getBaseForm() const + { + return baseForm_; + } + + std::set getInflections(inflection 
type) const; + + private: + + static int nextId_; + + const int id_; + const form& baseForm_; + + std::map> inflections_; + + }; + + // Serializer + + database& operator<<(database& db, const lemma& arg); + + }; +}; + +#endif /* end of include guard: LEMMA_H_D73105A7 */ diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..827c963 --- /dev/null +++ b/generator/main.cpp @@ -0,0 +1,40 @@ +#include +#include +#include "generator.h" + +void printUsage() +{ + std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output" << std::endl; + std::cout << "verbnet :: path to a VerbNet data directory" << std::endl; + std::cout << "agid :: path to an AGID infl.txt file" << std::endl; + std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; + std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; + std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl; + std::cout << "output :: datafile output path" << std::endl; +} + +int main(int argc, char** argv) +{ + if (argc == 7) + { + try + { + verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); + + try + { + app.run(); + } catch (const std::exception& e) + { + std::cout << e.what() << std::endl; + } + } catch (const std::exception& e) + { + std::cout << e.what() << std::endl; + printUsage(); + } + } else { + std::cout << "verbly datafile generator" << std::endl; + printUsage(); + } +} diff --git a/generator/notion.cpp b/generator/notion.cpp new file mode 100644 index 0000000..290d982 --- /dev/null +++ b/generator/notion.cpp @@ -0,0 +1,85 @@ +#include "notion.h" +#include +#include +#include "database.h" +#include "field.h" + +namespace verbly { + namespace generator { + + int notion::nextId_ = 0; + + notion::notion( + part_of_speech partOfSpeech) : + id_(nextId_++), + partOfSpeech_(partOfSpeech) + { + } + + notion::notion( + part_of_speech partOfSpeech, + int wnid) : + id_(nextId_++), + 
partOfSpeech_(partOfSpeech), + wnid_(wnid), + hasWnid_(true) + { + } + + void notion::incrementNumOfImages() + { + // Calling code should always call hasWnid and check that the notion is a noun first. + assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun)); + + numOfImages_++; + } + + void notion::setPrepositionGroups(std::list groups) + { + // Calling code should always check that the notion is a preposition first. + assert(partOfSpeech_ == part_of_speech::preposition); + + prepositionGroups_ = groups; + } + + database& operator<<(database& db, const notion& arg) + { + // First, serialize the notion + { + std::list fields; + + fields.emplace_back("notion_id", arg.getId()); + fields.emplace_back("part_of_speech", static_cast(arg.getPartOfSpeech())); + + if (arg.hasWnid()) + { + fields.emplace_back("wnid", arg.getWnid()); + + if (arg.getPartOfSpeech() == part_of_speech::noun) + { + fields.emplace_back("images", arg.getNumOfImages()); + } + } + + db.insertIntoTable("notions", std::move(fields)); + } + + // Next, serialize the is_a relationship if this is a preposition + if (arg.getPartOfSpeech() == part_of_speech::preposition) + { + for (std::string group : arg.getPrepositionGroups()) + { + std::list fields; + + fields.emplace_back("notion_id", arg.getId()); + fields.emplace_back("groupname", group); + + db.insertIntoTable("is_a", std::move(fields)); + } + } + + return db; + } + + }; +}; diff --git a/generator/notion.h b/generator/notion.h new file mode 100644 index 0000000..76210de --- /dev/null +++ b/generator/notion.h @@ -0,0 +1,91 @@ +#ifndef NOTION_H_221DE2BC +#define NOTION_H_221DE2BC + +#include +#include +#include +#include "enums.h" + +namespace verbly { + namespace generator { + + class database; + + class notion { + public: + + // Constructors + + explicit notion(part_of_speech partOfSpeech); + + notion(part_of_speech partOfSpeech, int wnid); + + // Mutators + + void incrementNumOfImages(); + + void setPrepositionGroups(std::list groups); + + // 
Accessors + + int getId() const + { + return id_; + } + + part_of_speech getPartOfSpeech() const + { + return partOfSpeech_; + } + + bool hasWnid() const + { + return hasWnid_; + } + + int getWnid() const + { + // Calling code should always call hasWnid first. + assert(hasWnid_); + + return wnid_; + } + + int getNumOfImages() const + { + // Calling code should always call hasWnid and check that the notion is a noun first. + assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun)); + + return numOfImages_; + } + + std::list getPrepositionGroups() const + { + // Calling code should always check that the notion is a preposition first. + assert(partOfSpeech_ == part_of_speech::preposition); + + return prepositionGroups_; + } + + private: + + static int nextId_; + + const int id_; + const part_of_speech partOfSpeech_; + const int wnid_ = 0; + const bool hasWnid_ = false; + + int numOfImages_ = 0; + std::list prepositionGroups_; + + }; + + // Serializer + + database& operator<<(database& db, const notion& arg); + + }; +}; + +#endif /* end of include guard: NOTION_H_221DE2BC */ diff --git a/generator/part.cpp b/generator/part.cpp new file mode 100644 index 0000000..dbd4e11 --- /dev/null +++ b/generator/part.cpp @@ -0,0 +1,336 @@ +#include "part.h" +#include +#include "selrestr.h" + +namespace verbly { + namespace generator { + + part part::createNounPhrase(std::string role, selrestr selrestrs, std::set synrestrs) + { + part p(type::noun_phrase); + + new(&p.noun_phrase_.role) std::string(std::move(role)); + new(&p.noun_phrase_.selrestrs) selrestr(std::move(selrestrs)); + new(&p.noun_phrase_.synrestrs) std::set(std::move(synrestrs)); + + return p; + } + + part part::createVerb() + { + return part(type::verb); + } + + part part::createPreposition(std::set choices, bool literal) + { + part p(type::preposition); + + new(&p.preposition_.choices) std::set(std::move(choices)); + p.preposition_.literal = literal; + + return p; + } + + part part::createAdjective() + { + return 
part(type::adjective); + } + + part part::createAdverb() + { + return part(type::adverb); + } + + part part::createLiteral(std::string value) + { + part p(type::literal); + + new(&p.literal_) std::string(std::move(value)); + + return p; + } + + part::part(const part& other) + { + type_ = other.type_; + + switch (type_) + { + case type::noun_phrase: + { + new(&noun_phrase_.role) std::string(other.noun_phrase_.role); + new(&noun_phrase_.selrestrs) selrestr(other.noun_phrase_.selrestrs); + new(&noun_phrase_.synrestrs) std::set(other.noun_phrase_.synrestrs); + + break; + } + + case type::preposition: + { + new(&preposition_.choices) std::set(other.preposition_.choices); + preposition_.literal = other.preposition_.literal; + + break; + } + + case type::literal: + { + new(&literal_) std::string(other.literal_); + + break; + } + + case type::verb: + case type::adjective: + case type::adverb: + case type::invalid: + { + break; + } + } + } + + part::part(part&& other) : part() + { + swap(*this, other); + } + + part& part::operator=(part other) + { + swap(*this, other); + + return *this; + } + + void swap(part& first, part& second) + { + using type = part::type; + + type tempType = first.type_; + std::string tempRole; + selrestr tempSelrestrs; + std::set tempSynrestrs; + std::set tempChoices; + bool tempPrepLiteral; + std::string tempLiteralValue; + + switch (tempType) + { + case type::noun_phrase: + { + tempRole = std::move(first.noun_phrase_.role); + tempSelrestrs = std::move(first.noun_phrase_.selrestrs); + tempSynrestrs = std::move(first.noun_phrase_.synrestrs); + + break; + } + + case type::preposition: + { + tempChoices = std::move(first.preposition_.choices); + tempPrepLiteral = first.preposition_.literal; + + break; + } + + case type::literal: + { + tempLiteralValue = std::move(first.literal_); + + break; + } + + case type::verb: + case type::adjective: + case type::adverb: + case type::invalid: + { + break; + } + } + + first.~part(); + + first.type_ = second.type_; 
+ + switch (first.type_) + { + case type::noun_phrase: + { + new(&first.noun_phrase_.role) std::string(std::move(second.noun_phrase_.role)); + new(&first.noun_phrase_.selrestrs) selrestr(std::move(second.noun_phrase_.selrestrs)); + new(&first.noun_phrase_.synrestrs) std::set(std::move(second.noun_phrase_.synrestrs)); + + break; + } + + case type::preposition: + { + new(&first.preposition_.choices) std::set(std::move(second.preposition_.choices)); + first.preposition_.literal = second.preposition_.literal; + + break; + } + + case type::literal: + { + new(&first.literal_) std::string(std::move(second.literal_)); + + break; + } + + case type::verb: + case type::adjective: + case type::adverb: + case type::invalid: + { + break; + } + } + + second.~part(); + + second.type_ = tempType; + + switch (second.type_) + { + case type::noun_phrase: + { + new(&second.noun_phrase_.role) std::string(std::move(tempRole)); + new(&second.noun_phrase_.selrestrs) selrestr(std::move(tempSelrestrs)); + new(&second.noun_phrase_.synrestrs) std::set(std::move(tempSynrestrs)); + + break; + } + + case type::preposition: + { + new(&second.preposition_.choices) std::set(std::move(tempChoices)); + second.preposition_.literal = tempPrepLiteral; + + break; + } + + case type::literal: + { + new(&second.literal_) std::string(std::move(tempLiteralValue)); + + break; + } + + case type::verb: + case type::adjective: + case type::adverb: + case type::invalid: + { + break; + } + } + } + + part::~part() + { + switch (type_) + { + case type::noun_phrase: + { + using string_type = std::string; + using set_type = std::set; + + noun_phrase_.role.~string_type(); + noun_phrase_.selrestrs.~selrestr(); + noun_phrase_.synrestrs.~set_type(); + + break; + } + + case type::preposition: + { + using set_type = std::set; + + preposition_.choices.~set_type(); + + break; + } + + case type::literal: + { + using string_type = std::string; + + literal_.~string_type(); + + break; + } + + case type::verb: + case 
type::adjective: + case type::adverb: + case type::invalid: + { + break; + } + } + } + + std::string part::getNounRole() const + { + if (type_ == type::noun_phrase) + { + return noun_phrase_.role; + } else { + throw std::domain_error("part::getNounRole is only valid for noun phrase parts"); + } + } + + selrestr part::getNounSelrestrs() const + { + if (type_ == type::noun_phrase) + { + return noun_phrase_.selrestrs; + } else { + throw std::domain_error("part::getNounSelrestrs is only valid for noun phrase parts"); + } + } + + std::set part::getNounSynrestrs() const + { + if (type_ == type::noun_phrase) + { + return noun_phrase_.synrestrs; + } else { + throw std::domain_error("part::getNounSynrestrs is only valid for noun phrase parts"); + } + } + + std::set part::getPrepositionChoices() const + { + if (type_ == type::preposition) + { + return preposition_.choices; + } else { + throw std::domain_error("part::getPrepositionChoices is only valid for preposition parts"); + } + } + + bool part::isPrepositionLiteral() const + { + if (type_ == type::preposition) + { + return preposition_.literal; + } else { + throw std::domain_error("part::isPrepositionLiteral is only valid for preposition parts"); + } + } + + std::string part::getLiteralValue() const + { + if (type_ == type::literal) + { + return literal_; + } else { + throw std::domain_error("part::getLiteralValue is only valid for literal parts"); + } + } + + }; +}; diff --git a/generator/part.h b/generator/part.h new file mode 100644 index 0000000..d044630 --- /dev/null +++ b/generator/part.h @@ -0,0 +1,114 @@ +#ifndef PART_H_FB54F361 +#define PART_H_FB54F361 + +#include +#include +#include "selrestr.h" + +namespace verbly { + namespace generator { + + class part { + public: + enum class type { + invalid = -1, + noun_phrase = 0, + verb = 1, + preposition = 2, + adjective = 3, + adverb = 4, + literal = 5 + }; + + // Static factories + + static part createNounPhrase(std::string role, selrestr selrestrs, std::set 
synrestrs); + + static part createVerb(); + + static part createPreposition(std::set choices, bool literal); + + static part createAdjective(); + + static part createAdverb(); + + static part createLiteral(std::string value); + + // Copy and move constructors + + part(const part& other); + + part(part&& other); + + // Assignment + + part& operator=(part other); + + // Swap + + friend void swap(part& first, part& second); + + // Destructor + + ~part(); + + // General accessors + + type getType() const + { + return type_; + } + + // Noun phrase accessors + + std::string getNounRole() const; + + selrestr getNounSelrestrs() const; + + std::set getNounSynrestrs() const; + + // Preposition accessors + + std::set getPrepositionChoices() const; + + bool isPrepositionLiteral() const; + + // Literal accessors + + std::string getLiteralValue() const; + + private: + + // Private constructors + + part() + { + } + + part(type t) : type_(t) + { + } + + // Data + + union { + struct { + std::string role; + selrestr selrestrs; + std::set synrestrs; + } noun_phrase_; + struct { + std::set choices; + bool literal; + } preposition_; + std::string literal_; + }; + + type type_ = type::invalid; + + }; + + }; +}; + +#endif /* end of include guard: PART_H_FB54F361 */ diff --git a/generator/progress.h b/generator/progress.h index 81f07a3..fcb680d 100644 --- a/generator/progress.h +++ b/generator/progress.h @@ -3,48 +3,54 @@ #include -class progress { - private: - std::string message; - int total; - int cur = 0; - int lprint = 0; +namespace verbly { + namespace generator { - public: - progress(std::string message, int total) : message(message), total(total) - { - std::cout << message << " 0%" << std::flush; - } + class progress { + private: + std::string message; + int total; + int cur = 0; + int lprint = 0; - void update(int val) - { - if (val <= total) - { - cur = val; - } else { - cur = total; - } + public: + progress(std::string message, int total) : message(message), total(total) + { + 
std::cout << message << " 0%" << std::flush; + } + + void update(int val) + { + if (val <= total) + { + cur = val; + } else { + cur = total; + } - int pp = cur * 100 / total; - if (pp != lprint) - { - lprint = pp; + int pp = cur * 100 / total; + if (pp != lprint) + { + lprint = pp; - std::cout << "\b\b\b\b" << std::right; - std::cout.width(3); - std::cout << pp << "%" << std::flush; - } - } + std::cout << "\b\b\b\b" << std::right; + std::cout.width(3); + std::cout << pp << "%" << std::flush; + } + } + + void update() + { + update(cur+1); + } - void update() - { - update(cur+1); - } + ~progress() + { + std::cout << "\b\b\b\b100%" << std::endl; + } + }; - ~progress() - { - std::cout << "\b\b\b\b100%" << std::endl; - } + }; }; #endif /* end of include guard: PROGRESS_H_A34EF856 */ diff --git a/generator/pronunciation.cpp b/generator/pronunciation.cpp new file mode 100644 index 0000000..eb07607 --- /dev/null +++ b/generator/pronunciation.cpp @@ -0,0 +1,87 @@ +#include "pronunciation.h" +#include +#include +#include +#include +#include "database.h" +#include "field.h" +#include "../lib/util.h" + +namespace verbly { + namespace generator { + + int pronunciation::nextId_ = 0; + + pronunciation::pronunciation(std::string phonemes) : + id_(nextId_++), + phonemes_(phonemes) + { + auto phonemeList = split>(phonemes, " "); + + auto rhymeStart = std::find_if(std::begin(phonemeList), std::end(phonemeList), [] (std::string phoneme) { + return phoneme.find("1") != std::string::npos; + }); + + // Rhyme detection + if (rhymeStart != std::end(phonemeList)) + { + std::list rhymePhonemes; + + std::transform(rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), [] (std::string phoneme) { + std::string naked; + + std::remove_copy_if(std::begin(phoneme), std::end(phoneme), std::back_inserter(naked), [] (char ch) { + return std::isdigit(ch); + }); + + return naked; + }); + + rhyme_ = implode(std::begin(rhymePhonemes), std::end(rhymePhonemes), " "); + + if (rhymeStart != 
std::begin(phonemeList)) + { + prerhyme_ = *std::prev(rhymeStart); + } + } + + // Syllable/stress + for (std::string phoneme : phonemeList) + { + if (std::isdigit(phoneme.back())) + { + // It's a vowel! + syllables_++; + + if (phoneme.back() == '1') + { + stress_.push_back('1'); + } else { + stress_.push_back('0'); + } + } + } + } + + database& operator<<(database& db, const pronunciation& arg) + { + std::list fields; + + fields.emplace_back("pronunciation_id", arg.getId()); + fields.emplace_back("phonemes", arg.getPhonemes()); + fields.emplace_back("syllables", arg.getSyllables()); + fields.emplace_back("stress", arg.getStress()); + + if (arg.hasRhyme()) + { + fields.emplace_back("rhyme", arg.getRhymePhonemes()); + fields.emplace_back("prerhyme", arg.getPrerhyme()); + } + + db.insertIntoTable("pronunciations", std::move(fields)); + + return db; + } + + }; +}; diff --git a/generator/pronunciation.h b/generator/pronunciation.h new file mode 100644 index 0000000..81be6c4 --- /dev/null +++ b/generator/pronunciation.h @@ -0,0 +1,82 @@ +#ifndef PRONUNCIATION_H_584A08DD +#define PRONUNCIATION_H_584A08DD + +#include +#include + +namespace verbly { + namespace generator { + + class database; + + class pronunciation { + public: + + // Constructor + + explicit pronunciation(std::string phonemes); + + // Accessors + + int getId() const + { + return id_; + } + + std::string getPhonemes() const + { + return phonemes_; + } + + bool hasRhyme() const + { + return !rhyme_.empty(); + } + + std::string getRhymePhonemes() const + { + // Calling code should always call hasRhyme first. + assert(!rhyme_.empty()); + + return rhyme_; + } + + std::string getPrerhyme() const + { + // Calling code should always call hasRhyme first. 
+ assert(!rhyme_.empty()); + + return prerhyme_; + } + + int getSyllables() const + { + return syllables_; + } + + std::string getStress() const + { + return stress_; + } + + private: + + static int nextId_; + + const int id_; + const std::string phonemes_; + std::string rhyme_; + std::string prerhyme_; + int syllables_ = 0; + std::string stress_; + + }; + + // Serializer + + database& operator<<(database& db, const pronunciation& arg); + + }; +}; + +#endif /* end of include guard: PRONUNCIATION_H_584A08DD */ diff --git a/generator/role.h b/generator/role.h new file mode 100644 index 0000000..5fa68b8 --- /dev/null +++ b/generator/role.h @@ -0,0 +1,35 @@ +#ifndef ROLE_H_249F9A9C +#define ROLE_H_249F9A9C + +#include "selrestr.h" + +namespace verbly { + namespace generator { + + class role { + public: + + // Mutators + + void setSelrestrs(selrestr selrestrs) + { + selrestrs_ = selrestrs; + } + + // Accessors + + const selrestr& getSelrestrs() const + { + return selrestrs_; + } + + private: + + selrestr selrestrs_; + + }; + + }; +}; + +#endif /* end of include guard: ROLE_H_249F9A9C */ diff --git a/generator/schema.sql b/generator/schema.sql index 410b536..c3e54d8 100644 --- a/generator/schema.sql +++ b/generator/schema.sql @@ -1,286 +1,204 @@ -DROP TABLE IF EXISTS `verbs`; -CREATE TABLE `verbs` ( - `verb_id` INTEGER PRIMARY KEY, - `infinitive` VARCHAR(32) NOT NULL, - `past_tense` VARCHAR(32) NOT NULL, - `past_participle` VARCHAR(32) NOT NULL, - `ing_form` VARCHAR(32) NOT NULL, - `s_form` VARCHAR(32) NOT NULL +CREATE TABLE `notions` ( + `notion_id` INTEGER PRIMARY KEY, + `part_of_speech` SMALLINT NOT NULL, + `wnid` INTEGER, + `images` INTEGER ); -DROP TABLE IF EXISTS `groups`; -CREATE TABLE `groups` ( - `group_id` INTEGER PRIMARY KEY, - `data` BLOB NOT NULL -); - -DROP TABLE IF EXISTS `frames`; -CREATE TABLE `frames` ( - `frame_id` INTEGER PRIMARY KEY, - `group_id` INTEGER NOT NULL, - `data` BLOB NOT NULL, - FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`) -); 
+CREATE UNIQUE INDEX `notion_by_wnid` ON `notions`(`wnid`); -DROP TABLE IF EXISTS `verb_groups`; -CREATE TABLE `verb_groups` ( - `verb_id` INTEGER NOT NULL, - `group_id` INTEGER NOT NULL, - FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`), - FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`) -); - -DROP TABLE IF EXISTS `adjectives`; -CREATE TABLE `adjectives` ( - `adjective_id` INTEGER PRIMARY KEY, - `base_form` VARCHAR(32) NOT NULL, - `comparative` VARCHAR(32), - `superlative` VARCHAR(32), - `position` CHAR(1), - `complexity` INTEGER NOT NULL -); - -DROP TABLE IF EXISTS `adverbs`; -CREATE TABLE `adverbs` ( - `adverb_id` INTEGER PRIMARY KEY, - `base_form` VARCHAR(32) NOT NULL, - `comparative` VARCHAR(32), - `superlative` VARCHAR(32), - `complexity` INTEGER NOT NULL -); - -DROP TABLE IF EXISTS `nouns`; -CREATE TABLE `nouns` ( - `noun_id` INTEGER PRIMARY KEY, - `singular` VARCHAR(32) NOT NULL, - `plural` VARCHAR(32), - `proper` INTEGER(1) NOT NULL, - `complexity` INTEGER NOT NULL, - `images` INTEGER NOT NULL, - `wnid` INTEGER NOT NULL -); - -DROP TABLE IF EXISTS `hypernymy`; CREATE TABLE `hypernymy` ( `hypernym_id` INTEGER NOT NULL, - `hyponym_id` INTEGER NOT NULL, - FOREIGN KEY (`hypernym_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`hyponym_id`) REFERENCES `nouns`(`noun_id`) + `hyponym_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `instantiation`; +CREATE INDEX `hyponym_of` ON `hypernymy`(`hypernym_id`); +CREATE INDEX `hypernym_of` ON `hypernymy`(`hyponym_id`); + CREATE TABLE `instantiation` ( `class_id` INTEGER NOT NULL, - `instance_id` INTEGER NOT NULL, - FOREIGN KEY (`class_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`instance_id`) REFERENCES `nouns`(`noun_id`) + `instance_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `member_meronymy`; +CREATE INDEX `instance_of` ON `instantiation`(`class_id`); +CREATE INDEX `class_of` ON `instantiation`(`instance_id`); + CREATE TABLE `member_meronymy` ( `meronym_id` INTEGER NOT NULL, - `holonym_id` 
INTEGER NOT NULL, - FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) + `holonym_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `part_meronymy`; +CREATE INDEX `member_holonym_of` ON `member_meronymy`(`meronym_id`); +CREATE INDEX `member_meronym_of` ON `member_meronymy`(`holonym_id`); + CREATE TABLE `part_meronymy` ( `meronym_id` INTEGER NOT NULL, - `holonym_id` INTEGER NOT NULL, - FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) + `holonym_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `substance_meronymy`; +CREATE INDEX `part_holonym_of` ON `part_meronymy`(`meronym_id`); +CREATE INDEX `part_meronym_of` ON `part_meronymy`(`holonym_id`); + CREATE TABLE `substance_meronymy` ( `meronym_id` INTEGER NOT NULL, - `holonym_id` INTEGER NOT NULL, - FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) + `holonym_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `variation`; +CREATE INDEX `substance_holonym_of` ON `substance_meronymy`(`meronym_id`); +CREATE INDEX `substance_meronym_of` ON `substance_meronymy`(`holonym_id`); + CREATE TABLE `variation` ( `noun_id` INTEGER NOT NULL, - `adjective_id` INTEGER NOT NULL, - FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) + `adjective_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `noun_antonymy`; -CREATE TABLE `noun_antonymy` ( - `noun_1_id` INTEGER NOT NULL, - `noun_2_id` INTEGER NOT NULL, - FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`) -); +CREATE INDEX `variant_of` ON `variation`(`noun_id`); +CREATE INDEX `attribute_of` ON `variation`(`adjective_id`); -DROP TABLE IF EXISTS `adjective_antonymy`; -CREATE TABLE `adjective_antonymy` ( +CREATE TABLE `similarity` ( `adjective_1_id` INTEGER NOT NULL, - 
`adjective_2_id` INTEGER NOT NULL, - FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), - FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) + `adjective_2_id` INTEGER NOT NULL +); + +CREATE INDEX `similar_to` ON `similarity`(`adjective_1_id`); + +CREATE TABLE `is_a` ( + `notion_id` INTEGER NOT NULL, + `groupname` VARCHAR(32) NOT NULL ); -DROP TABLE IF EXISTS `adverb_antonymy`; -CREATE TABLE `adverb_antonymy` ( - `adverb_1_id` INTEGER NOT NULL, - `adverb_2_id` INTEGER NOT NULL, - FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), - FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) +CREATE TABLE `entailment` ( + `given_id` INTEGER NOT NULL, + `entailment_id` INTEGER NOT NULL +); + +CREATE INDEX `entailment_of` ON `entailment`(`given_id`); +CREATE INDEX `entailed_by` ON `entailment`(`entailment_id`); + +CREATE TABLE `causality` ( + `cause_id` INTEGER NOT NULL, + `effect_id` INTEGER NOT NULL +); + +CREATE INDEX `effect_of` ON `causality`(`cause_id`); +CREATE INDEX `cause_of` ON `causality`(`effect_id`); + +CREATE TABLE `words` ( + `word_id` INTEGER PRIMARY KEY, + `notion_id` INTEGER NOT NULL, + `lemma_id` INTEGER NOT NULL, + `tag_count` INTEGER, + `position` SMALLINT, + `group_id` INTEGER +); + +CREATE INDEX `notion_words` ON `words`(`notion_id`); +CREATE INDEX `lemma_words` ON `words`(`lemma_id`); +CREATE INDEX `group_words` ON `words`(`group_id`); + +CREATE TABLE `antonymy` ( + `antonym_1_id` INTEGER NOT NULL, + `antonym_2_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `specification`; +CREATE INDEX `antonym_of` ON `antonymy`(`antonym_1_id`); + CREATE TABLE `specification` ( `general_id` INTEGER NOT NULL, - `specific_id` INTEGER NOT NULL, - FOREIGN KEY (`general_id`) REFERENCES `adjectives`(`adjective_id`), - FOREIGN KEY (`specific_id`) REFERENCES `adjectives`(`adjective_id`) + `specific_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `pertainymy`; +CREATE INDEX `specification_of` ON 
`specification`(`general_id`); +CREATE INDEX `generalization_of` ON `specification`(`specific_id`); + CREATE TABLE `pertainymy` ( `noun_id` INTEGER NOT NULL, - `pertainym_id` INTEGER NOT NULL, - FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`pertainym_id`) REFERENCES `adjectives`(`adjective_id`) + `pertainym_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `mannernymy`; +CREATE INDEX `pertainym_of` ON `pertainymy`(`noun_id`); +CREATE INDEX `anti_pertainym_of` ON `pertainymy`(`pertainym_id`); + CREATE TABLE `mannernymy` ( `adjective_id` INTEGER NOT NULL, - `mannernym_id` INTEGER NOT NULL, - FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`), - FOREIGN KEY (`mannernym_id`) REFERENCES `adverbs`(`adverb_id`) + `mannernym_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `noun_synonymy`; -CREATE TABLE `noun_synonymy` ( - `noun_1_id` INTEGER NOT NULL, - `noun_2_id` INTEGER NOT NULL, - FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`nouns_id`), - FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`nouns_id`) -); +CREATE INDEX `mannernym_of` ON `mannernymy`(`adjective_id`); +CREATE INDEX `anti_mannernym_of` ON `mannernymy`(`mannernym_id`); -DROP TABLE IF EXISTS `adjective_synonymy`; -CREATE TABLE `adjective_synonymy` ( - `adjective_1_id` INTEGER NOT NULL, - `adjective_2_id` INTEGER NOT NULL, - FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), - FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) +CREATE TABLE `usage` ( + `domain_id` INTEGER NOT NULL, + `term_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `adverb_synonymy`; -CREATE TABLE `adverb_synonymy` ( - `adverb_1_id` INTEGER NOT NULL, - `adverb_2_id` INTEGER NOT NULL, - FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), - FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) -); +CREATE INDEX `usage_term_of` ON `usage`(`domain_id`); +CREATE INDEX `usage_domain_of` ON `usage`(`term_id`); -DROP TABLE IF EXISTS `noun_pronunciations`; 
-CREATE TABLE `noun_pronunciations` ( - `noun_id` INTEGER NOT NULL, - `pronunciation` VARCHAR(64) NOT NULL, - `prerhyme` VARCHAR(8), - `rhyme` VARCHAR(64), - `syllables` INT NOT NULL, - `stress` VARCHAR(64) NOT NULL, - FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`) +CREATE TABLE `topicality` ( + `domain_id` INTEGER NOT NULL, + `term_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `verb_pronunciations`; -CREATE TABLE `verb_pronunciations` ( - `verb_id` INTEGER NOT NULL, - `pronunciation` VARCHAR(64) NOT NULL, - `prerhyme` VARCHAR(8), - `rhyme` VARCHAR(64), - `syllables` INT NOT NULL, - `stress` VARCHAR(64) NOT NULL, - FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`) -); +CREATE INDEX `topical_term_of` ON `topicality`(`domain_id`); +CREATE INDEX `topical_domain_of` ON `topicality`(`term_id`); -DROP TABLE IF EXISTS `adjective_pronunciations`; -CREATE TABLE `adjective_pronunciations` ( - `adjective_id` INTEGER NOT NULL, - `pronunciation` VARCHAR(64) NOT NULL, - `prerhyme` VARCHAR(8), - `rhyme` VARCHAR(64), - `syllables` INT NOT NULL, - `stress` VARCHAR(64) NOT NULL, - FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) +CREATE TABLE `regionality` ( + `domain_id` INTEGER NOT NULL, + `term_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS `adverb_pronunciations`; -CREATE TABLE `adverb_pronunciations` ( - `adverb_id` INTEGER NOT NULL, - `pronunciation` VARCHAR(64) NOT NULL, - `prerhyme` VARCHAR(8), - `rhyme` VARCHAR(64), - `syllables` INT NOT NULL, - `stress` VARCHAR(64) NOT NULL, - FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`) -); +CREATE INDEX `regional_term_of` ON `regionality`(`domain_id`); +CREATE INDEX `regional_domain_of` ON `regionality`(`term_id`); -DROP TABLE IF EXISTS `noun_noun_derivation`; -CREATE TABLE `noun_noun_derivation` ( - `noun_1_id` INTEGER NOT NULL, - `noun_2_id` INTEGER NOT NULL, - FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`) +CREATE TABLE 
`forms` ( + `form_id` INTEGER PRIMARY KEY, + `form` VARCHAR(32) NOT NULL, + `complexity` SMALLINT NOT NULL, + `proper` SMALLINT NOT NULL ); -DROP TABLE IF EXISTS `noun_adjective_derivation`; -CREATE TABLE `noun_adjective_derivation` ( - `noun_id` INTEGER NOT NULL, - `adjective_id` INTEGER NOT NULL, - FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) -); +CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); -DROP TABLE IF EXISTS `noun_adverb_derivation`; -CREATE TABLE `noun_adverb_derivation` ( - `noun_id` INTEGER NOT NULL, - `adverb_id` INTEGER NOT NULL, - FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), - FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`) +CREATE TABLE `lemmas_forms` ( + `lemma_id` INTEGER NOT NULL, + `form_id` INTEGER NOT NULL, + `category` SMALLINT NOT NULL ); -DROP TABLE IF EXISTS `adjective_adjective_derivation`; -CREATE TABLE `adjective_adjective_derivation` ( - `adjective_1_id` INTEGER NOT NULL, - `adjective_2_id` INTEGER NOT NULL, - FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), - FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) +CREATE INDEX `form_of` ON `lemmas_forms`(`lemma_id`); +CREATE INDEX `lemma_of` ON `lemmas_forms`(`form_id`); + +CREATE TABLE `pronunciations` ( + `pronunciation_id` INTEGER PRIMARY KEY, + `phonemes` VARCHAR(64) NOT NULL, + `prerhyme` VARCHAR(8), + `rhyme` VARCHAR(64), + `syllables` INTEGER NOT NULL, + `stress` VARCHAR(64) NOT NULL ); -DROP TABLE IF EXISTS `adjective_adverb_derivation`; -CREATE TABLE `adjective_adverb_derivation` ( - `adjective_id` INTEGER NOT NULL, - `adverb_id` INTEGER NOT NULL, - FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`), - FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adjective_id`) +CREATE TABLE `forms_pronunciations` ( + `form_id` INTEGER NOT NULL, + `pronunciation_id` INTEGER NOT NULL ); -DROP TABLE IF EXISTS 
`adverb_adverb_derivation`; -CREATE TABLE `adverb_adverb_derivation` ( - `adverb_1_id` INTEGER NOT NULL, - `adverb_2_id` INTEGER NOT NULL, - FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), - FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) +CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`); +CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`); + +CREATE TABLE `groups` ( + `group_id` INTEGER PRIMARY KEY, + `data` BLOB NOT NULL ); -DROP TABLE IF EXISTS `prepositions`; -CREATE TABLE `prepositions` ( - `preposition_id` INTEGER PRIMARY KEY, - `form` VARCHAR(32) NOT NULL +CREATE TABLE `frames` ( + `frame_id` INTEGER PRIMARY KEY, + `data` BLOB NOT NULL ); -DROP TABLE IF EXISTS `preposition_groups`; -CREATE TABLE `preposition_groups` ( - `preposition_id` INTEGER NOT NULL, - `groupname` VARCHAR(32) NOT NULL, - FOREIGN KEY (`preposition_id`) REFERENCES `prepositions`(`preposition_id`) +CREATE TABLE `groups_frames` ( + `group_id` INTEGER NOT NULL, + `frame_id` INTEGER NOT NULL ); + +CREATE INDEX `frames_in` ON `groups_frames`(`group_id`); diff --git a/generator/selrestr.cpp b/generator/selrestr.cpp new file mode 100644 index 0000000..8bdd3f6 --- /dev/null +++ b/generator/selrestr.cpp @@ -0,0 +1,288 @@ +#include "selrestr.h" + +namespace verbly { + namespace generator { + + selrestr::selrestr(const selrestr& other) + { + type_ = other.type_; + + switch (type_) + { + case type::singleton: + { + singleton_.pos = other.singleton_.pos; + new(&singleton_.restriction) std::string(other.singleton_.restriction); + + break; + } + + case type::group: + { + new(&group_.children) std::list(other.group_.children); + group_.orlogic = other.group_.orlogic; + + break; + } + + case type::empty: + { + break; + } + } + } + + selrestr::selrestr(selrestr&& other) : selrestr() + { + swap(*this, other); + } + + selrestr& selrestr::operator=(selrestr other) + { + swap(*this, other); + + return *this; + } + + void swap(selrestr& 
first, selrestr& second) + { + using type = selrestr::type; + + type tempType = first.type_; + int tempPos; + std::string tempRestriction; + std::list tempChildren; + bool tempOrlogic; + + switch (tempType) + { + case type::singleton: + { + tempPos = first.singleton_.pos; + tempRestriction = std::move(first.singleton_.restriction); + + break; + } + + case type::group: + { + tempChildren = std::move(first.group_.children); + tempOrlogic = first.group_.orlogic; + + break; + } + + case type::empty: + { + break; + } + } + + first.~selrestr(); + + first.type_ = second.type_; + + switch (first.type_) + { + case type::singleton: + { + first.singleton_.pos = second.singleton_.pos; + new(&first.singleton_.restriction) std::string(std::move(second.singleton_.restriction)); + + break; + } + + case type::group: + { + new(&first.group_.children) std::list(std::move(second.group_.children)); + first.group_.orlogic = second.group_.orlogic; + + break; + } + + case type::empty: + { + break; + } + } + + second.~selrestr(); + + second.type_ = tempType; + + switch (second.type_) + { + case type::singleton: + { + second.singleton_.pos = tempPos; + new(&second.singleton_.restriction) std::string(std::move(tempRestriction)); + + break; + } + + case type::group: + { + new(&second.group_.children) std::list(std::move(tempChildren)); + second.group_.orlogic = tempOrlogic; + + break; + } + + case type::empty: + { + break; + } + } + } + + selrestr::~selrestr() + { + switch (type_) + { + case type::singleton: + { + using string_type = std::string; + singleton_.restriction.~string_type(); + + break; + } + + case type::group: + { + using list_type = std::list; + group_.children.~list_type(); + + break; + } + + case type::empty: + { + break; + } + } + } + + selrestr::selrestr() : type_(type::empty) + { + } + + selrestr::selrestr( + std::string restriction, + bool pos) : + type_(type::singleton) + { + new(&singleton_.restriction) std::string(std::move(restriction)); + singleton_.pos = pos; + } + + 
std::string selrestr::getRestriction() const + { + if (type_ == type::singleton) + { + return singleton_.restriction; + } else { + throw std::domain_error("Only singleton selrestrs have restrictions"); + } + } + + bool selrestr::getPos() const + { + if (type_ == type::singleton) + { + return singleton_.pos; + } else { + throw std::domain_error("Only singleton selrestrs have positivity flags"); + } + } + + selrestr::selrestr( + std::list children, + bool orlogic) : + type_(type::group) + { + new(&group_.children) std::list(std::move(children)); + group_.orlogic = orlogic; + } + + std::list selrestr::getChildren() const + { + if (type_ == type::group) + { + return group_.children; + } else { + throw std::domain_error("Only group selrestrs have children"); + } + } + + std::list::const_iterator selrestr::begin() const + { + if (type_ == type::group) + { + return std::begin(group_.children); + } else { + throw std::domain_error("Only group selrestrs have children"); + } + } + + std::list::const_iterator selrestr::end() const + { + if (type_ == type::group) + { + return std::end(group_.children); + } else { + throw std::domain_error("Only group selrestrs have children"); + } + } + + bool selrestr::getOrlogic() const + { + if (type_ == type::group) + { + return group_.orlogic; + } else { + throw std::domain_error("Only group selrestrs have logic"); + } + } + + nlohmann::json selrestr::toJson() const + { + switch (type_) + { + case type::empty: + { + return {}; + } + + case type::singleton: + { + return { + {"type", singleton_.restriction}, + {"pos", singleton_.pos} + }; + } + + case type::group: + { + std::string logic; + if (group_.orlogic) + { + logic = "or"; + } else { + logic = "and"; + } + + std::list children; + std::transform(std::begin(group_.children), std::end(group_.children), std::back_inserter(children), [] (const selrestr& child) { + return child.toJson(); + }); + + return { + {"logic", logic}, + {"children", children} + }; + } + } + } + + }; +}; diff --git 
a/generator/selrestr.h b/generator/selrestr.h new file mode 100644 index 0000000..5000970 --- /dev/null +++ b/generator/selrestr.h @@ -0,0 +1,88 @@ +#ifndef SELRESTR_H_50652FB7 +#define SELRESTR_H_50652FB7 + +#include +#include +#include + +namespace verbly { + namespace generator { + + class selrestr { + public: + enum class type { + empty, + singleton, + group + }; + + // Copy and move constructors + + selrestr(const selrestr& other); + selrestr(selrestr&& other); + + // Assignment + + selrestr& operator=(selrestr other); + + // Swap + + friend void swap(selrestr& first, selrestr& second); + + // Destructor + + ~selrestr(); + + // Generic accessors + + type getType() const + { + return type_; + } + + // Empty + + selrestr(); + + // Singleton + + selrestr(std::string restriction, bool pos); + + std::string getRestriction() const; + + bool getPos() const; + + // Group + + selrestr(std::list children, bool orlogic); + + std::list getChildren() const; + + std::list::const_iterator begin() const; + + std::list::const_iterator end() const; + + bool getOrlogic() const; + + // Helpers + + nlohmann::json toJson() const; + + private: + union { + struct { + bool pos; + std::string restriction; + } singleton_; + struct { + std::list children; + bool orlogic; + } group_; + }; + type type_; + }; + + }; +}; + +#endif /* end of include guard: SELRESTR_H_50652FB7 */ diff --git a/generator/word.cpp b/generator/word.cpp new file mode 100644 index 0000000..8ba3ce2 --- /dev/null +++ b/generator/word.cpp @@ -0,0 +1,77 @@ +#include "word.h" +#include +#include +#include "database.h" +#include "notion.h" +#include "lemma.h" +#include "field.h" +#include "group.h" + +namespace verbly { + namespace generator { + + int word::nextId_ = 0; + + word::word( + notion& n, + lemma& l) : + id_(nextId_++), + notion_(n), + lemma_(l) + { + } + + word::word( + notion& n, + lemma& l, + int tagCount) : + id_(nextId_++), + notion_(n), + lemma_(l), + tagCount_(tagCount), + hasTagCount_(true) + { + } + + 
void word::setAdjectivePosition(positioning adjectivePosition) + { + adjectivePosition_ = adjectivePosition; + } + + void word::setVerbGroup(const group& verbGroup) + { + verbGroup_ = &verbGroup; + } + + database& operator<<(database& db, const word& arg) + { + std::list fields; + + fields.emplace_back("word_id", arg.getId()); + fields.emplace_back("notion_id", arg.getNotion().getId()); + fields.emplace_back("lemma_id", arg.getLemma().getId()); + + if (arg.hasTagCount()) + { + fields.emplace_back("tag_count", arg.getTagCount()); + } + + if ((arg.getNotion().getPartOfSpeech() == part_of_speech::adjective) + && (arg.getAdjectivePosition() != positioning::undefined)) + { + fields.emplace_back("position", static_cast(arg.getAdjectivePosition())); + } + + if ((arg.getNotion().getPartOfSpeech() == part_of_speech::verb) + && (arg.hasVerbGroup())) + { + fields.emplace_back("group_id", arg.getVerbGroup().getId()); + } + + db.insertIntoTable("words", std::move(fields)); + + return db; + } + + }; +}; diff --git a/generator/word.h b/generator/word.h new file mode 100644 index 0000000..bfed586 --- /dev/null +++ b/generator/word.h @@ -0,0 +1,110 @@ +#ifndef WORD_H_91F99D46 +#define WORD_H_91F99D46 + +#include +#include "enums.h" + +namespace verbly { + namespace generator { + + class notion; + class lemma; + class database; + class group; + + class word { + public: + + // Constructors + + word(notion& n, lemma& l); + + word(notion& n, lemma& l, int tagCount); + + // Mutators + + void setAdjectivePosition(positioning adjectivePosition); + + void setVerbGroup(const group& verbGroup); + + // Accessors + + int getId() const + { + return id_; + } + + notion& getNotion() + { + return notion_; + } + + const notion& getNotion() const + { + return notion_; + } + + lemma& getLemma() + { + return lemma_; + } + + const lemma& getLemma() const + { + return lemma_; + } + + bool hasTagCount() const + { + return hasTagCount_; + } + + int getTagCount() const + { + // Calling code should always 
call hasTagCount first. + assert(hasTagCount_); + + return tagCount_; + } + + positioning getAdjectivePosition() const + { + return adjectivePosition_; + } + + bool hasVerbGroup() const + { + return (verbGroup_ != nullptr); + } + + const group& getVerbGroup() const + { + // Calling code should always call hasVerbGroup first. + assert(verbGroup_ != nullptr); + + return *verbGroup_; + } + + private: + + static int nextId_; + + const int id_; + notion& notion_; + lemma& lemma_; + const int tagCount_ = 0; + const bool hasTagCount_ = false; + + positioning adjectivePosition_ = positioning::undefined; + const group* verbGroup_ = nullptr; + + }; + + // Serializer + + database& operator<<(database& db, const word& arg); + + }; +}; + +#endif /* end of include guard: WORD_H_91F99D46 */ -- cgit 1.4.1