diff options
Diffstat (limited to 'generator')
| -rw-r--r-- | generator/CMakeLists.txt | 6 | ||||
| -rw-r--r-- | generator/database.cpp | 173 | ||||
| -rw-r--r-- | generator/database.h | 73 | ||||
| -rw-r--r-- | generator/field.cpp | 193 | ||||
| -rw-r--r-- | generator/field.h | 76 | ||||
| -rw-r--r-- | generator/form.cpp | 53 | ||||
| -rw-r--r-- | generator/form.h | 71 | ||||
| -rw-r--r-- | generator/frame.cpp | 83 | ||||
| -rw-r--r-- | generator/frame.h | 59 | ||||
| -rw-r--r-- | generator/generator.cpp | 3145 | ||||
| -rw-r--r-- | generator/generator.h | 151 | ||||
| -rw-r--r-- | generator/group.cpp | 119 | ||||
| -rw-r--r-- | generator/group.h | 80 | ||||
| -rw-r--r-- | generator/lemma.cpp | 65 | ||||
| -rw-r--r-- | generator/lemma.h | 58 | ||||
| -rw-r--r-- | generator/main.cpp | 40 | ||||
| -rw-r--r-- | generator/notion.cpp | 85 | ||||
| -rw-r--r-- | generator/notion.h | 91 | ||||
| -rw-r--r-- | generator/part.cpp | 336 | ||||
| -rw-r--r-- | generator/part.h | 114 | ||||
| -rw-r--r-- | generator/progress.h | 78 | ||||
| -rw-r--r-- | generator/pronunciation.cpp | 87 | ||||
| -rw-r--r-- | generator/pronunciation.h | 82 | ||||
| -rw-r--r-- | generator/role.h | 35 | ||||
| -rw-r--r-- | generator/schema.sql | 352 | ||||
| -rw-r--r-- | generator/selrestr.cpp | 288 | ||||
| -rw-r--r-- | generator/selrestr.h | 88 | ||||
| -rw-r--r-- | generator/word.cpp | 77 | ||||
| -rw-r--r-- | generator/word.h | 110 |
29 files changed, 4018 insertions, 2250 deletions
| diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt index 552526d..4f78eb8 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt | |||
| @@ -1,12 +1,12 @@ | |||
| 1 | cmake_minimum_required (VERSION 2.6) | 1 | cmake_minimum_required (VERSION 3.1) |
| 2 | project (generator) | 2 | project (generator) |
| 3 | 3 | ||
| 4 | find_package(PkgConfig) | 4 | find_package(PkgConfig) |
| 5 | pkg_check_modules(sqlite3 sqlite3 REQUIRED) | 5 | pkg_check_modules(sqlite3 sqlite3 REQUIRED) |
| 6 | find_package(libxml2 REQUIRED) | 6 | find_package(libxml2 REQUIRED) |
| 7 | 7 | ||
| 8 | include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json/src) | 8 | include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json) |
| 9 | add_executable(generator generator.cpp) | 9 | add_executable(generator notion.cpp word.cpp lemma.cpp form.cpp pronunciation.cpp group.cpp frame.cpp part.cpp selrestr.cpp database.cpp field.cpp generator.cpp main.cpp) |
| 10 | set_property(TARGET generator PROPERTY CXX_STANDARD 11) | 10 | set_property(TARGET generator PROPERTY CXX_STANDARD 11) |
| 11 | set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) | 11 | set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) |
| 12 | target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) | 12 | target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) |
| diff --git a/generator/database.cpp b/generator/database.cpp new file mode 100644 index 0000000..c7e4cfa --- /dev/null +++ b/generator/database.cpp | |||
| @@ -0,0 +1,173 @@ | |||
| 1 | #include "database.h" | ||
| 2 | #include <sqlite3.h> | ||
| 3 | #include <cassert> | ||
| 4 | #include <fstream> | ||
| 5 | #include <stdexcept> | ||
| 6 | #include <cstdio> | ||
| 7 | #include <sstream> | ||
| 8 | #include "field.h" | ||
| 9 | #include "../lib/util.h" | ||
| 10 | |||
| 11 | namespace verbly { | ||
| 12 | namespace generator { | ||
| 13 | |||
| 14 | sqlite3_error::sqlite3_error( | ||
| 15 | const std::string& what, | ||
| 16 | const std::string& db_err) : | ||
| 17 | what_(what + " (" + db_err + ")"), | ||
| 18 | db_err_(db_err) | ||
| 19 | { | ||
| 20 | } | ||
| 21 | |||
| 22 | const char* sqlite3_error::what() const noexcept | ||
| 23 | { | ||
| 24 | return what_.c_str(); | ||
| 25 | } | ||
| 26 | |||
| 27 | const char* sqlite3_error::db_err() const noexcept | ||
| 28 | { | ||
| 29 | return db_err_.c_str(); | ||
| 30 | } | ||
| 31 | |||
| 32 | database::database(std::string path) | ||
| 33 | { | ||
| 34 | // If there is already a file at this path, overwrite it. | ||
| 35 | if (std::ifstream(path)) | ||
| 36 | { | ||
| 37 | if (std::remove(path.c_str())) | ||
| 38 | { | ||
| 39 | throw std::logic_error("Could not overwrite file at path"); | ||
| 40 | } | ||
| 41 | } | ||
| 42 | |||
| 43 | if (sqlite3_open_v2(path.c_str(), &ppdb_, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | ||
| 44 | { | ||
| 45 | // We still have to free the resources allocated. In the event that | ||
| 46 | // allocation failed, ppdb will be null and sqlite3_close_v2 will just | ||
| 47 | // ignore it. | ||
| 48 | std::string errmsg(sqlite3_errmsg(ppdb_)); | ||
| 49 | sqlite3_close_v2(ppdb_); | ||
| 50 | |||
| 51 | throw sqlite3_error("Could not create output datafile", errmsg); | ||
| 52 | } | ||
| 53 | } | ||
| 54 | |||
| 55 | database::database(database&& other) : database() | ||
| 56 | { | ||
| 57 | swap(*this, other); | ||
| 58 | } | ||
| 59 | |||
| 60 | database& database::operator=(database&& other) | ||
| 61 | { | ||
| 62 | swap(*this, other); | ||
| 63 | |||
| 64 | return *this; | ||
| 65 | } | ||
| 66 | |||
| 67 | void swap(database& first, database& second) | ||
| 68 | { | ||
| 69 | std::swap(first.ppdb_, second.ppdb_); | ||
| 70 | } | ||
| 71 | |||
| 72 | database::~database() | ||
| 73 | { | ||
| 74 | sqlite3_close_v2(ppdb_); | ||
| 75 | } | ||
| 76 | |||
| 77 | void database::runQuery(std::string query) | ||
| 78 | { | ||
| 79 | // This can only happen when doing bad things with move semantics. | ||
| 80 | assert(ppdb_ != nullptr); | ||
| 81 | |||
| 82 | sqlite3_stmt* ppstmt; | ||
| 83 | |||
| 84 | if (sqlite3_prepare_v2(ppdb_, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 85 | { | ||
| 86 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
| 87 | } | ||
| 88 | |||
| 89 | int result = sqlite3_step(ppstmt); | ||
| 90 | sqlite3_finalize(ppstmt); | ||
| 91 | |||
| 92 | if (result != SQLITE_DONE) | ||
| 93 | { | ||
| 94 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
| 95 | } | ||
| 96 | } | ||
| 97 | |||
| 98 | void database::insertIntoTable(std::string table, std::list<field> fields) | ||
| 99 | { | ||
| 100 | // This can only happen when doing bad things with move semantics. | ||
| 101 | assert(ppdb_ != nullptr); | ||
| 102 | |||
| 103 | // This shouldn't happen. | ||
| 104 | assert(!fields.empty()); | ||
| 105 | |||
| 106 | std::list<std::string> fieldNames; | ||
| 107 | std::list<std::string> qs; | ||
| 108 | for (field& f : fields) | ||
| 109 | { | ||
| 110 | fieldNames.push_back(f.getName()); | ||
| 111 | qs.push_back("?"); | ||
| 112 | } | ||
| 113 | |||
| 114 | std::ostringstream query; | ||
| 115 | query << "INSERT INTO "; | ||
| 116 | query << table; | ||
| 117 | query << " ("; | ||
| 118 | query << implode(std::begin(fieldNames), std::end(fieldNames), ", "); | ||
| 119 | query << ") VALUES ("; | ||
| 120 | query << implode(std::begin(qs), std::end(qs), ", "); | ||
| 121 | query << ")"; | ||
| 122 | |||
| 123 | std::string query_str = query.str(); | ||
| 124 | |||
| 125 | sqlite3_stmt* ppstmt; | ||
| 126 | |||
| 127 | if (sqlite3_prepare_v2(ppdb_, query_str.c_str(), query_str.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 128 | { | ||
| 129 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
| 130 | } | ||
| 131 | |||
| 132 | int i = 1; | ||
| 133 | for (field& f : fields) | ||
| 134 | { | ||
| 135 | switch (f.getType()) | ||
| 136 | { | ||
| 137 | case field::type::integer: | ||
| 138 | { | ||
| 139 | sqlite3_bind_int(ppstmt, i, f.getInteger()); | ||
| 140 | |||
| 141 | break; | ||
| 142 | } | ||
| 143 | |||
| 144 | case field::type::string: | ||
| 145 | { | ||
| 146 | sqlite3_bind_text(ppstmt, i, f.getString().c_str(), f.getString().length(), SQLITE_TRANSIENT); | ||
| 147 | |||
| 148 | break; | ||
| 149 | } | ||
| 150 | |||
| 151 | case field::type::invalid: | ||
| 152 | { | ||
| 153 | // Fields can only be invalid when doing bad things with move semantics. | ||
| 154 | assert(false); | ||
| 155 | |||
| 156 | break; | ||
| 157 | } | ||
| 158 | } | ||
| 159 | |||
| 160 | i++; | ||
| 161 | } | ||
| 162 | |||
| 163 | int result = sqlite3_step(ppstmt); | ||
| 164 | sqlite3_finalize(ppstmt); | ||
| 165 | |||
| 166 | if (result != SQLITE_DONE) | ||
| 167 | { | ||
| 168 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
| 169 | } | ||
| 170 | } | ||
| 171 | |||
| 172 | }; | ||
| 173 | }; | ||
| diff --git a/generator/database.h b/generator/database.h new file mode 100644 index 0000000..15cdff5 --- /dev/null +++ b/generator/database.h | |||
| @@ -0,0 +1,73 @@ | |||
#ifndef DATABASE_H_0B0A47D2
#define DATABASE_H_0B0A47D2

#include <string>
#include <exception>
#include <list>

struct sqlite3;

namespace verbly {
  namespace generator {

    class field;

    // Exception thrown when a SQLite operation fails. Carries both a
    // human-readable description and the underlying SQLite error string.
    class sqlite3_error : public std::exception {
    public:

      sqlite3_error(const std::string& what, const std::string& db_err);

      const char* what() const noexcept override;
      const char* db_err() const noexcept;

    private:
      std::string what_;
      std::string db_err_;

    };

    // Owns a SQLite connection to the output datafile. Movable but not
    // copyable; the connection is closed on destruction.
    class database {
    public:

      // Opens (overwriting, if necessary) the datafile at the given path.

      explicit database(std::string path);

      // Copying is disabled because this class owns the connection handle.

      database(const database& other) = delete;
      database& operator=(const database& other) = delete;

      // Moving transfers ownership of the connection handle.

      database(database&& other);
      database& operator=(database&& other);

      // Exchanges the connection handles of two databases.

      friend void swap(database& first, database& second);

      // Closes the connection, if one is open.

      ~database();

      // Executes a single SQL statement; throws sqlite3_error on failure.

      void runQuery(std::string query);

      // Inserts one row into the named table; throws sqlite3_error on failure.

      void insertIntoTable(std::string table, std::list<field> fields);

    private:

      // Produces an empty shell for the move operations to swap into.
      database()
      {
      }

      sqlite3* ppdb_ = nullptr;

    };

  };
};

#endif /* end of include guard: DATABASE_H_0B0A47D2 */
| diff --git a/generator/field.cpp b/generator/field.cpp new file mode 100644 index 0000000..84b2f91 --- /dev/null +++ b/generator/field.cpp | |||
| @@ -0,0 +1,193 @@ | |||
| 1 | #include "field.h" | ||
| 2 | #include <stdexcept> | ||
| 3 | #include <utility> | ||
| 4 | |||
| 5 | namespace verbly { | ||
| 6 | namespace generator { | ||
| 7 | |||
| 8 | field::field(const field& other) | ||
| 9 | { | ||
| 10 | type_ = other.type_; | ||
| 11 | name_ = other.name_; | ||
| 12 | |||
| 13 | switch (type_) | ||
| 14 | { | ||
| 15 | case type::integer: | ||
| 16 | { | ||
| 17 | integer_ = other.integer_; | ||
| 18 | |||
| 19 | break; | ||
| 20 | } | ||
| 21 | |||
| 22 | case type::string: | ||
| 23 | { | ||
| 24 | new(&string_) std::string(other.string_); | ||
| 25 | |||
| 26 | break; | ||
| 27 | } | ||
| 28 | |||
| 29 | case type::invalid: | ||
| 30 | { | ||
| 31 | break; | ||
| 32 | } | ||
| 33 | } | ||
| 34 | } | ||
| 35 | |||
| 36 | field::field(field&& other) : field() | ||
| 37 | { | ||
| 38 | swap(*this, other); | ||
| 39 | } | ||
| 40 | |||
| 41 | field& field::operator=(field other) | ||
| 42 | { | ||
| 43 | swap(*this, other); | ||
| 44 | |||
| 45 | return *this; | ||
| 46 | } | ||
| 47 | |||
| 48 | void swap(field& first, field& second) | ||
| 49 | { | ||
| 50 | using type = field::type; | ||
| 51 | |||
| 52 | type tempType = first.type_; | ||
| 53 | std::string tempName = std::move(first.name_); | ||
| 54 | int tempInteger; | ||
| 55 | std::string tempString; | ||
| 56 | |||
| 57 | switch (first.type_) | ||
| 58 | { | ||
| 59 | case type::integer: | ||
| 60 | { | ||
| 61 | tempInteger = first.integer_; | ||
| 62 | |||
| 63 | break; | ||
| 64 | } | ||
| 65 | |||
| 66 | case type::string: | ||
| 67 | { | ||
| 68 | tempString = std::move(tempString); | ||
| 69 | |||
| 70 | break; | ||
| 71 | } | ||
| 72 | |||
| 73 | case type::invalid: | ||
| 74 | { | ||
| 75 | break; | ||
| 76 | } | ||
| 77 | } | ||
| 78 | |||
| 79 | first.~field(); | ||
| 80 | |||
| 81 | first.type_ = second.type_; | ||
| 82 | first.name_ = std::move(second.name_); | ||
| 83 | |||
| 84 | switch (second.type_) | ||
| 85 | { | ||
| 86 | case type::integer: | ||
| 87 | { | ||
| 88 | first.integer_ = second.integer_; | ||
| 89 | |||
| 90 | break; | ||
| 91 | } | ||
| 92 | |||
| 93 | case type::string: | ||
| 94 | { | ||
| 95 | new(&first.string_) std::string(std::move(second.string_)); | ||
| 96 | |||
| 97 | break; | ||
| 98 | } | ||
| 99 | |||
| 100 | case type::invalid: | ||
| 101 | { | ||
| 102 | break; | ||
| 103 | } | ||
| 104 | } | ||
| 105 | |||
| 106 | second.~field(); | ||
| 107 | |||
| 108 | second.type_ = tempType; | ||
| 109 | second.name_ = std::move(tempName); | ||
| 110 | |||
| 111 | switch (tempType) | ||
| 112 | { | ||
| 113 | case type::integer: | ||
| 114 | { | ||
| 115 | second.integer_ = tempInteger; | ||
| 116 | |||
| 117 | break; | ||
| 118 | } | ||
| 119 | |||
| 120 | case type::string: | ||
| 121 | { | ||
| 122 | new(&second.string_) std::string(std::move(tempString)); | ||
| 123 | |||
| 124 | break; | ||
| 125 | } | ||
| 126 | |||
| 127 | case type::invalid: | ||
| 128 | { | ||
| 129 | break; | ||
| 130 | } | ||
| 131 | } | ||
| 132 | } | ||
| 133 | |||
| 134 | field::~field() | ||
| 135 | { | ||
| 136 | switch (type_) | ||
| 137 | { | ||
| 138 | case type::string: | ||
| 139 | { | ||
| 140 | using string_type = std::string; | ||
| 141 | string_.~string_type(); | ||
| 142 | |||
| 143 | break; | ||
| 144 | } | ||
| 145 | |||
| 146 | case type::integer: | ||
| 147 | case type::invalid: | ||
| 148 | { | ||
| 149 | break; | ||
| 150 | } | ||
| 151 | } | ||
| 152 | } | ||
| 153 | |||
| 154 | field::field( | ||
| 155 | std::string name, | ||
| 156 | int arg) : | ||
| 157 | type_(type::integer), | ||
| 158 | name_(name), | ||
| 159 | integer_(arg) | ||
| 160 | { | ||
| 161 | } | ||
| 162 | |||
| 163 | int field::getInteger() const | ||
| 164 | { | ||
| 165 | if (type_ != type::integer) | ||
| 166 | { | ||
| 167 | throw std::domain_error("field::getInteger called on non-integer field"); | ||
| 168 | } | ||
| 169 | |||
| 170 | return integer_; | ||
| 171 | } | ||
| 172 | |||
| 173 | field::field( | ||
| 174 | std::string name, | ||
| 175 | std::string arg) : | ||
| 176 | type_(type::string), | ||
| 177 | name_(name) | ||
| 178 | { | ||
| 179 | new(&string_) std::string(arg); | ||
| 180 | } | ||
| 181 | |||
| 182 | std::string field::getString() const | ||
| 183 | { | ||
| 184 | if (type_ != type::string) | ||
| 185 | { | ||
| 186 | throw std::domain_error("field::getString called on non-string field"); | ||
| 187 | } | ||
| 188 | |||
| 189 | return string_; | ||
| 190 | } | ||
| 191 | |||
| 192 | }; | ||
| 193 | }; | ||
| diff --git a/generator/field.h b/generator/field.h new file mode 100644 index 0000000..1fbabfc --- /dev/null +++ b/generator/field.h | |||
| @@ -0,0 +1,76 @@ | |||
#ifndef FIELD_H_CAE0B18E
#define FIELD_H_CAE0B18E

#include <string>

namespace verbly {
  namespace generator {

    // A single named column value for a database insert: a tagged union of
    // either an integer or a string, plus the column name.
    class field {
    public:
      enum class type {
        invalid,
        integer,
        string
      };

      // Copy and move constructors

      field(const field& other);
      field(field&& other);

      // Assignment (copy-and-swap: the argument is taken by value)

      field& operator=(field other);

      // Swap

      friend void swap(field& first, field& second);

      // Destructor

      ~field();

      // Generic accessors

      type getType() const
      {
        return type_;
      }

      std::string getName() const
      {
        return name_;
      }

      // Integer

      field(std::string name, int arg);

      int getInteger() const;

      // String

      field(std::string name, std::string arg);

      std::string getString() const;

    private:

      // A default-constructed field is in the invalid state; only move
      // construction and the swap machinery should observe it.
      field()
      {
      }

      // type_ tracks which union member is active; the string member's
      // lifetime is managed manually (placement new) in field.cpp.
      union {
        int integer_;
        std::string string_;
      };

      type type_ = type::invalid;
      std::string name_;
    };

  };
};

#endif /* end of include guard: FIELD_H_CAE0B18E */
| diff --git a/generator/form.cpp b/generator/form.cpp new file mode 100644 index 0000000..6be9d47 --- /dev/null +++ b/generator/form.cpp | |||
| @@ -0,0 +1,53 @@ | |||
| 1 | #include "form.h" | ||
| 2 | #include <algorithm> | ||
| 3 | #include <list> | ||
| 4 | #include "database.h" | ||
| 5 | #include "field.h" | ||
| 6 | #include "pronunciation.h" | ||
| 7 | |||
| 8 | namespace verbly { | ||
| 9 | namespace generator { | ||
| 10 | |||
| 11 | int form::nextId_ = 0; | ||
| 12 | |||
| 13 | form::form(std::string text) : | ||
| 14 | id_(nextId_++), | ||
| 15 | text_(text), | ||
| 16 | complexity_(std::count(std::begin(text), std::end(text), ' ') + 1), | ||
| 17 | proper_(std::any_of(std::begin(text), std::end(text), std::isupper)) | ||
| 18 | { | ||
| 19 | } | ||
| 20 | |||
| 21 | void form::addPronunciation(const pronunciation& p) | ||
| 22 | { | ||
| 23 | pronunciations_.insert(&p); | ||
| 24 | } | ||
| 25 | |||
| 26 | database& operator<<(database& db, const form& arg) | ||
| 27 | { | ||
| 28 | // Serialize the form first. | ||
| 29 | { | ||
| 30 | std::list<field> fields; | ||
| 31 | fields.emplace_back("form_id", arg.getId()); | ||
| 32 | fields.emplace_back("form", arg.getText()); | ||
| 33 | fields.emplace_back("complexity", arg.getComplexity()); | ||
| 34 | fields.emplace_back("proper", arg.isProper()); | ||
| 35 | |||
| 36 | db.insertIntoTable("forms", std::move(fields)); | ||
| 37 | } | ||
| 38 | |||
| 39 | // Then, serialize the form/pronunciation relationship. | ||
| 40 | for (const pronunciation* p : arg.getPronunciations()) | ||
| 41 | { | ||
| 42 | std::list<field> fields; | ||
| 43 | fields.emplace_back("form_id", arg.getId()); | ||
| 44 | fields.emplace_back("pronunciation_id", p->getId()); | ||
| 45 | |||
| 46 | db.insertIntoTable("forms_pronunciations", std::move(fields)); | ||
| 47 | } | ||
| 48 | |||
| 49 | return db; | ||
| 50 | } | ||
| 51 | |||
| 52 | }; | ||
| 53 | }; | ||
| diff --git a/generator/form.h b/generator/form.h new file mode 100644 index 0000000..5576035 --- /dev/null +++ b/generator/form.h | |||
| @@ -0,0 +1,71 @@ | |||
| 1 | #ifndef FORM_H_7EFBC970 | ||
| 2 | #define FORM_H_7EFBC970 | ||
| 3 | |||
| 4 | #include <string> | ||
| 5 | #include <set> | ||
| 6 | |||
| 7 | namespace verbly { | ||
| 8 | namespace generator { | ||
| 9 | |||
| 10 | class pronunciation; | ||
| 11 | class database; | ||
| 12 | |||
| 13 | class form { | ||
| 14 | public: | ||
| 15 | |||
| 16 | // Constructor | ||
| 17 | |||
| 18 | explicit form(std::string text); | ||
| 19 | |||
| 20 | // Mutators | ||
| 21 | |||
| 22 | void addPronunciation(const pronunciation& p); | ||
| 23 | |||
| 24 | // Accessors | ||
| 25 | |||
| 26 | int getId() const | ||
| 27 | { | ||
| 28 | return id_; | ||
| 29 | } | ||
| 30 | |||
| 31 | std::string getText() const | ||
| 32 | { | ||
| 33 | return text_; | ||
| 34 | } | ||
| 35 | |||
| 36 | int getComplexity() const | ||
| 37 | { | ||
| 38 | return complexity_; | ||
| 39 | } | ||
| 40 | |||
| 41 | bool isProper() const | ||
| 42 | { | ||
| 43 | return proper_; | ||
| 44 | } | ||
| 45 | |||
| 46 | std::set<const pronunciation*> getPronunciations() const | ||
| 47 | { | ||
| 48 | return pronunciations_; | ||
| 49 | } | ||
| 50 | |||
| 51 | private: | ||
| 52 | |||
| 53 | static int nextId_; | ||
| 54 | |||
| 55 | const int id_; | ||
| 56 | const std::string text_; | ||
| 57 | const int complexity_; | ||
| 58 | const bool proper_; | ||
| 59 | |||
| 60 | std::set<const pronunciation*> pronunciations_; | ||
| 61 | |||
| 62 | }; | ||
| 63 | |||
| 64 | // Serializer | ||
| 65 | |||
| 66 | database& operator<<(database& db, const form& arg); | ||
| 67 | |||
| 68 | }; | ||
| 69 | }; | ||
| 70 | |||
| 71 | #endif /* end of include guard: FORM_H_7EFBC970 */ | ||
| diff --git a/generator/frame.cpp b/generator/frame.cpp new file mode 100644 index 0000000..9f0653f --- /dev/null +++ b/generator/frame.cpp | |||
| @@ -0,0 +1,83 @@ | |||
| 1 | #include "frame.h" | ||
| 2 | #include "database.h" | ||
| 3 | #include "field.h" | ||
| 4 | |||
| 5 | namespace verbly { | ||
| 6 | namespace generator { | ||
| 7 | |||
| 8 | int frame::nextId_ = 0; | ||
| 9 | |||
| 10 | frame::frame() : id_(nextId_++) | ||
| 11 | { | ||
| 12 | } | ||
| 13 | |||
| 14 | void frame::push_back(part fp) | ||
| 15 | { | ||
| 16 | parts_.push_back(std::move(fp)); | ||
| 17 | } | ||
| 18 | |||
| 19 | database& operator<<(database& db, const frame& arg) | ||
| 20 | { | ||
| 21 | std::list<field> fields; | ||
| 22 | fields.emplace_back("frame_id", arg.getId()); | ||
| 23 | |||
| 24 | nlohmann::json jsonParts; | ||
| 25 | for (const part& p : arg) | ||
| 26 | { | ||
| 27 | nlohmann::json jsonPart; | ||
| 28 | jsonPart["type"] = static_cast<int>(p.getType()); | ||
| 29 | |||
| 30 | switch (p.getType()) | ||
| 31 | { | ||
| 32 | case part::type::noun_phrase: | ||
| 33 | { | ||
| 34 | jsonPart["role"] = p.getNounRole(); | ||
| 35 | jsonPart["selrestrs"] = p.getNounSelrestrs().toJson(); | ||
| 36 | jsonPart["synrestrs"] = p.getNounSynrestrs(); | ||
| 37 | |||
| 38 | break; | ||
| 39 | } | ||
| 40 | |||
| 41 | case part::type::preposition: | ||
| 42 | { | ||
| 43 | jsonPart["choices"] = p.getPrepositionChoices(); | ||
| 44 | jsonPart["literal"] = p.isPrepositionLiteral(); | ||
| 45 | |||
| 46 | break; | ||
| 47 | } | ||
| 48 | |||
| 49 | case part::type::literal: | ||
| 50 | { | ||
| 51 | jsonPart["value"] = p.getLiteralValue(); | ||
| 52 | |||
| 53 | break; | ||
| 54 | } | ||
| 55 | |||
| 56 | case part::type::verb: | ||
| 57 | case part::type::adjective: | ||
| 58 | case part::type::adverb: | ||
| 59 | { | ||
| 60 | break; | ||
| 61 | } | ||
| 62 | |||
| 63 | case part::type::invalid: | ||
| 64 | { | ||
| 65 | // Invalid parts should not be serialized. | ||
| 66 | assert(false); | ||
| 67 | |||
| 68 | break; | ||
| 69 | } | ||
| 70 | } | ||
| 71 | |||
| 72 | jsonParts.emplace_back(std::move(jsonPart)); | ||
| 73 | } | ||
| 74 | |||
| 75 | fields.emplace_back("data", jsonParts.dump()); | ||
| 76 | |||
| 77 | db.insertIntoTable("frames", std::move(fields)); | ||
| 78 | |||
| 79 | return db; | ||
| 80 | } | ||
| 81 | |||
| 82 | }; | ||
| 83 | }; | ||
| diff --git a/generator/frame.h b/generator/frame.h new file mode 100644 index 0000000..411ce6c --- /dev/null +++ b/generator/frame.h | |||
| @@ -0,0 +1,59 @@ | |||
| 1 | #ifndef FRAME_H_26770FF1 | ||
| 2 | #define FRAME_H_26770FF1 | ||
| 3 | |||
| 4 | #include <list> | ||
| 5 | #include "part.h" | ||
| 6 | |||
| 7 | namespace verbly { | ||
| 8 | namespace generator { | ||
| 9 | |||
| 10 | class database; | ||
| 11 | |||
| 12 | class frame { | ||
| 13 | public: | ||
| 14 | |||
| 15 | // Aliases | ||
| 16 | |||
| 17 | using const_iterator = std::list<part>::const_iterator; | ||
| 18 | |||
| 19 | // Constructor | ||
| 20 | |||
| 21 | frame(); | ||
| 22 | |||
| 23 | // Mutators | ||
| 24 | |||
| 25 | void push_back(part fp); | ||
| 26 | |||
| 27 | // Accessors | ||
| 28 | |||
| 29 | int getId() const | ||
| 30 | { | ||
| 31 | return id_; | ||
| 32 | } | ||
| 33 | |||
| 34 | const_iterator begin() const | ||
| 35 | { | ||
| 36 | return std::begin(parts_); | ||
| 37 | } | ||
| 38 | |||
| 39 | const_iterator end() const | ||
| 40 | { | ||
| 41 | return std::end(parts_); | ||
| 42 | } | ||
| 43 | |||
| 44 | private: | ||
| 45 | |||
| 46 | static int nextId_; | ||
| 47 | |||
| 48 | const int id_; | ||
| 49 | |||
| 50 | std::list<part> parts_; | ||
| 51 | |||
| 52 | }; | ||
| 53 | |||
| 54 | database& operator<<(database& db, const frame& arg); | ||
| 55 | |||
| 56 | }; | ||
| 57 | }; | ||
| 58 | |||
| 59 | #endif /* end of include guard: FRAME_H_26770FF1 */ | ||
| diff --git a/generator/generator.cpp b/generator/generator.cpp index 6a16467..d88cb31 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
| @@ -1,2320 +1,1477 @@ | |||
| 1 | #include <libxml/parser.h> | 1 | #include "generator.h" |
| 2 | #include <cassert> | ||
| 3 | #include <stdexcept> | ||
| 2 | #include <iostream> | 4 | #include <iostream> |
| 5 | #include <regex> | ||
| 3 | #include <dirent.h> | 6 | #include <dirent.h> |
| 4 | #include <set> | ||
| 5 | #include <map> | ||
| 6 | #include <string> | ||
| 7 | #include <vector> | ||
| 8 | #include <fstream> | 7 | #include <fstream> |
| 9 | #include <sqlite3.h> | 8 | #include "enums.h" |
| 10 | #include <sstream> | ||
| 11 | #include <regex> | ||
| 12 | #include <list> | ||
| 13 | #include <algorithm> | ||
| 14 | #include <json.hpp> | ||
| 15 | #include "progress.h" | 9 | #include "progress.h" |
| 10 | #include "selrestr.h" | ||
| 11 | #include "role.h" | ||
| 12 | #include "part.h" | ||
| 13 | #include "field.h" | ||
| 16 | #include "../lib/util.h" | 14 | #include "../lib/util.h" |
| 17 | 15 | ||
| 18 | using json = nlohmann::json; | 16 | namespace verbly { |
| 19 | 17 | namespace generator { | |
| 20 | struct verb_t { | ||
| 21 | std::string infinitive; | ||
| 22 | std::string past_tense; | ||
| 23 | std::string past_participle; | ||
| 24 | std::string ing_form; | ||
| 25 | std::string s_form; | ||
| 26 | int id; | ||
| 27 | }; | ||
| 28 | |||
| 29 | struct adjective_t { | ||
| 30 | std::string base; | ||
| 31 | std::string comparative; | ||
| 32 | std::string superlative; | ||
| 33 | }; | ||
| 34 | |||
| 35 | struct noun_t { | ||
| 36 | std::string singular; | ||
| 37 | std::string plural; | ||
| 38 | }; | ||
| 39 | |||
| 40 | struct selrestr_t { | ||
| 41 | enum class type_t { | ||
| 42 | singleton, | ||
| 43 | andlogic, | ||
| 44 | orlogic, | ||
| 45 | empty | ||
| 46 | }; | ||
| 47 | type_t type; | ||
| 48 | std::string restriction; | ||
| 49 | bool pos; | ||
| 50 | std::list<selrestr_t> subordinates; | ||
| 51 | }; | ||
| 52 | |||
| 53 | struct framepart_t { | ||
| 54 | enum class type_t { | ||
| 55 | np, | ||
| 56 | v, | ||
| 57 | pp, | ||
| 58 | adj, | ||
| 59 | adv, | ||
| 60 | lex | ||
| 61 | }; | ||
| 62 | type_t type; | ||
| 63 | std::string role; | ||
| 64 | selrestr_t selrestrs; | ||
| 65 | std::set<std::string> preprestrs; | ||
| 66 | std::set<std::string> synrestrs; | ||
| 67 | std::list<std::string> choices; | ||
| 68 | std::string lexval; | ||
| 69 | }; | ||
| 70 | |||
| 71 | struct group_t { | ||
| 72 | std::string id; | ||
| 73 | std::string parent; | ||
| 74 | std::set<std::string> members; | ||
| 75 | std::map<std::string, selrestr_t> roles; | ||
| 76 | std::list<std::list<framepart_t>> frames; | ||
| 77 | }; | ||
| 78 | |||
| 79 | struct pronunciation_t { | ||
| 80 | std::string phonemes; | ||
| 81 | std::string prerhyme; | ||
| 82 | std::string rhyme; | ||
| 83 | int syllables = 0; | ||
| 84 | std::string stress; | ||
| 85 | |||
| 86 | bool operator<(const pronunciation_t& other) const | ||
| 87 | { | ||
| 88 | return phonemes < other.phonemes; | ||
| 89 | } | ||
| 90 | }; | ||
| 91 | |||
| 92 | std::map<std::string, group_t> groups; | ||
| 93 | std::map<std::string, verb_t> verbs; | ||
| 94 | std::map<std::string, adjective_t> adjectives; | ||
| 95 | std::map<std::string, noun_t> nouns; | ||
| 96 | std::map<int, std::map<int, int>> wn; | ||
| 97 | std::map<int, int> images; | ||
| 98 | std::map<std::string, std::set<pronunciation_t>> pronunciations; | ||
| 99 | |||
| 100 | void print_usage() | ||
| 101 | { | ||
| 102 | std::cout << "Verbly Datafile Generator" << std::endl; | ||
| 103 | std::cout << "-------------------------" << std::endl; | ||
| 104 | std::cout << "Requires exactly six arguments." << std::endl; | ||
| 105 | std::cout << "1. The path to a VerbNet data directory." << std::endl; | ||
| 106 | std::cout << "2. The path to an AGID infl.txt file." << std::endl; | ||
| 107 | std::cout << "3. The path to a WordNet prolog data directory." << std::endl; | ||
| 108 | std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl; | ||
| 109 | std::cout << "5. The path to an ImageNet urls.txt file." << std::endl; | ||
| 110 | std::cout << "6. Datafile output path." << std::endl; | ||
| 111 | |||
| 112 | exit(1); | ||
| 113 | } | ||
| 114 | |||
| 115 | void db_error(sqlite3* ppdb, std::string query) | ||
| 116 | { | ||
| 117 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
| 118 | std::cout << query << std::endl; | ||
| 119 | sqlite3_close_v2(ppdb); | ||
| 120 | print_usage(); | ||
| 121 | } | ||
| 122 | |||
| 123 | json export_selrestrs(selrestr_t r) | ||
| 124 | { | ||
| 125 | if (r.type == selrestr_t::type_t::empty) | ||
| 126 | { | ||
| 127 | return {}; | ||
| 128 | } else if (r.type == selrestr_t::type_t::singleton) | ||
| 129 | { | ||
| 130 | json result; | ||
| 131 | result["type"] = r.restriction; | ||
| 132 | result["pos"] = r.pos; | ||
| 133 | return result; | ||
| 134 | } else { | ||
| 135 | json result; | ||
| 136 | if (r.type == selrestr_t::type_t::andlogic) | ||
| 137 | { | ||
| 138 | result["logic"] = "and"; | ||
| 139 | } else { | ||
| 140 | result["logic"] = "or"; | ||
| 141 | } | ||
| 142 | |||
| 143 | std::list<json> outlist; | ||
| 144 | std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs); | ||
| 145 | result["children"] = outlist; | ||
| 146 | 18 | ||
| 147 | return result; | 19 | generator::generator( |
| 148 | } | 20 | std::string verbNetPath, |
| 149 | } | 21 | std::string agidPath, |
| 150 | 22 | std::string wordNetPath, | |
| 151 | selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) | 23 | std::string cmudictPath, |
| 152 | { | 24 | std::string imageNetPath, |
| 153 | selrestr_t r; | 25 | std::string outputPath) : |
| 154 | xmlChar* key; | 26 | verbNetPath_(verbNetPath), |
| 155 | 27 | agidPath_(agidPath), | |
| 156 | if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) | 28 | wordNetPath_(wordNetPath), |
| 157 | { | 29 | cmudictPath_(cmudictPath), |
| 158 | if (xmlChildElementCount(top) == 0) | 30 | imageNetPath_(imageNetPath), |
| 31 | db_(outputPath) | ||
| 159 | { | 32 | { |
| 160 | r.type = selrestr_t::type_t::empty; | 33 | // Ensure VerbNet directory exists |
| 161 | } else if (xmlChildElementCount(top) == 1) | 34 | DIR* dir; |
| 162 | { | 35 | if ((dir = opendir(verbNetPath_.c_str())) == nullptr) |
| 163 | r = parse_selrestrs(xmlFirstElementChild(top), filename); | ||
| 164 | } else { | ||
| 165 | r.type = selrestr_t::type_t::andlogic; | ||
| 166 | |||
| 167 | if (xmlHasProp(top, (const xmlChar*) "logic")) | ||
| 168 | { | 36 | { |
| 169 | key = xmlGetProp(top, (const xmlChar*) "logic"); | 37 | throw std::invalid_argument("Invalid VerbNet data directory"); |
| 170 | if (!xmlStrcmp(key, (const xmlChar*) "or")) | ||
| 171 | { | ||
| 172 | r.type = selrestr_t::type_t::orlogic; | ||
| 173 | } | ||
| 174 | xmlFree(key); | ||
| 175 | } | 38 | } |
| 176 | 39 | ||
| 177 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | 40 | closedir(dir); |
| 41 | |||
| 42 | // Ensure AGID infl.txt exists | ||
| 43 | if (!std::ifstream(agidPath_)) | ||
| 178 | { | 44 | { |
| 179 | if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) | 45 | throw std::invalid_argument("AGID infl.txt file not found"); |
| 180 | { | ||
| 181 | r.subordinates.push_back(parse_selrestrs(selrestr, filename)); | ||
| 182 | } | ||
| 183 | } | 46 | } |
| 184 | } | 47 | |
| 185 | } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) | 48 | // Add directory separator to WordNet path |
| 186 | { | 49 | if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) |
| 187 | r.type = selrestr_t::type_t::singleton; | ||
| 188 | |||
| 189 | key = xmlGetProp(top, (xmlChar*) "Value"); | ||
| 190 | r.pos = (std::string((const char*)key) == "+"); | ||
| 191 | xmlFree(key); | ||
| 192 | |||
| 193 | key = xmlGetProp(top, (xmlChar*) "type"); | ||
| 194 | r.restriction = (const char*) key; | ||
| 195 | xmlFree(key); | ||
| 196 | } else { | ||
| 197 | // Invalid | ||
| 198 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
| 199 | print_usage(); | ||
| 200 | } | ||
| 201 | |||
| 202 | return r; | ||
| 203 | } | ||
| 204 | |||
| 205 | group_t& parse_group(xmlNodePtr top, std::string filename) | ||
| 206 | { | ||
| 207 | xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); | ||
| 208 | if (key == 0) | ||
| 209 | { | ||
| 210 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
| 211 | print_usage(); | ||
| 212 | } | ||
| 213 | std::string vnid = (const char*)key; | ||
| 214 | vnid = vnid.substr(vnid.find_first_of("-")+1); | ||
| 215 | xmlFree(key); | ||
| 216 | |||
| 217 | group_t g; | ||
| 218 | g.id = vnid; | ||
| 219 | |||
| 220 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
| 221 | { | ||
| 222 | if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES")) | ||
| 223 | { | ||
| 224 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) | ||
| 225 | { | 50 | { |
| 226 | if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) | 51 | wordNetPath_ += '/'; |
| 227 | { | ||
| 228 | auto& sg = parse_group(subclass, filename); | ||
| 229 | sg.parent = vnid; | ||
| 230 | |||
| 231 | for (auto member : sg.members) | ||
| 232 | { | ||
| 233 | g.members.insert(member); | ||
| 234 | } | ||
| 235 | |||
| 236 | // The schema requires that subclasses appear after role definitions, so we can do this now | ||
| 237 | for (auto role : g.roles) | ||
| 238 | { | ||
| 239 | if (sg.roles.count(role.first) == 0) | ||
| 240 | { | ||
| 241 | sg.roles[role.first] = role.second; | ||
| 242 | } | ||
| 243 | } | ||
| 244 | } | ||
| 245 | } | 52 | } |
| 246 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) | 53 | |
| 247 | { | 54 | // Ensure WordNet tables exist |
| 248 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) | 55 | for (std::string table : { |
| 56 | "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" | ||
| 57 | }) | ||
| 249 | { | 58 | { |
| 250 | if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) | 59 | if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) |
| 251 | { | 60 | { |
| 252 | key = xmlGetProp(member, (xmlChar*) "name"); | 61 | throw std::invalid_argument("WordNet " + table + " table not found"); |
| 253 | g.members.insert((const char*)key); | ||
| 254 | xmlFree(key); | ||
| 255 | } | 62 | } |
| 256 | } | 63 | } |
| 257 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) | 64 | |
| 258 | { | 65 | // Ensure CMUDICT file exists |
| 259 | for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) | 66 | if (!std::ifstream(cmudictPath_)) |
| 260 | { | 67 | { |
| 261 | if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) | 68 | throw std::invalid_argument("CMUDICT file not found"); |
| 262 | { | ||
| 263 | selrestr_t r; | ||
| 264 | r.type = selrestr_t::type_t::empty; | ||
| 265 | |||
| 266 | key = xmlGetProp(role, (const xmlChar*) "type"); | ||
| 267 | std::string type = (const char*)key; | ||
| 268 | xmlFree(key); | ||
| 269 | |||
| 270 | for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
| 271 | { | ||
| 272 | if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS")) | ||
| 273 | { | ||
| 274 | r = parse_selrestrs(rolenode, filename); | ||
| 275 | } | ||
| 276 | } | ||
| 277 | |||
| 278 | g.roles[type] = r; | ||
| 279 | } | ||
| 280 | } | 69 | } |
| 281 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) | 70 | |
| 282 | { | 71 | // Ensure ImageNet urls.txt exists |
| 283 | for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) | 72 | if (!std::ifstream(imageNetPath_)) |
| 284 | { | 73 | { |
| 285 | if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) | 74 | throw std::invalid_argument("ImageNet urls.txt file not found"); |
| 286 | { | ||
| 287 | std::list<framepart_t> f; | ||
| 288 | |||
| 289 | for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) | ||
| 290 | { | ||
| 291 | if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX")) | ||
| 292 | { | ||
| 293 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
| 294 | { | ||
| 295 | framepart_t fp; | ||
| 296 | |||
| 297 | if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP")) | ||
| 298 | { | ||
| 299 | fp.type = framepart_t::type_t::np; | ||
| 300 | |||
| 301 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
| 302 | fp.role = (const char*)key; | ||
| 303 | xmlFree(key); | ||
| 304 | |||
| 305 | fp.selrestrs.type = selrestr_t::type_t::empty; | ||
| 306 | |||
| 307 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
| 308 | { | ||
| 309 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS")) | ||
| 310 | { | ||
| 311 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
| 312 | { | ||
| 313 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR")) | ||
| 314 | { | ||
| 315 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
| 316 | fp.synrestrs.insert(std::string((const char*)key)); | ||
| 317 | xmlFree(key); | ||
| 318 | } | ||
| 319 | } | ||
| 320 | } | ||
| 321 | |||
| 322 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
| 323 | { | ||
| 324 | fp.selrestrs = parse_selrestrs(npnode, filename); | ||
| 325 | } | ||
| 326 | } | ||
| 327 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB")) | ||
| 328 | { | ||
| 329 | fp.type = framepart_t::type_t::v; | ||
| 330 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP")) | ||
| 331 | { | ||
| 332 | fp.type = framepart_t::type_t::pp; | ||
| 333 | |||
| 334 | if (xmlHasProp(syntaxnode, (xmlChar*) "value")) | ||
| 335 | { | ||
| 336 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
| 337 | std::string choices = (const char*)key; | ||
| 338 | xmlFree(key); | ||
| 339 | |||
| 340 | fp.choices = verbly::split<std::list<std::string>>(choices, " "); | ||
| 341 | } | ||
| 342 | |||
| 343 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
| 344 | { | ||
| 345 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
| 346 | { | ||
| 347 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
| 348 | { | ||
| 349 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR")) | ||
| 350 | { | ||
| 351 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
| 352 | fp.preprestrs.insert(std::string((const char*)key)); | ||
| 353 | xmlFree(key); | ||
| 354 | } | ||
| 355 | } | ||
| 356 | } | ||
| 357 | } | ||
| 358 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ")) | ||
| 359 | { | ||
| 360 | fp.type = framepart_t::type_t::adj; | ||
| 361 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV")) | ||
| 362 | { | ||
| 363 | fp.type = framepart_t::type_t::adv; | ||
| 364 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX")) | ||
| 365 | { | ||
| 366 | fp.type = framepart_t::type_t::lex; | ||
| 367 | |||
| 368 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
| 369 | fp.lexval = (const char*)key; | ||
| 370 | xmlFree(key); | ||
| 371 | } else { | ||
| 372 | continue; | ||
| 373 | } | ||
| 374 | |||
| 375 | f.push_back(fp); | ||
| 376 | } | ||
| 377 | |||
| 378 | g.frames.push_back(f); | ||
| 379 | } | ||
| 380 | } | ||
| 381 | } | ||
| 382 | } | 75 | } |
| 383 | } | 76 | } |
| 384 | } | ||
| 385 | |||
| 386 | groups[vnid] = g; | ||
| 387 | |||
| 388 | return groups[vnid]; | ||
| 389 | } | ||
| 390 | |||
| 391 | int main(int argc, char** argv) | ||
| 392 | { | ||
| 393 | if (argc != 7) | ||
| 394 | { | ||
| 395 | print_usage(); | ||
| 396 | } | ||
| 397 | |||
| 398 | // VerbNet data | ||
| 399 | std::cout << "Reading verb frames..." << std::endl; | ||
| 400 | |||
| 401 | DIR* dir; | ||
| 402 | if ((dir = opendir(argv[1])) == nullptr) | ||
| 403 | { | ||
| 404 | std::cout << "Invalid VerbNet data directory." << std::endl; | ||
| 405 | |||
| 406 | print_usage(); | ||
| 407 | } | ||
| 408 | |||
| 409 | struct dirent* ent; | ||
| 410 | while ((ent = readdir(dir)) != nullptr) | ||
| 411 | { | ||
| 412 | std::string filename(argv[1]); | ||
| 413 | if (filename.back() != '/') | ||
| 414 | { | ||
| 415 | filename += '/'; | ||
| 416 | } | ||
| 417 | 77 | ||
| 418 | filename += ent->d_name; | 78 | void generator::run() |
| 419 | //std::cout << ent->d_name << std::endl; | ||
| 420 | |||
| 421 | if (filename.rfind(".xml") != filename.size() - 4) | ||
| 422 | { | ||
| 423 | continue; | ||
| 424 | } | ||
| 425 | |||
| 426 | xmlDocPtr doc = xmlParseFile(filename.c_str()); | ||
| 427 | if (doc == nullptr) | ||
| 428 | { | ||
| 429 | std::cout << "Error opening " << filename << std::endl; | ||
| 430 | print_usage(); | ||
| 431 | } | ||
| 432 | |||
| 433 | xmlNodePtr top = xmlDocGetRootElement(doc); | ||
| 434 | if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) | ||
| 435 | { | ||
| 436 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
| 437 | print_usage(); | ||
| 438 | } | ||
| 439 | |||
| 440 | parse_group(top, filename); | ||
| 441 | } | ||
| 442 | |||
| 443 | closedir(dir); | ||
| 444 | |||
| 445 | // Get verbs from AGID | ||
| 446 | std::cout << "Reading inflections..." << std::endl; | ||
| 447 | |||
| 448 | std::ifstream agidfile(argv[2]); | ||
| 449 | if (!agidfile.is_open()) | ||
| 450 | { | ||
| 451 | std::cout << "Could not open AGID file: " << argv[2] << std::endl; | ||
| 452 | print_usage(); | ||
| 453 | } | ||
| 454 | |||
| 455 | for (;;) | ||
| 456 | { | ||
| 457 | std::string line; | ||
| 458 | if (!getline(agidfile, line)) | ||
| 459 | { | ||
| 460 | break; | ||
| 461 | } | ||
| 462 | |||
| 463 | if (line.back() == '\r') | ||
| 464 | { | 79 | { |
| 465 | line.pop_back(); | 80 | // Create notions, words, lemmas, and forms from WordNet synsets |
| 466 | } | 81 | readWordNetSynsets(); |
| 467 | 82 | ||
| 468 | int divider = line.find_first_of(" "); | 83 | // Reads adjective positioning WordNet data |
| 469 | std::string word = line.substr(0, divider); | 84 | readAdjectivePositioning(); |
| 470 | line = line.substr(divider+1); | 85 | |
| 471 | char type = line[0]; | 86 | // Counts the number of URLs ImageNet has per notion |
| 472 | 87 | readImageNetUrls(); | |
| 473 | if (line[1] == '?') | 88 | |
| 474 | { | 89 | // Creates a word by WordNet sense key lookup table |
| 475 | line.erase(0, 4); | 90 | readWordNetSenseKeys(); |
| 476 | } else { | 91 | |
| 477 | line.erase(0, 3); | 92 | // Creates groups and frames from VerbNet data |
| 478 | } | 93 | readVerbNet(); |
| 479 | 94 | ||
| 480 | std::vector<std::string> forms; | 95 | // Creates forms and inflections from AGID. To reduce the amount of forms |
| 481 | while (!line.empty()) | 96 | // created, we do this after most lemmas that need inflecting have been |
| 482 | { | 97 | // created through other means, and then only generate forms for |
| 483 | std::string inflection; | 98 | // inflections of already-existing lemmas. The exception to this regards |
| 484 | if ((divider = line.find(" | ")) != std::string::npos) | 99 | // verb lemmas. If a verb lemma in AGID either does not exist yet, or does |
| 485 | { | 100 | // exist but is not related to any words that are related to verb notions, |
| 486 | inflection = line.substr(0, divider); | 101 | // then a notion and a word is generated and the form generation proceeds |
| 487 | line = line.substr(divider + 3); | 102 | // as usual. |
| 488 | } else { | 103 | readAgidInflections(); |
| 489 | inflection = line; | 104 | |
| 490 | line = ""; | 105 | // Reads in prepositions and the is_a relationship |
| 491 | } | 106 | readPrepositions(); |
| 492 | 107 | ||
| 493 | if ((divider = inflection.find_first_of(",?")) != std::string::npos) | 108 | // Creates pronunciations from CMUDICT. To reduce the amount of |
| 494 | { | 109 | // pronunciations created, we do this after all forms have been created, |
| 495 | inflection = inflection.substr(0, divider); | 110 | // and then only generate pronunciations for already-exisiting forms. |
| 496 | } | 111 | readCmudictPronunciations(); |
| 497 | 112 | ||
| 498 | forms.push_back(inflection); | 113 | // Writes the database schema |
| 114 | writeSchema(); | ||
| 115 | |||
| 116 | // Dumps data to the database | ||
| 117 | dumpObjects(); | ||
| 118 | |||
| 119 | // Populates the antonymy relationship from WordNet | ||
| 120 | readWordNetAntonymy(); | ||
| 121 | |||
| 122 | // Populates the variation relationship from WordNet | ||
| 123 | readWordNetVariation(); | ||
| 124 | |||
| 125 | // Populates the usage, topicality, and regionality relationships from | ||
| 126 | // WordNet | ||
| 127 | readWordNetClasses(); | ||
| 128 | |||
| 129 | // Populates the causality relationship from WordNet | ||
| 130 | readWordNetCausality(); | ||
| 131 | |||
| 132 | // Populates the entailment relationship from WordNet | ||
| 133 | readWordNetEntailment(); | ||
| 134 | |||
| 135 | // Populates the hypernymy relationship from WordNet | ||
| 136 | readWordNetHypernymy(); | ||
| 137 | |||
| 138 | // Populates the instantiation relationship from WordNet | ||
| 139 | readWordNetInstantiation(); | ||
| 140 | |||
| 141 | // Populates the member meronymy relationship from WordNet | ||
| 142 | readWordNetMemberMeronymy(); | ||
| 143 | |||
| 144 | // Populates the part meronymy relationship from WordNet | ||
| 145 | readWordNetPartMeronymy(); | ||
| 146 | |||
| 147 | // Populates the substance meronymy relationship from WordNet | ||
| 148 | readWordNetSubstanceMeronymy(); | ||
| 149 | |||
| 150 | // Populates the pertainymy and mannernymy relationships from WordNet | ||
| 151 | readWordNetPertainymy(); | ||
| 152 | |||
| 153 | // Populates the specification relationship from WordNet | ||
| 154 | readWordNetSpecification(); | ||
| 155 | |||
| 156 | // Populates the adjective similarity relationship from WordNet | ||
| 157 | readWordNetSimilarity(); | ||
| 158 | |||
| 159 | |||
| 160 | |||
| 161 | |||
| 162 | |||
| 163 | |||
| 164 | |||
| 165 | |||
| 499 | } | 166 | } |
| 500 | 167 | ||
| 501 | switch (type) | 168 | void generator::readWordNetSynsets() |
| 502 | { | 169 | { |
| 503 | case 'V': | 170 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); |
| 171 | progress ppgs("Reading synsets from WordNet...", lines.size()); | ||
| 172 | |||
| 173 | for (std::string line : lines) | ||
| 504 | { | 174 | { |
| 505 | verb_t v; | 175 | ppgs.update(); |
| 506 | v.infinitive = word; | 176 | |
| 507 | if (forms.size() == 4) | 177 | std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); |
| 508 | { | 178 | std::smatch relation_data; |
| 509 | v.past_tense = forms[0]; | 179 | if (!std::regex_search(line, relation_data, relation)) |
| 510 | v.past_participle = forms[1]; | 180 | { |
| 511 | v.ing_form = forms[2]; | 181 | continue; |
| 512 | v.s_form = forms[3]; | ||
| 513 | } else if (forms.size() == 3) | ||
| 514 | { | ||
| 515 | v.past_tense = forms[0]; | ||
| 516 | v.past_participle = forms[0]; | ||
| 517 | v.ing_form = forms[1]; | ||
| 518 | v.s_form = forms[2]; | ||
| 519 | } else if (forms.size() == 8) | ||
| 520 | { | ||
| 521 | // As of AGID 2014.08.11, this is only "to be" | ||
| 522 | v.past_tense = forms[0]; | ||
| 523 | v.past_participle = forms[2]; | ||
| 524 | v.ing_form = forms[3]; | ||
| 525 | v.s_form = forms[4]; | ||
| 526 | } else { | ||
| 527 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
| 528 | // - may and shall do not conjugate the way we want them to | ||
| 529 | // - methinks only has a past tense and is an outlier | ||
| 530 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
| 531 | std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
| 532 | } | 182 | } |
| 533 | 183 | ||
| 534 | verbs[word] = v; | 184 | int synset_id = std::stoi(relation_data[1]); |
| 535 | 185 | int wnum = std::stoi(relation_data[2]); | |
| 536 | break; | 186 | std::string text = relation_data[3]; |
| 537 | } | 187 | int tag_count = std::stoi(relation_data[4]); |
| 538 | 188 | size_t word_it; | |
| 539 | case 'A': | 189 | while ((word_it = text.find("''")) != std::string::npos) |
| 540 | { | ||
| 541 | adjective_t adj; | ||
| 542 | adj.base = word; | ||
| 543 | if (forms.size() == 2) | ||
| 544 | { | 190 | { |
| 545 | adj.comparative = forms[0]; | 191 | text.erase(word_it, 1); |
| 546 | adj.superlative = forms[1]; | ||
| 547 | } else { | ||
| 548 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | ||
| 549 | std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
| 550 | } | 192 | } |
| 551 | 193 | ||
| 552 | adjectives[word] = adj; | 194 | // The WordNet data does contain duplicates, so we need to check that we |
| 553 | 195 | // haven't already created this word. | |
| 554 | break; | 196 | std::pair<int, int> lookup(synset_id, wnum); |
| 555 | } | 197 | if (!wordByWnidAndWnum_.count(lookup)) |
| 556 | |||
| 557 | case 'N': | ||
| 558 | { | ||
| 559 | noun_t n; | ||
| 560 | n.singular = word; | ||
| 561 | if (forms.size() == 1) | ||
| 562 | { | 198 | { |
| 563 | n.plural = forms[0]; | 199 | notion& synset = lookupOrCreateNotion(synset_id); |
| 564 | } else { | 200 | lemma& lex = lookupOrCreateLemma(text); |
| 565 | // As of AGID 2014.08.11, this is non-existent. | 201 | word& entry = createWord(synset, lex, tag_count); |
| 566 | std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; | 202 | |
| 203 | wordByWnidAndWnum_[lookup] = &entry; | ||
| 567 | } | 204 | } |
| 568 | |||
| 569 | nouns[word] = n; | ||
| 570 | |||
| 571 | break; | ||
| 572 | } | 205 | } |
| 573 | } | 206 | } |
| 574 | } | ||
| 575 | |||
| 576 | // Pronounciations | ||
| 577 | std::cout << "Reading pronunciations..." << std::endl; | ||
| 578 | |||
| 579 | std::ifstream pronfile(argv[4]); | ||
| 580 | if (!pronfile.is_open()) | ||
| 581 | { | ||
| 582 | std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl; | ||
| 583 | print_usage(); | ||
| 584 | } | ||
| 585 | |||
| 586 | for (;;) | ||
| 587 | { | ||
| 588 | std::string line; | ||
| 589 | if (!getline(pronfile, line)) | ||
| 590 | { | ||
| 591 | break; | ||
| 592 | } | ||
| 593 | |||
| 594 | if (line.back() == '\r') | ||
| 595 | { | ||
| 596 | line.pop_back(); | ||
| 597 | } | ||
| 598 | 207 | ||
| 599 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | 208 | void generator::readAdjectivePositioning() |
| 600 | std::smatch phoneme_data; | ||
| 601 | if (std::regex_search(line, phoneme_data, phoneme)) | ||
| 602 | { | 209 | { |
| 603 | std::string canonical(phoneme_data[1]); | 210 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl")); |
| 604 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | 211 | progress ppgs("Reading adjective positionings from WordNet...", lines.size()); |
| 605 | |||
| 606 | std::string phonemes = phoneme_data[2]; | ||
| 607 | auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " "); | ||
| 608 | auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) { | ||
| 609 | return phoneme.find("1") != std::string::npos; | ||
| 610 | }); | ||
| 611 | 212 | ||
| 612 | pronunciation_t p; | 213 | for (std::string line : lines) |
| 613 | p.phonemes = phonemes; | ||
| 614 | |||
| 615 | // Rhyme detection | ||
| 616 | if (phemstrt != std::end(phoneme_set)) | ||
| 617 | { | 214 | { |
| 618 | std::stringstream rhymer; | 215 | ppgs.update(); |
| 619 | for (auto it = phemstrt; it != std::end(phoneme_set); it++) | ||
| 620 | { | ||
| 621 | std::string naked; | ||
| 622 | std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) { | ||
| 623 | return isdigit(ch); | ||
| 624 | }); | ||
| 625 | |||
| 626 | if (it != phemstrt) | ||
| 627 | { | ||
| 628 | rhymer << " "; | ||
| 629 | } | ||
| 630 | |||
| 631 | rhymer << naked; | ||
| 632 | } | ||
| 633 | 216 | ||
| 634 | p.rhyme = rhymer.str(); | 217 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); |
| 635 | 218 | std::smatch relation_data; | |
| 636 | if (phemstrt != std::begin(phoneme_set)) | 219 | if (!std::regex_search(line, relation_data, relation)) |
| 637 | { | 220 | { |
| 638 | phemstrt--; | 221 | continue; |
| 639 | p.prerhyme = *phemstrt; | ||
| 640 | } else { | ||
| 641 | p.prerhyme = ""; | ||
| 642 | } | 222 | } |
| 643 | } else { | ||
| 644 | p.prerhyme = ""; | ||
| 645 | p.rhyme = ""; | ||
| 646 | } | ||
| 647 | 223 | ||
| 648 | // Syllable/stress | 224 | int synset_id = stoi(relation_data[1]); |
| 649 | for (auto phm : phoneme_set) | 225 | int wnum = stoi(relation_data[2]); |
| 650 | { | 226 | std::string adjpos_str = relation_data[3]; |
| 651 | if (isdigit(phm.back())) | ||
| 652 | { | ||
| 653 | // It's a vowel! | ||
| 654 | p.syllables++; | ||
| 655 | 227 | ||
| 656 | if (phm.back() == '1') | 228 | std::pair<int, int> lookup(synset_id, wnum); |
| 229 | if (wordByWnidAndWnum_.count(lookup)) | ||
| 230 | { | ||
| 231 | word& adj = *wordByWnidAndWnum_.at(lookup); | ||
| 232 | |||
| 233 | if (adjpos_str == "p") | ||
| 234 | { | ||
| 235 | adj.setAdjectivePosition(positioning::predicate); | ||
| 236 | } else if (adjpos_str == "a") | ||
| 237 | { | ||
| 238 | adj.setAdjectivePosition(positioning::attributive); | ||
| 239 | } else if (adjpos_str == "i") | ||
| 657 | { | 240 | { |
| 658 | p.stress.push_back('1'); | 241 | adj.setAdjectivePosition(positioning::postnominal); |
| 659 | } else { | 242 | } else { |
| 660 | p.stress.push_back('0'); | 243 | // Can't happen because of how we specified the regex. |
| 244 | assert(false); | ||
| 661 | } | 245 | } |
| 662 | } | 246 | } |
| 663 | } | 247 | } |
| 664 | |||
| 665 | pronunciations[canonical].insert(p); | ||
| 666 | } | ||
| 667 | } | ||
| 668 | |||
| 669 | // Images | ||
| 670 | std::cout << "Reading images..." << std::endl; | ||
| 671 | |||
| 672 | std::ifstream imagefile(argv[5]); | ||
| 673 | if (!imagefile.is_open()) | ||
| 674 | { | ||
| 675 | std::cout << "Could not open ImageNet file: " << argv[5] << std::endl; | ||
| 676 | print_usage(); | ||
| 677 | } | ||
| 678 | |||
| 679 | for (;;) | ||
| 680 | { | ||
| 681 | std::string line; | ||
| 682 | if (!getline(imagefile, line)) | ||
| 683 | { | ||
| 684 | break; | ||
| 685 | } | ||
| 686 | |||
| 687 | if (line.back() == '\r') | ||
| 688 | { | ||
| 689 | line.pop_back(); | ||
| 690 | } | ||
| 691 | |||
| 692 | std::string wnid_s = line.substr(1, 8); | ||
| 693 | int wnid = stoi(wnid_s) + 100000000; | ||
| 694 | images[wnid]++; | ||
| 695 | } | ||
| 696 | |||
| 697 | imagefile.close(); | ||
| 698 | |||
| 699 | // Start writing output | ||
| 700 | std::cout << "Writing schema..." << std::endl; | ||
| 701 | |||
| 702 | sqlite3* ppdb; | ||
| 703 | if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | ||
| 704 | { | ||
| 705 | std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; | ||
| 706 | print_usage(); | ||
| 707 | } | ||
| 708 | |||
| 709 | std::ifstream schemafile("schema.sql"); | ||
| 710 | if (!schemafile.is_open()) | ||
| 711 | { | ||
| 712 | std::cout << "Could not find schema file" << std::endl; | ||
| 713 | print_usage(); | ||
| 714 | } | ||
| 715 | |||
| 716 | std::stringstream schemabuilder; | ||
| 717 | for (;;) | ||
| 718 | { | ||
| 719 | std::string line; | ||
| 720 | if (!getline(schemafile, line)) | ||
| 721 | { | ||
| 722 | break; | ||
| 723 | } | ||
| 724 | |||
| 725 | if (line.back() == '\r') | ||
| 726 | { | ||
| 727 | line.pop_back(); | ||
| 728 | } | ||
| 729 | |||
| 730 | schemabuilder << line << std::endl; | ||
| 731 | } | ||
| 732 | |||
| 733 | std::string schema = schemabuilder.str(); | ||
| 734 | while (!schema.empty()) | ||
| 735 | { | ||
| 736 | std::string query; | ||
| 737 | int divider = schema.find(";"); | ||
| 738 | if (divider != std::string::npos) | ||
| 739 | { | ||
| 740 | query = schema.substr(0, divider+1); | ||
| 741 | schema = schema.substr(divider+2); | ||
| 742 | } else { | ||
| 743 | break; | ||
| 744 | } | 248 | } |
| 745 | 249 | ||
| 746 | sqlite3_stmt* schmstmt; | 250 | void generator::readImageNetUrls() |
| 747 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) | ||
| 748 | { | 251 | { |
| 749 | db_error(ppdb, query); | 252 | // The ImageNet datafile is so large that it is unreasonable and |
| 750 | } | 253 | // unnecessary to read it into memory; instead, we will parse each line as |
| 751 | 254 | // we read it. This has the caveat that we cannot display a progress bar. | |
| 752 | if (sqlite3_step(schmstmt) != SQLITE_DONE) | 255 | std::cout << "Reading image counts from ImageNet..." << std::endl; |
| 753 | { | ||
| 754 | db_error(ppdb, query); | ||
| 755 | } | ||
| 756 | |||
| 757 | sqlite3_finalize(schmstmt); | ||
| 758 | } | ||
| 759 | |||
| 760 | std::cout << "Writing prepositions..." << std::endl; | ||
| 761 | std::ifstream prepfile("prepositions.txt"); | ||
| 762 | if (!prepfile.is_open()) | ||
| 763 | { | ||
| 764 | std::cout << "Could not find prepositions file" << std::endl; | ||
| 765 | print_usage(); | ||
| 766 | } | ||
| 767 | |||
| 768 | for (;;) | ||
| 769 | { | ||
| 770 | std::string line; | ||
| 771 | if (!getline(prepfile, line)) | ||
| 772 | { | ||
| 773 | break; | ||
| 774 | } | ||
| 775 | |||
| 776 | if (line.back() == '\r') | ||
| 777 | { | ||
| 778 | line.pop_back(); | ||
| 779 | } | ||
| 780 | |||
| 781 | std::regex relation("^([^:]+): (.+)"); | ||
| 782 | std::smatch relation_data; | ||
| 783 | std::regex_search(line, relation_data, relation); | ||
| 784 | std::string prep = relation_data[1]; | ||
| 785 | std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", "); | ||
| 786 | |||
| 787 | std::string query("INSERT INTO prepositions (form) VALUES (?)"); | ||
| 788 | sqlite3_stmt* ppstmt; | ||
| 789 | |||
| 790 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 791 | { | ||
| 792 | db_error(ppdb, query); | ||
| 793 | } | ||
| 794 | |||
| 795 | sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT); | ||
| 796 | |||
| 797 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 798 | { | ||
| 799 | db_error(ppdb, query); | ||
| 800 | } | ||
| 801 | |||
| 802 | sqlite3_finalize(ppstmt); | ||
| 803 | |||
| 804 | query = "SELECT last_insert_rowid()"; | ||
| 805 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 806 | { | ||
| 807 | db_error(ppdb, query); | ||
| 808 | } | ||
| 809 | |||
| 810 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
| 811 | { | ||
| 812 | db_error(ppdb, query); | ||
| 813 | } | ||
| 814 | |||
| 815 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
| 816 | sqlite3_finalize(ppstmt); | ||
| 817 | |||
| 818 | for (auto group : groups) | ||
| 819 | { | ||
| 820 | query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)"; | ||
| 821 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 822 | { | ||
| 823 | db_error(ppdb, query); | ||
| 824 | } | ||
| 825 | 256 | ||
| 826 | sqlite3_bind_int(ppstmt, 1, rowid); | 257 | std::ifstream file(imageNetPath_); |
| 827 | sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); | 258 | if (!file) |
| 828 | |||
| 829 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 830 | { | 259 | { |
| 831 | db_error(ppdb, query); | 260 | throw std::invalid_argument("Could not find file " + imageNetPath_); |
| 832 | } | 261 | } |
| 833 | |||
| 834 | sqlite3_finalize(ppstmt); | ||
| 835 | } | ||
| 836 | } | ||
| 837 | |||
| 838 | 262 | ||
| 839 | { | 263 | std::string line; |
| 840 | progress ppgs("Writing verbs...", verbs.size()); | 264 | while (std::getline(file, line)) |
| 841 | for (auto& mapping : verbs) | ||
| 842 | { | ||
| 843 | sqlite3_stmt* ppstmt; | ||
| 844 | std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); | ||
| 845 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 846 | { | ||
| 847 | db_error(ppdb, query); | ||
| 848 | } | ||
| 849 | |||
| 850 | sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT); | ||
| 851 | sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT); | ||
| 852 | sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT); | ||
| 853 | sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT); | ||
| 854 | sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT); | ||
| 855 | |||
| 856 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 857 | { | ||
| 858 | db_error(ppdb, query); | ||
| 859 | } | ||
| 860 | |||
| 861 | sqlite3_finalize(ppstmt); | ||
| 862 | |||
| 863 | std::string canonical(mapping.second.infinitive); | ||
| 864 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
| 865 | if (pronunciations.count(canonical) == 1) | ||
| 866 | { | 265 | { |
| 867 | query = "SELECT last_insert_rowid()"; | 266 | if (line.back() == '\r') |
| 868 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 869 | { | 267 | { |
| 870 | db_error(ppdb, query); | 268 | line.pop_back(); |
| 871 | } | 269 | } |
| 872 | 270 | ||
| 873 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | 271 | std::string wnid_s = line.substr(1, 8); |
| 272 | int wnid = stoi(wnid_s) + 100000000; | ||
| 273 | if (notionByWnid_.count(wnid)) | ||
| 874 | { | 274 | { |
| 875 | db_error(ppdb, query); | 275 | // We know that this notion has a wnid and is a noun. |
| 876 | } | 276 | notionByWnid_.at(wnid)->incrementNumOfImages(); |
| 877 | |||
| 878 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
| 879 | |||
| 880 | sqlite3_finalize(ppstmt); | ||
| 881 | |||
| 882 | mapping.second.id = rowid; | ||
| 883 | |||
| 884 | for (auto pronunciation : pronunciations[canonical]) | ||
| 885 | { | ||
| 886 | if (!pronunciation.rhyme.empty()) | ||
| 887 | { | ||
| 888 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | ||
| 889 | } else { | ||
| 890 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | ||
| 891 | } | ||
| 892 | |||
| 893 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 894 | { | ||
| 895 | db_error(ppdb, query); | ||
| 896 | } | ||
| 897 | |||
| 898 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
| 899 | sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); | ||
| 900 | sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); | ||
| 901 | sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); | ||
| 902 | |||
| 903 | if (!pronunciation.rhyme.empty()) | ||
| 904 | { | ||
| 905 | sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); | ||
| 906 | sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); | ||
| 907 | } | ||
| 908 | |||
| 909 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 910 | { | ||
| 911 | db_error(ppdb, query); | ||
| 912 | } | ||
| 913 | |||
| 914 | sqlite3_finalize(ppstmt); | ||
| 915 | } | 277 | } |
| 916 | } | 278 | } |
| 917 | |||
| 918 | ppgs.update(); | ||
| 919 | } | 279 | } |
| 920 | } | 280 | |
| 921 | 281 | void generator::readWordNetSenseKeys() | |
| 922 | { | ||
| 923 | progress ppgs("Writing verb frames...", groups.size()); | ||
| 924 | for (auto& mapping : groups) | ||
| 925 | { | 282 | { |
| 926 | std::list<json> roledatal; | 283 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl")); |
| 927 | std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { | 284 | progress ppgs("Reading sense keys from WordNet...", lines.size()); |
| 928 | json role; | ||
| 929 | role["type"] = r.first; | ||
| 930 | role["selrestrs"] = export_selrestrs(r.second); | ||
| 931 | |||
| 932 | return role; | ||
| 933 | }); | ||
| 934 | |||
| 935 | json roledata(roledatal); | ||
| 936 | std::string rdm = roledata.dump(); | ||
| 937 | |||
| 938 | sqlite3_stmt* ppstmt; | ||
| 939 | std::string query("INSERT INTO groups (data) VALUES (?)"); | ||
| 940 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 941 | { | ||
| 942 | db_error(ppdb, query); | ||
| 943 | } | ||
| 944 | |||
| 945 | sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT); | ||
| 946 | |||
| 947 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 948 | { | ||
| 949 | db_error(ppdb, query); | ||
| 950 | } | ||
| 951 | 285 | ||
| 952 | sqlite3_finalize(ppstmt); | 286 | for (std::string line : lines) |
| 953 | |||
| 954 | query = "SELECT last_insert_rowid()"; | ||
| 955 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 956 | { | ||
| 957 | db_error(ppdb, query); | ||
| 958 | } | ||
| 959 | |||
| 960 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
| 961 | { | ||
| 962 | db_error(ppdb, query); | ||
| 963 | } | ||
| 964 | |||
| 965 | int gid = sqlite3_column_int(ppstmt, 0); | ||
| 966 | sqlite3_finalize(ppstmt); | ||
| 967 | |||
| 968 | for (auto frame : mapping.second.frames) | ||
| 969 | { | 287 | { |
| 970 | std::list<json> fdatap; | 288 | ppgs.update(); |
| 971 | std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) { | ||
| 972 | json part; | ||
| 973 | |||
| 974 | switch (fp.type) | ||
| 975 | { | ||
| 976 | case framepart_t::type_t::np: | ||
| 977 | { | ||
| 978 | part["type"] = "np"; | ||
| 979 | part["role"] = fp.role; | ||
| 980 | part["selrestrs"] = export_selrestrs(fp.selrestrs); | ||
| 981 | part["synrestrs"] = fp.synrestrs; | ||
| 982 | |||
| 983 | break; | ||
| 984 | } | ||
| 985 | |||
| 986 | case framepart_t::type_t::pp: | ||
| 987 | { | ||
| 988 | part["type"] = "pp"; | ||
| 989 | part["values"] = fp.choices; | ||
| 990 | part["preprestrs"] = fp.preprestrs; | ||
| 991 | |||
| 992 | break; | ||
| 993 | } | ||
| 994 | |||
| 995 | case framepart_t::type_t::v: | ||
| 996 | { | ||
| 997 | part["type"] = "v"; | ||
| 998 | |||
| 999 | break; | ||
| 1000 | } | ||
| 1001 | |||
| 1002 | case framepart_t::type_t::adj: | ||
| 1003 | { | ||
| 1004 | part["type"] = "adj"; | ||
| 1005 | |||
| 1006 | break; | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | case framepart_t::type_t::adv: | ||
| 1010 | { | ||
| 1011 | part["type"] = "adv"; | ||
| 1012 | |||
| 1013 | break; | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | case framepart_t::type_t::lex: | ||
| 1017 | { | ||
| 1018 | part["type"] = "lex"; | ||
| 1019 | part["value"] = fp.lexval; | ||
| 1020 | |||
| 1021 | break; | ||
| 1022 | } | ||
| 1023 | } | ||
| 1024 | |||
| 1025 | return part; | ||
| 1026 | }); | ||
| 1027 | |||
| 1028 | json fdata(fdatap); | ||
| 1029 | std::string marshall = fdata.dump(); | ||
| 1030 | |||
| 1031 | query = "INSERT INTO frames (group_id, data) VALUES (?, ?)"; | ||
| 1032 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 1033 | { | ||
| 1034 | db_error(ppdb, query); | ||
| 1035 | } | ||
| 1036 | |||
| 1037 | sqlite3_bind_int(ppstmt, 1, gid); | ||
| 1038 | sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT); | ||
| 1039 | 289 | ||
| 1040 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 290 | // We only actually need to lookup verbs by sense key so we'll just |
| 291 | // ignore everything that isn't a verb. | ||
| 292 | std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$"); | ||
| 293 | std::smatch relation_data; | ||
| 294 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1041 | { | 295 | { |
| 1042 | db_error(ppdb, query); | 296 | continue; |
| 1043 | } | 297 | } |
| 298 | |||
| 299 | int synset_id = stoi(relation_data[1]); | ||
| 300 | int wnum = stoi(relation_data[2]); | ||
| 301 | std::string sense_key = relation_data[3]; | ||
| 1044 | 302 | ||
| 1045 | sqlite3_finalize(ppstmt); | 303 | // We are treating this mapping as injective, which is not entirely |
| 1046 | } | 304 | // accurate. First, the WordNet table contains duplicate rows, so those |
| 1047 | 305 | // need to be ignored. More importantly, a small number of sense keys | |
| 1048 | for (auto member : mapping.second.members) | 306 | // (one for each letter of the Latin alphabet, plus 9 other words) each |
| 1049 | { | 307 | // map to two different words in the same synset which differ only by |
| 1050 | if (verbs.count(member) == 1) | 308 | // capitalization. Luckily, none of these exceptions are verbs, so we |
| 309 | // can pretend that the mapping is injective. | ||
| 310 | if (!wnSenseKeys_.count(sense_key)) | ||
| 1051 | { | 311 | { |
| 1052 | auto& v = verbs[member]; | 312 | std::pair<int, int> lookup(synset_id, wnum); |
| 1053 | 313 | if (wordByWnidAndWnum_.count(lookup)) | |
| 1054 | query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)"; | ||
| 1055 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 1056 | { | ||
| 1057 | db_error(ppdb, query); | ||
| 1058 | } | ||
| 1059 | |||
| 1060 | sqlite3_bind_int(ppstmt, 1, v.id); | ||
| 1061 | sqlite3_bind_int(ppstmt, 2, gid); | ||
| 1062 | |||
| 1063 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 1064 | { | 314 | { |
| 1065 | db_error(ppdb, query); | 315 | wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup); |
| 1066 | } | 316 | } |
| 1067 | |||
| 1068 | sqlite3_finalize(ppstmt); | ||
| 1069 | } | 317 | } |
| 1070 | } | 318 | } |
| 1071 | |||
| 1072 | ppgs.update(); | ||
| 1073 | } | 319 | } |
| 1074 | } | 320 | |
| 1075 | 321 | void generator::readVerbNet() | |
| 1076 | // Get nouns/adjectives/adverbs from WordNet | ||
| 1077 | // Useful relations: | ||
| 1078 | // - s: master list | ||
| 1079 | // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) | ||
| 1080 | // - at: variation (e.g. a measurement can be standard or nonstandard) | ||
| 1081 | // - der: derivation (e.g. happy/happily, happily/happy) | ||
| 1082 | // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) | ||
| 1083 | // - ins: instantiation (do we need this? let's see) | ||
| 1084 | // - mm: member meronymy/holonymy (e.g. family/mother, family/child) | ||
| 1085 | // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) | ||
| 1086 | // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) | ||
| 1087 | // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) | ||
| 1088 | // mannernymy (e.g. something done quickly is done in a manner that is quick) | ||
| 1089 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) | ||
| 1090 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) | ||
| 1091 | // - syntax: positioning flags for some adjectives | ||
| 1092 | std::string wnpref {argv[3]}; | ||
| 1093 | if (wnpref.back() != '/') | ||
| 1094 | { | ||
| 1095 | wnpref += '/'; | ||
| 1096 | } | ||
| 1097 | |||
| 1098 | // s table | ||
| 1099 | { | ||
| 1100 | std::ifstream wnsfile(wnpref + "wn_s.pl"); | ||
| 1101 | if (!wnsfile.is_open()) | ||
| 1102 | { | 322 | { |
| 1103 | std::cout << "Invalid WordNet data directory." << std::endl; | 323 | std::cout << "Reading frames from VerbNet..." << std::endl; |
| 1104 | print_usage(); | ||
| 1105 | } | ||
| 1106 | 324 | ||
| 1107 | std::list<std::string> lines; | 325 | DIR* dir; |
| 1108 | for (;;) | 326 | if ((dir = opendir(verbNetPath_.c_str())) == nullptr) |
| 1109 | { | ||
| 1110 | std::string line; | ||
| 1111 | if (!getline(wnsfile, line)) | ||
| 1112 | { | 327 | { |
| 1113 | break; | 328 | throw std::invalid_argument("Invalid VerbNet data directory"); |
| 1114 | } | 329 | } |
| 1115 | 330 | ||
| 1116 | if (line.back() == '\r') | 331 | struct dirent* ent; |
| 1117 | { | 332 | while ((ent = readdir(dir)) != nullptr) |
| 1118 | line.pop_back(); | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | lines.push_back(line); | ||
| 1122 | } | ||
| 1123 | |||
| 1124 | progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); | ||
| 1125 | for (auto line : lines) | ||
| 1126 | { | ||
| 1127 | ppgs.update(); | ||
| 1128 | |||
| 1129 | std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$"); | ||
| 1130 | std::smatch relation_data; | ||
| 1131 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1132 | { | 333 | { |
| 1133 | continue; | 334 | std::string filename(verbNetPath_); |
| 1134 | } | 335 | |
| 336 | if (filename.back() != '/') | ||
| 337 | { | ||
| 338 | filename += '/'; | ||
| 339 | } | ||
| 1135 | 340 | ||
| 1136 | int synset_id = stoi(relation_data[1]); | 341 | filename += ent->d_name; |
| 1137 | int wnum = stoi(relation_data[2]); | ||
| 1138 | std::string word = relation_data[3]; | ||
| 1139 | size_t word_it; | ||
| 1140 | while ((word_it = word.find("''")) != std::string::npos) | ||
| 1141 | { | ||
| 1142 | word.erase(word_it, 1); | ||
| 1143 | } | ||
| 1144 | 342 | ||
| 1145 | std::string query; | 343 | if (filename.rfind(".xml") != filename.size() - 4) |
| 1146 | switch (synset_id / 100000000) | ||
| 1147 | { | ||
| 1148 | case 1: // Noun | ||
| 1149 | { | 344 | { |
| 1150 | if (nouns.count(word) == 1) | 345 | continue; |
| 1151 | { | ||
| 1152 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)"; | ||
| 1153 | } else { | ||
| 1154 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)"; | ||
| 1155 | } | ||
| 1156 | |||
| 1157 | break; | ||
| 1158 | } | 346 | } |
| 1159 | 347 | ||
| 1160 | case 2: // Verb | 348 | xmlDocPtr doc = xmlParseFile(filename.c_str()); |
| 349 | if (doc == nullptr) | ||
| 1161 | { | 350 | { |
| 1162 | // Ignore | 351 | throw std::logic_error("Error opening " + filename); |
| 1163 | |||
| 1164 | break; | ||
| 1165 | } | 352 | } |
| 1166 | 353 | ||
| 1167 | case 3: // Adjective | 354 | xmlNodePtr top = xmlDocGetRootElement(doc); |
| 355 | if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS")))) | ||
| 1168 | { | 356 | { |
| 1169 | if (adjectives.count(word) == 1) | 357 | throw std::logic_error("Bad VerbNet file format: " + filename); |
| 1170 | { | ||
| 1171 | query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; | ||
| 1172 | } else { | ||
| 1173 | query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)"; | ||
| 1174 | } | ||
| 1175 | |||
| 1176 | break; | ||
| 1177 | } | 358 | } |
| 1178 | 359 | ||
| 1179 | case 4: // Adverb | 360 | try |
| 1180 | { | 361 | { |
| 1181 | if (adjectives.count(word) == 1) | 362 | createGroup(top); |
| 1182 | { | 363 | } catch (const std::exception& e) |
| 1183 | query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; | 364 | { |
| 1184 | } else { | 365 | std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); |
| 1185 | query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)"; | ||
| 1186 | } | ||
| 1187 | |||
| 1188 | break; | ||
| 1189 | } | 366 | } |
| 1190 | } | 367 | } |
| 368 | |||
| 369 | closedir(dir); | ||
| 370 | } | ||
| 1191 | 371 | ||
| 1192 | sqlite3_stmt* ppstmt; | 372 | void generator::readAgidInflections() |
| 1193 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | 373 | { |
| 374 | std::list<std::string> lines(readFile(agidPath_)); | ||
| 375 | progress ppgs("Reading inflections from AGID...", lines.size()); | ||
| 376 | |||
| 377 | for (std::string line : lines) | ||
| 1194 | { | 378 | { |
| 1195 | db_error(ppdb, query); | 379 | ppgs.update(); |
| 1196 | } | 380 | |
| 381 | int divider = line.find_first_of(" "); | ||
| 382 | std::string infinitive = line.substr(0, divider); | ||
| 383 | line = line.substr(divider+1); | ||
| 384 | char type = line[0]; | ||
| 1197 | 385 | ||
| 1198 | sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); | 386 | if (line[1] == '?') |
| 1199 | switch (synset_id / 100000000) | ||
| 1200 | { | ||
| 1201 | case 1: // Noun | ||
| 1202 | { | 387 | { |
| 1203 | sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { | 388 | line.erase(0, 4); |
| 1204 | return isupper(ch); | 389 | } else { |
| 1205 | }) ? 1 : 0)); | 390 | line.erase(0, 3); |
| 1206 | |||
| 1207 | sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); | ||
| 1208 | sqlite3_bind_int(ppstmt, 4, images[synset_id]); | ||
| 1209 | sqlite3_bind_int(ppstmt, 5, synset_id); | ||
| 1210 | |||
| 1211 | if (nouns.count(word) == 1) | ||
| 1212 | { | ||
| 1213 | sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT); | ||
| 1214 | } | ||
| 1215 | |||
| 1216 | break; | ||
| 1217 | } | 391 | } |
| 1218 | 392 | ||
| 1219 | case 3: // Adjective | 393 | if (!lemmaByBaseForm_.count(infinitive) && (type != 'V')) |
| 1220 | case 4: // Adverb | ||
| 1221 | { | 394 | { |
| 1222 | sqlite3_bind_int(ppstmt, 2, verbly::split<std::list<std::string>>(word, " ").size()); | 395 | continue; |
| 1223 | 396 | } | |
| 1224 | if (adjectives.count(word) == 1) | 397 | |
| 398 | lemma& curLemma = lookupOrCreateLemma(infinitive); | ||
| 399 | |||
| 400 | auto forms = split<std::vector<std::string>>(line, " | "); | ||
| 401 | for (std::string& inflForm : forms) | ||
| 402 | { | ||
| 403 | int sympos = inflForm.find_first_of(",?"); | ||
| 404 | if (sympos != std::string::npos) | ||
| 1225 | { | 405 | { |
| 1226 | sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); | 406 | inflForm = inflForm.substr(0, sympos); |
| 1227 | sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT); | ||
| 1228 | } | 407 | } |
| 1229 | |||
| 1230 | break; | ||
| 1231 | } | 408 | } |
| 1232 | } | ||
| 1233 | 409 | ||
| 1234 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 410 | switch (type) |
| 1235 | { | ||
| 1236 | db_error(ppdb, query); | ||
| 1237 | } | ||
| 1238 | |||
| 1239 | sqlite3_finalize(ppstmt); | ||
| 1240 | |||
| 1241 | query = "SELECT last_insert_rowid()"; | ||
| 1242 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 1243 | { | ||
| 1244 | db_error(ppdb, query); | ||
| 1245 | } | ||
| 1246 | |||
| 1247 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
| 1248 | { | ||
| 1249 | db_error(ppdb, query); | ||
| 1250 | } | ||
| 1251 | |||
| 1252 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
| 1253 | wn[synset_id][wnum] = rowid; | ||
| 1254 | |||
| 1255 | sqlite3_finalize(ppstmt); | ||
| 1256 | |||
| 1257 | std::string canonical(word); | ||
| 1258 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
| 1259 | if (pronunciations.count(canonical) == 1) | ||
| 1260 | { | ||
| 1261 | for (auto pronunciation : pronunciations[canonical]) | ||
| 1262 | { | 411 | { |
| 1263 | switch (synset_id / 100000000) | 412 | case 'V': |
| 1264 | { | 413 | { |
| 1265 | case 1: // Noun | 414 | if (forms.size() == 4) |
| 1266 | { | 415 | { |
| 1267 | if (!pronunciation.rhyme.empty()) | 416 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
| 1268 | { | 417 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1])); |
| 1269 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | 418 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2])); |
| 1270 | } else { | 419 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3])); |
| 1271 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | 420 | } else if (forms.size() == 3) |
| 1272 | } | ||
| 1273 | |||
| 1274 | break; | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | case 3: // Adjective | ||
| 1278 | { | 421 | { |
| 1279 | if (!pronunciation.rhyme.empty()) | 422 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
| 1280 | { | 423 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0])); |
| 1281 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | 424 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1])); |
| 1282 | } else { | 425 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2])); |
| 1283 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | 426 | } else if (forms.size() == 8) |
| 1284 | } | 427 | { |
| 1285 | 428 | // As of AGID 2014.08.11, this is only "to be" | |
| 1286 | break; | 429 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
| 430 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2])); | ||
| 431 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3])); | ||
| 432 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4])); | ||
| 433 | } else { | ||
| 434 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
| 435 | // - may and shall do not conjugate the way we want them to | ||
| 436 | // - methinks only has a past tense and is an outlier | ||
| 437 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
| 438 | std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | ||
| 1287 | } | 439 | } |
| 1288 | 440 | ||
| 1289 | case 4: // Adverb | 441 | // For verbs in particular, we sometimes create a notion and a word |
| 442 | // from inflection data. Specifically, if there are not yet any | ||
| 443 | // verbs existing that have the same infinitive form. "Yet" means | ||
| 444 | // that this verb appears in the AGID data but not in either WordNet | ||
| 445 | // or VerbNet. | ||
| 446 | if (!wordsByBaseForm_.count(infinitive) | ||
| 447 | || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) { | ||
| 448 | return w->getNotion().getPartOfSpeech() == part_of_speech::verb; | ||
| 449 | })) | ||
| 1290 | { | 450 | { |
| 1291 | if (!pronunciation.rhyme.empty()) | 451 | notion& n = createNotion(part_of_speech::verb); |
| 1292 | { | 452 | createWord(n, curLemma); |
| 1293 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | ||
| 1294 | } else { | ||
| 1295 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | ||
| 1296 | } | ||
| 1297 | |||
| 1298 | break; | ||
| 1299 | } | 453 | } |
| 1300 | } | ||
| 1301 | |||
| 1302 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 1303 | { | ||
| 1304 | db_error(ppdb, query); | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
| 1308 | sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); | ||
| 1309 | sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); | ||
| 1310 | sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); | ||
| 1311 | |||
| 1312 | if (!pronunciation.rhyme.empty()) | ||
| 1313 | { | ||
| 1314 | sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); | ||
| 1315 | sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); | ||
| 1316 | } | ||
| 1317 | 454 | ||
| 1318 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 455 | break; |
| 1319 | { | ||
| 1320 | db_error(ppdb, query); | ||
| 1321 | } | 456 | } |
| 1322 | |||
| 1323 | sqlite3_finalize(ppstmt); | ||
| 1324 | } | ||
| 1325 | } | ||
| 1326 | } | ||
| 1327 | } | ||
| 1328 | |||
| 1329 | // While we're working on s | ||
| 1330 | { | ||
| 1331 | progress ppgs("Writing word synonyms...", wn.size()); | ||
| 1332 | for (auto sense : wn) | ||
| 1333 | { | ||
| 1334 | ppgs.update(); | ||
| 1335 | 457 | ||
| 1336 | for (auto word1 : sense.second) | 458 | case 'A': |
| 1337 | { | ||
| 1338 | for (auto word2 : sense.second) | ||
| 1339 | { | ||
| 1340 | if (word1 != word2) | ||
| 1341 | { | 459 | { |
| 1342 | std::string query; | 460 | if (forms.size() == 2) |
| 1343 | switch (sense.first / 100000000) | ||
| 1344 | { | 461 | { |
| 1345 | case 1: // Noun | 462 | curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0])); |
| 1346 | { | 463 | curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1])); |
| 1347 | query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | 464 | } else { |
| 1348 | 465 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | |
| 1349 | break; | 466 | std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; |
| 1350 | } | 467 | } |
| 1351 | |||
| 1352 | case 2: // Verb | ||
| 1353 | { | ||
| 1354 | // Ignore | ||
| 1355 | |||
| 1356 | break; | ||
| 1357 | } | ||
| 1358 | |||
| 1359 | case 3: // Adjective | ||
| 1360 | { | ||
| 1361 | query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
| 1362 | 468 | ||
| 1363 | break; | 469 | break; |
| 1364 | } | 470 | } |
| 1365 | 471 | ||
| 1366 | case 4: // Adverb | 472 | case 'N': |
| 1367 | { | 473 | { |
| 1368 | query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | 474 | if (forms.size() == 1) |
| 1369 | |||
| 1370 | break; | ||
| 1371 | } | ||
| 1372 | } | ||
| 1373 | |||
| 1374 | sqlite3_stmt* ppstmt; | ||
| 1375 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 1376 | { | ||
| 1377 | db_error(ppdb, query); | ||
| 1378 | } | ||
| 1379 | |||
| 1380 | sqlite3_bind_int(ppstmt, 1, word1.second); | ||
| 1381 | sqlite3_bind_int(ppstmt, 2, word2.second); | ||
| 1382 | |||
| 1383 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 1384 | { | 475 | { |
| 1385 | db_error(ppdb, query); | 476 | curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0])); |
| 477 | } else { | ||
| 478 | // As of AGID 2014.08.11, this is non-existent. | ||
| 479 | std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | ||
| 1386 | } | 480 | } |
| 1387 | 481 | ||
| 1388 | sqlite3_finalize(ppstmt); | 482 | break; |
| 1389 | } | 483 | } |
| 1390 | } | 484 | } |
| 1391 | } | 485 | } |
| 1392 | } | 486 | } |
| 1393 | } | ||
| 1394 | |||
| 1395 | // ant table | ||
| 1396 | { | ||
| 1397 | std::ifstream wnantfile(wnpref + "wn_ant.pl"); | ||
| 1398 | if (!wnantfile.is_open()) | ||
| 1399 | { | ||
| 1400 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 1401 | print_usage(); | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | std::list<std::string> lines; | ||
| 1405 | for (;;) | ||
| 1406 | { | ||
| 1407 | std::string line; | ||
| 1408 | if (!getline(wnantfile, line)) | ||
| 1409 | { | ||
| 1410 | break; | ||
| 1411 | } | ||
| 1412 | 487 | ||
| 1413 | if (line.back() == '\r') | 488 | void generator::readPrepositions() |
| 1414 | { | ||
| 1415 | line.pop_back(); | ||
| 1416 | } | ||
| 1417 | |||
| 1418 | lines.push_back(line); | ||
| 1419 | } | ||
| 1420 | |||
| 1421 | progress ppgs("Writing antonyms...", lines.size()); | ||
| 1422 | for (auto line : lines) | ||
| 1423 | { | 489 | { |
| 1424 | ppgs.update(); | 490 | std::list<std::string> lines(readFile("prepositions.txt")); |
| 491 | progress ppgs("Reading prepositions...", lines.size()); | ||
| 1425 | 492 | ||
| 1426 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | 493 | for (std::string line : lines) |
| 1427 | std::smatch relation_data; | ||
| 1428 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1429 | { | ||
| 1430 | continue; | ||
| 1431 | } | ||
| 1432 | |||
| 1433 | int synset_id_1 = stoi(relation_data[1]); | ||
| 1434 | int wnum_1 = stoi(relation_data[2]); | ||
| 1435 | int synset_id_2 = stoi(relation_data[3]); | ||
| 1436 | int wnum_2 = stoi(relation_data[4]); | ||
| 1437 | |||
| 1438 | std::string query; | ||
| 1439 | switch (synset_id_1 / 100000000) | ||
| 1440 | { | 494 | { |
| 1441 | case 1: // Noun | 495 | ppgs.update(); |
| 1442 | { | ||
| 1443 | query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
| 1444 | 496 | ||
| 1445 | break; | 497 | std::regex relation("^([^:]+): (.+)"); |
| 1446 | } | 498 | std::smatch relation_data; |
| 1447 | 499 | std::regex_search(line, relation_data, relation); | |
| 1448 | case 2: // Verb | 500 | std::string prep = relation_data[1]; |
| 1449 | { | 501 | auto groups = split<std::list<std::string>>(relation_data[2], ", "); |
| 1450 | // Ignore | ||
| 1451 | 502 | ||
| 1452 | break; | 503 | notion& n = createNotion(part_of_speech::preposition); |
| 1453 | } | 504 | lemma& l = lookupOrCreateLemma(prep); |
| 1454 | 505 | word& w = createWord(n, l); | |
| 1455 | case 3: // Adjective | ||
| 1456 | { | ||
| 1457 | query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
| 1458 | 506 | ||
| 1459 | break; | 507 | n.setPrepositionGroups(groups); |
| 1460 | } | ||
| 1461 | |||
| 1462 | case 4: // Adverb | ||
| 1463 | { | ||
| 1464 | query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
| 1465 | |||
| 1466 | break; | ||
| 1467 | } | ||
| 1468 | } | ||
| 1469 | |||
| 1470 | sqlite3_stmt* ppstmt; | ||
| 1471 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 1472 | { | ||
| 1473 | db_error(ppdb, query); | ||
| 1474 | } | ||
| 1475 | |||
| 1476 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
| 1477 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
| 1478 | |||
| 1479 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 1480 | { | ||
| 1481 | db_error(ppdb, query); | ||
| 1482 | } | ||
| 1483 | |||
| 1484 | sqlite3_finalize(ppstmt); | ||
| 1485 | } | ||
| 1486 | } | ||
| 1487 | |||
| 1488 | // at table | ||
| 1489 | { | ||
| 1490 | std::ifstream wnatfile(wnpref + "wn_at.pl"); | ||
| 1491 | if (!wnatfile.is_open()) | ||
| 1492 | { | ||
| 1493 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 1494 | print_usage(); | ||
| 1495 | } | ||
| 1496 | |||
| 1497 | std::list<std::string> lines; | ||
| 1498 | for (;;) | ||
| 1499 | { | ||
| 1500 | std::string line; | ||
| 1501 | if (!getline(wnatfile, line)) | ||
| 1502 | { | ||
| 1503 | break; | ||
| 1504 | } | 508 | } |
| 1505 | |||
| 1506 | if (line.back() == '\r') | ||
| 1507 | { | ||
| 1508 | line.pop_back(); | ||
| 1509 | } | ||
| 1510 | |||
| 1511 | lines.push_back(line); | ||
| 1512 | } | 509 | } |
| 1513 | 510 | ||
| 1514 | progress ppgs("Writing variations...", lines.size()); | 511 | void generator::readCmudictPronunciations() |
| 1515 | for (auto line : lines) | ||
| 1516 | { | 512 | { |
| 1517 | ppgs.update(); | 513 | std::list<std::string> lines(readFile(cmudictPath_)); |
| 514 | progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); | ||
| 1518 | 515 | ||
| 1519 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); | 516 | for (std::string line : lines) |
| 1520 | std::smatch relation_data; | ||
| 1521 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1522 | { | 517 | { |
| 1523 | continue; | 518 | ppgs.update(); |
| 1524 | } | 519 | |
| 1525 | 520 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | |
| 1526 | int synset_id_1 = stoi(relation_data[1]); | 521 | std::smatch phoneme_data; |
| 1527 | int synset_id_2 = stoi(relation_data[2]); | 522 | if (std::regex_search(line, phoneme_data, phoneme)) |
| 1528 | std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); | ||
| 1529 | |||
| 1530 | for (auto mapping1 : wn[synset_id_1]) | ||
| 1531 | { | ||
| 1532 | for (auto mapping2 : wn[synset_id_2]) | ||
| 1533 | { | 523 | { |
| 1534 | sqlite3_stmt* ppstmt; | 524 | std::string canonical(phoneme_data[1]); |
| 1535 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 525 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); |
| 1536 | { | ||
| 1537 | db_error(ppdb, query); | ||
| 1538 | } | ||
| 1539 | |||
| 1540 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
| 1541 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
| 1542 | 526 | ||
| 1543 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 527 | if (!formByText_.count(canonical)) |
| 1544 | { | 528 | { |
| 1545 | db_error(ppdb, query); | 529 | continue; |
| 1546 | } | 530 | } |
| 1547 | 531 | ||
| 1548 | sqlite3_finalize(ppstmt); | 532 | std::string phonemes = phoneme_data[2]; |
| 533 | pronunciations_.emplace_back(phonemes); | ||
| 534 | pronunciation& p = pronunciations_.back(); | ||
| 535 | formByText_.at(canonical)->addPronunciation(p); | ||
| 1549 | } | 536 | } |
| 1550 | } | 537 | } |
| 1551 | } | 538 | } |
| 1552 | } | ||
| 1553 | |||
| 1554 | // der table | ||
| 1555 | { | ||
| 1556 | std::ifstream wnderfile(wnpref + "wn_der.pl"); | ||
| 1557 | if (!wnderfile.is_open()) | ||
| 1558 | { | ||
| 1559 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 1560 | print_usage(); | ||
| 1561 | } | ||
| 1562 | 539 | ||
| 1563 | std::list<std::string> lines; | 540 | void generator::writeSchema() |
| 1564 | for (;;) | ||
| 1565 | { | 541 | { |
| 1566 | std::string line; | 542 | std::ifstream file("schema.sql"); |
| 1567 | if (!getline(wnderfile, line)) | 543 | if (!file) |
| 1568 | { | 544 | { |
| 1569 | break; | 545 | throw std::invalid_argument("Could not find database schema"); |
| 1570 | } | 546 | } |
| 1571 | 547 | ||
| 1572 | if (line.back() == '\r') | 548 | std::ostringstream schemaBuilder; |
| 549 | std::string line; | ||
| 550 | while (std::getline(file, line)) | ||
| 1573 | { | 551 | { |
| 1574 | line.pop_back(); | 552 | if (line.back() == '\r') |
| 553 | { | ||
| 554 | line.pop_back(); | ||
| 555 | } | ||
| 556 | |||
| 557 | schemaBuilder << line; | ||
| 1575 | } | 558 | } |
| 1576 | 559 | ||
| 1577 | lines.push_back(line); | 560 | std::string schema = schemaBuilder.str(); |
| 561 | auto queries = split<std::list<std::string>>(schema, ";"); | ||
| 562 | progress ppgs("Writing database schema...", queries.size()); | ||
| 563 | for (std::string query : queries) | ||
| 564 | { | ||
| 565 | if (!queries.empty()) | ||
| 566 | { | ||
| 567 | db_.runQuery(query); | ||
| 568 | } | ||
| 569 | |||
| 570 | ppgs.update(); | ||
| 571 | } | ||
| 1578 | } | 572 | } |
| 1579 | 573 | ||
| 1580 | progress ppgs("Writing morphological derivation...", lines.size()); | 574 | void generator::dumpObjects() |
| 1581 | for (auto line : lines) | ||
| 1582 | { | 575 | { |
| 1583 | ppgs.update(); | ||
| 1584 | |||
| 1585 | std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | ||
| 1586 | std::smatch relation_data; | ||
| 1587 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1588 | { | 576 | { |
| 1589 | continue; | 577 | progress ppgs("Writing notions...", notions_.size()); |
| 578 | |||
| 579 | for (notion& n : notions_) | ||
| 580 | { | ||
| 581 | db_ << n; | ||
| 582 | |||
| 583 | ppgs.update(); | ||
| 584 | } | ||
| 1590 | } | 585 | } |
| 1591 | 586 | ||
| 1592 | int synset_id_1 = stoi(relation_data[1]); | ||
| 1593 | int wnum_1 = stoi(relation_data[2]); | ||
| 1594 | int synset_id_2 = stoi(relation_data[3]); | ||
| 1595 | int wnum_2 = stoi(relation_data[4]); | ||
| 1596 | std::string query; | ||
| 1597 | switch (synset_id_1 / 100000000) | ||
| 1598 | { | 587 | { |
| 1599 | case 1: // Noun | 588 | progress ppgs("Writing words...", words_.size()); |
| 589 | |||
| 590 | for (word& w : words_) | ||
| 1600 | { | 591 | { |
| 1601 | switch (synset_id_2 / 100000000) | 592 | db_ << w; |
| 1602 | { | ||
| 1603 | case 1: // Noun | ||
| 1604 | { | ||
| 1605 | query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
| 1606 | break; | ||
| 1607 | } | ||
| 1608 | |||
| 1609 | case 3: // Adjective | ||
| 1610 | { | ||
| 1611 | query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)"; | ||
| 1612 | break; | ||
| 1613 | } | ||
| 1614 | |||
| 1615 | case 4: // Adverb | ||
| 1616 | { | ||
| 1617 | query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)"; | ||
| 1618 | break; | ||
| 1619 | } | ||
| 1620 | } | ||
| 1621 | 593 | ||
| 1622 | break; | 594 | ppgs.update(); |
| 1623 | } | 595 | } |
| 596 | } | ||
| 597 | |||
| 598 | { | ||
| 599 | progress ppgs("Writing lemmas...", lemmas_.size()); | ||
| 1624 | 600 | ||
| 1625 | case 3: // Adjective | 601 | for (lemma& l : lemmas_) |
| 1626 | { | 602 | { |
| 1627 | switch (synset_id_2 / 100000000) | 603 | db_ << l; |
| 1628 | { | ||
| 1629 | case 1: // Noun | ||
| 1630 | { | ||
| 1631 | query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)"; | ||
| 1632 | break; | ||
| 1633 | } | ||
| 1634 | |||
| 1635 | case 3: // Adjective | ||
| 1636 | { | ||
| 1637 | query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)"; | ||
| 1638 | break; | ||
| 1639 | } | ||
| 1640 | |||
| 1641 | case 4: // Adverb | ||
| 1642 | { | ||
| 1643 | query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)"; | ||
| 1644 | break; | ||
| 1645 | } | ||
| 1646 | } | ||
| 1647 | 604 | ||
| 1648 | break; | 605 | ppgs.update(); |
| 1649 | } | 606 | } |
| 607 | } | ||
| 608 | |||
| 609 | { | ||
| 610 | progress ppgs("Writing forms...", forms_.size()); | ||
| 1650 | 611 | ||
| 1651 | case 4: // Adverb | 612 | for (form& f : forms_) |
| 1652 | { | 613 | { |
| 1653 | switch (synset_id_2 / 100000000) | 614 | db_ << f; |
| 1654 | { | ||
| 1655 | case 1: // Noun | ||
| 1656 | { | ||
| 1657 | query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)"; | ||
| 1658 | break; | ||
| 1659 | } | ||
| 1660 | |||
| 1661 | case 3: // Adjective | ||
| 1662 | { | ||
| 1663 | query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)"; | ||
| 1664 | break; | ||
| 1665 | } | ||
| 1666 | |||
| 1667 | case 4: // Adverb | ||
| 1668 | { | ||
| 1669 | query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
| 1670 | break; | ||
| 1671 | } | ||
| 1672 | } | ||
| 1673 | 615 | ||
| 1674 | break; | 616 | ppgs.update(); |
| 1675 | } | 617 | } |
| 1676 | } | 618 | } |
| 1677 | 619 | ||
| 1678 | sqlite3_stmt* ppstmt; | ||
| 1679 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
| 1680 | { | 620 | { |
| 1681 | db_error(ppdb, query); | 621 | progress ppgs("Writing pronunciations...", pronunciations_.size()); |
| 622 | |||
| 623 | for (pronunciation& p : pronunciations_) | ||
| 624 | { | ||
| 625 | db_ << p; | ||
| 626 | |||
| 627 | ppgs.update(); | ||
| 628 | } | ||
| 1682 | } | 629 | } |
| 1683 | 630 | ||
| 1684 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
| 1685 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
| 1686 | |||
| 1687 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 1688 | { | 631 | { |
| 1689 | db_error(ppdb, query); | 632 | progress ppgs("Writing verb groups...", groups_.size()); |
| 633 | |||
| 634 | for (group& g : groups_) | ||
| 635 | { | ||
| 636 | db_ << g; | ||
| 637 | |||
| 638 | ppgs.update(); | ||
| 639 | } | ||
| 1690 | } | 640 | } |
| 1691 | 641 | ||
| 1692 | sqlite3_finalize(ppstmt); | ||
| 1693 | } | ||
| 1694 | } | ||
| 1695 | |||
| 1696 | // hyp table | ||
| 1697 | { | ||
| 1698 | std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); | ||
| 1699 | if (!wnhypfile.is_open()) | ||
| 1700 | { | ||
| 1701 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 1702 | print_usage(); | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | std::list<std::string> lines; | ||
| 1706 | for (;;) | ||
| 1707 | { | ||
| 1708 | std::string line; | ||
| 1709 | if (!getline(wnhypfile, line)) | ||
| 1710 | { | ||
| 1711 | break; | ||
| 1712 | } | ||
| 1713 | |||
| 1714 | if (line.back() == '\r') | ||
| 1715 | { | 642 | { |
| 1716 | line.pop_back(); | 643 | progress ppgs("Writing verb frames...", frames_.size()); |
| 644 | |||
| 645 | for (frame& f : frames_) | ||
| 646 | { | ||
| 647 | db_ << f; | ||
| 648 | |||
| 649 | ppgs.update(); | ||
| 650 | } | ||
| 1717 | } | 651 | } |
| 1718 | |||
| 1719 | lines.push_back(line); | ||
| 1720 | } | 652 | } |
| 1721 | 653 | ||
| 1722 | progress ppgs("Writing hypernyms...", lines.size()); | 654 | void generator::readWordNetAntonymy() |
| 1723 | for (auto line : lines) | ||
| 1724 | { | 655 | { |
| 1725 | ppgs.update(); | 656 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); |
| 1726 | 657 | progress ppgs("Writing antonyms...", lines.size()); | |
| 1727 | std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); | 658 | for (auto line : lines) |
| 1728 | std::smatch relation_data; | ||
| 1729 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1730 | { | 659 | { |
| 1731 | continue; | 660 | ppgs.update(); |
| 1732 | } | ||
| 1733 | |||
| 1734 | int synset_id_1 = stoi(relation_data[1]); | ||
| 1735 | int synset_id_2 = stoi(relation_data[2]); | ||
| 1736 | std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); | ||
| 1737 | 661 | ||
| 1738 | for (auto mapping1 : wn[synset_id_1]) | 662 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); |
| 1739 | { | 663 | std::smatch relation_data; |
| 1740 | for (auto mapping2 : wn[synset_id_2]) | 664 | if (!std::regex_search(line, relation_data, relation)) |
| 1741 | { | 665 | { |
| 1742 | sqlite3_stmt* ppstmt; | 666 | continue; |
| 1743 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 667 | } |
| 1744 | { | 668 | |
| 1745 | db_error(ppdb, query); | 669 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); |
| 1746 | } | 670 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); |
| 1747 | 671 | ||
| 1748 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 672 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) |
| 1749 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 673 | { |
| 674 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
| 675 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
| 1750 | 676 | ||
| 1751 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 677 | std::list<field> fields; |
| 1752 | { | 678 | fields.emplace_back("antonym_1_id", word1.getId()); |
| 1753 | db_error(ppdb, query); | 679 | fields.emplace_back("antonym_2_id", word2.getId()); |
| 1754 | } | ||
| 1755 | 680 | ||
| 1756 | sqlite3_finalize(ppstmt); | 681 | db_.insertIntoTable("antonymy", std::move(fields)); |
| 1757 | } | 682 | } |
| 1758 | } | 683 | } |
| 1759 | } | 684 | } |
| 1760 | } | ||
| 1761 | |||
| 1762 | // ins table | ||
| 1763 | { | ||
| 1764 | std::ifstream wninsfile(wnpref + "wn_ins.pl"); | ||
| 1765 | if (!wninsfile.is_open()) | ||
| 1766 | { | ||
| 1767 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 1768 | print_usage(); | ||
| 1769 | } | ||
| 1770 | |||
| 1771 | std::list<std::string> lines; | ||
| 1772 | for (;;) | ||
| 1773 | { | ||
| 1774 | std::string line; | ||
| 1775 | if (!getline(wninsfile, line)) | ||
| 1776 | { | ||
| 1777 | break; | ||
| 1778 | } | ||
| 1779 | 685 | ||
| 1780 | if (line.back() == '\r') | 686 | void generator::readWordNetVariation() |
| 687 | { | ||
| 688 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl")); | ||
| 689 | progress ppgs("Writing variation...", lines.size()); | ||
| 690 | for (auto line : lines) | ||
| 1781 | { | 691 | { |
| 1782 | line.pop_back(); | 692 | ppgs.update(); |
| 1783 | } | ||
| 1784 | 693 | ||
| 1785 | lines.push_back(line); | 694 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); |
| 695 | std::smatch relation_data; | ||
| 696 | if (!std::regex_search(line, relation_data, relation)) | ||
| 697 | { | ||
| 698 | continue; | ||
| 699 | } | ||
| 700 | |||
| 701 | int lookup1 = std::stoi(relation_data[1]); | ||
| 702 | int lookup2 = std::stoi(relation_data[2]); | ||
| 703 | |||
| 704 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
| 705 | { | ||
| 706 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
| 707 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
| 708 | |||
| 709 | std::list<field> fields; | ||
| 710 | fields.emplace_back("noun_id", notion1.getId()); | ||
| 711 | fields.emplace_back("adjective_id", notion2.getId()); | ||
| 712 | |||
| 713 | db_.insertIntoTable("variation", std::move(fields)); | ||
| 714 | } | ||
| 715 | } | ||
| 1786 | } | 716 | } |
| 1787 | 717 | ||
| 1788 | progress ppgs("Writing instantiations...", lines.size()); | 718 | void generator::readWordNetClasses() |
| 1789 | for (auto line : lines) | ||
| 1790 | { | 719 | { |
| 1791 | ppgs.update(); | 720 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); |
| 1792 | 721 | progress ppgs("Writing usage, topicality, and regionality...", lines.size()); | |
| 1793 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); | 722 | for (auto line : lines) |
| 1794 | std::smatch relation_data; | ||
| 1795 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1796 | { | 723 | { |
| 1797 | continue; | 724 | ppgs.update(); |
| 1798 | } | ||
| 1799 | |||
| 1800 | int synset_id_1 = stoi(relation_data[1]); | ||
| 1801 | int synset_id_2 = stoi(relation_data[2]); | ||
| 1802 | std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); | ||
| 1803 | 725 | ||
| 1804 | for (auto mapping1 : wn[synset_id_1]) | 726 | std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); |
| 1805 | { | 727 | std::smatch relation_data; |
| 1806 | for (auto mapping2 : wn[synset_id_2]) | 728 | if (!std::regex_search(line, relation_data, relation)) |
| 729 | { | ||
| 730 | continue; | ||
| 731 | } | ||
| 732 | |||
| 733 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | ||
| 734 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | ||
| 735 | std::string class_type = relation_data[5]; | ||
| 736 | |||
| 737 | std::string table_name; | ||
| 738 | if (class_type == "t") | ||
| 739 | { | ||
| 740 | table_name += "topicality"; | ||
| 741 | } else if (class_type == "u") | ||
| 742 | { | ||
| 743 | table_name += "usage"; | ||
| 744 | } else if (class_type == "r") | ||
| 745 | { | ||
| 746 | table_name += "regionality"; | ||
| 747 | } | ||
| 748 | |||
| 749 | std::list<int> leftJoin; | ||
| 750 | std::list<int> rightJoin; | ||
| 751 | |||
| 752 | if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) | ||
| 1807 | { | 753 | { |
| 1808 | sqlite3_stmt* ppstmt; | 754 | std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { |
| 1809 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 755 | return w->getId(); |
| 756 | }); | ||
| 757 | } else if (wordByWnidAndWnum_.count(lookup1)) { | ||
| 758 | leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); | ||
| 759 | } | ||
| 760 | |||
| 761 | if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) | ||
| 762 | { | ||
| 763 | std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { | ||
| 764 | return w->getId(); | ||
| 765 | }); | ||
| 766 | } else if (wordByWnidAndWnum_.count(lookup2)) { | ||
| 767 | rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); | ||
| 768 | } | ||
| 769 | |||
| 770 | for (int word1 : leftJoin) | ||
| 771 | { | ||
| 772 | for (int word2 : rightJoin) | ||
| 1810 | { | 773 | { |
| 1811 | db_error(ppdb, query); | 774 | std::list<field> fields; |
| 1812 | } | 775 | fields.emplace_back("term_id", word1); |
| 776 | fields.emplace_back("domain_id", word2); | ||
| 1813 | 777 | ||
| 1814 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 778 | db_.insertIntoTable(table_name, std::move(fields)); |
| 1815 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
| 1816 | |||
| 1817 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 1818 | { | ||
| 1819 | db_error(ppdb, query); | ||
| 1820 | } | 779 | } |
| 1821 | |||
| 1822 | sqlite3_finalize(ppstmt); | ||
| 1823 | } | 780 | } |
| 1824 | } | 781 | } |
| 1825 | } | 782 | } |
| 1826 | } | ||
| 1827 | |||
| 1828 | // mm table | ||
| 1829 | { | ||
| 1830 | std::ifstream wnmmfile(wnpref + "wn_mm.pl"); | ||
| 1831 | if (!wnmmfile.is_open()) | ||
| 1832 | { | ||
| 1833 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 1834 | print_usage(); | ||
| 1835 | } | ||
| 1836 | |||
| 1837 | std::list<std::string> lines; | ||
| 1838 | for (;;) | ||
| 1839 | { | ||
| 1840 | std::string line; | ||
| 1841 | if (!getline(wnmmfile, line)) | ||
| 1842 | { | ||
| 1843 | break; | ||
| 1844 | } | ||
| 1845 | 783 | ||
| 1846 | if (line.back() == '\r') | 784 | void generator::readWordNetCausality() |
| 785 | { | ||
| 786 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl")); | ||
| 787 | progress ppgs("Writing causality...", lines.size()); | ||
| 788 | for (auto line : lines) | ||
| 1847 | { | 789 | { |
| 1848 | line.pop_back(); | 790 | ppgs.update(); |
| 1849 | } | ||
| 1850 | 791 | ||
| 1851 | lines.push_back(line); | 792 | std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\."); |
| 793 | std::smatch relation_data; | ||
| 794 | if (!std::regex_search(line, relation_data, relation)) | ||
| 795 | { | ||
| 796 | continue; | ||
| 797 | } | ||
| 798 | |||
| 799 | int lookup1 = std::stoi(relation_data[1]); | ||
| 800 | int lookup2 = std::stoi(relation_data[2]); | ||
| 801 | |||
| 802 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
| 803 | { | ||
| 804 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
| 805 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
| 806 | |||
| 807 | std::list<field> fields; | ||
| 808 | fields.emplace_back("effect_id", notion1.getId()); | ||
| 809 | fields.emplace_back("cause_id", notion2.getId()); | ||
| 810 | |||
| 811 | db_.insertIntoTable("causality", std::move(fields)); | ||
| 812 | } | ||
| 813 | } | ||
| 1852 | } | 814 | } |
| 1853 | 815 | ||
| 1854 | progress ppgs("Writing member meronyms...", lines.size()); | 816 | void generator::readWordNetEntailment() |
| 1855 | for (auto line : lines) | ||
| 1856 | { | 817 | { |
| 1857 | ppgs.update(); | 818 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl")); |
| 1858 | 819 | progress ppgs("Writing entailment...", lines.size()); | |
| 1859 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); | 820 | for (auto line : lines) |
| 1860 | std::smatch relation_data; | ||
| 1861 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1862 | { | 821 | { |
| 1863 | continue; | 822 | ppgs.update(); |
| 1864 | } | ||
| 1865 | 823 | ||
| 1866 | int synset_id_1 = stoi(relation_data[1]); | 824 | std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\."); |
| 1867 | int synset_id_2 = stoi(relation_data[2]); | 825 | std::smatch relation_data; |
| 1868 | std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | 826 | if (!std::regex_search(line, relation_data, relation)) |
| 1869 | |||
| 1870 | for (auto mapping1 : wn[synset_id_1]) | ||
| 1871 | { | ||
| 1872 | for (auto mapping2 : wn[synset_id_2]) | ||
| 1873 | { | 827 | { |
| 1874 | sqlite3_stmt* ppstmt; | 828 | continue; |
| 1875 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 829 | } |
| 1876 | { | 830 | |
| 1877 | db_error(ppdb, query); | 831 | int lookup1 = std::stoi(relation_data[1]); |
| 1878 | } | 832 | int lookup2 = std::stoi(relation_data[2]); |
| 1879 | 833 | ||
| 1880 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 834 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) |
| 1881 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 835 | { |
| 836 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
| 837 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
| 1882 | 838 | ||
| 1883 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 839 | std::list<field> fields; |
| 1884 | { | 840 | fields.emplace_back("given_id", notion1.getId()); |
| 1885 | db_error(ppdb, query); | 841 | fields.emplace_back("entailment_id", notion2.getId()); |
| 1886 | } | ||
| 1887 | 842 | ||
| 1888 | sqlite3_finalize(ppstmt); | 843 | db_.insertIntoTable("entailment", std::move(fields)); |
| 1889 | } | 844 | } |
| 1890 | } | 845 | } |
| 1891 | } | 846 | } |
| 1892 | } | 847 | |
| 1893 | 848 | void generator::readWordNetHypernymy() | |
| 1894 | // ms table | ||
| 1895 | { | ||
| 1896 | std::ifstream wnmsfile(wnpref + "wn_ms.pl"); | ||
| 1897 | if (!wnmsfile.is_open()) | ||
| 1898 | { | ||
| 1899 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 1900 | print_usage(); | ||
| 1901 | } | ||
| 1902 | |||
| 1903 | std::list<std::string> lines; | ||
| 1904 | for (;;) | ||
| 1905 | { | 849 | { |
| 1906 | std::string line; | 850 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl")); |
| 1907 | if (!getline(wnmsfile, line)) | 851 | progress ppgs("Writing hypernymy...", lines.size()); |
| 852 | for (auto line : lines) | ||
| 1908 | { | 853 | { |
| 1909 | break; | 854 | ppgs.update(); |
| 855 | |||
| 856 | std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\."); | ||
| 857 | std::smatch relation_data; | ||
| 858 | if (!std::regex_search(line, relation_data, relation)) | ||
| 859 | { | ||
| 860 | continue; | ||
| 861 | } | ||
| 862 | |||
| 863 | int lookup1 = std::stoi(relation_data[1]); | ||
| 864 | int lookup2 = std::stoi(relation_data[2]); | ||
| 865 | |||
| 866 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
| 867 | { | ||
| 868 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
| 869 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
| 870 | |||
| 871 | std::list<field> fields; | ||
| 872 | fields.emplace_back("hyponym_id", notion1.getId()); | ||
| 873 | fields.emplace_back("hypernym_id", notion2.getId()); | ||
| 874 | |||
| 875 | db_.insertIntoTable("hypernymy", std::move(fields)); | ||
| 876 | } | ||
| 1910 | } | 877 | } |
| 878 | } | ||
| 1911 | 879 | ||
| 1912 | if (line.back() == '\r') | 880 | void generator::readWordNetInstantiation() |
| 881 | { | ||
| 882 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl")); | ||
| 883 | progress ppgs("Writing instantiation...", lines.size()); | ||
| 884 | for (auto line : lines) | ||
| 1913 | { | 885 | { |
| 1914 | line.pop_back(); | 886 | ppgs.update(); |
| 1915 | } | ||
| 1916 | 887 | ||
| 1917 | lines.push_back(line); | 888 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); |
| 889 | std::smatch relation_data; | ||
| 890 | if (!std::regex_search(line, relation_data, relation)) | ||
| 891 | { | ||
| 892 | continue; | ||
| 893 | } | ||
| 894 | |||
| 895 | int lookup1 = std::stoi(relation_data[1]); | ||
| 896 | int lookup2 = std::stoi(relation_data[2]); | ||
| 897 | |||
| 898 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
| 899 | { | ||
| 900 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
| 901 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
| 902 | |||
| 903 | std::list<field> fields; | ||
| 904 | fields.emplace_back("instance_id", notion1.getId()); | ||
| 905 | fields.emplace_back("class_id", notion2.getId()); | ||
| 906 | |||
| 907 | db_.insertIntoTable("instantiation", std::move(fields)); | ||
| 908 | } | ||
| 909 | } | ||
| 1918 | } | 910 | } |
| 1919 | 911 | ||
| 1920 | progress ppgs("Writing substance meronyms...", lines.size()); | 912 | void generator::readWordNetMemberMeronymy() |
| 1921 | for (auto line : lines) | ||
| 1922 | { | 913 | { |
| 1923 | ppgs.update(); | 914 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl")); |
| 1924 | 915 | progress ppgs("Writing member meronymy...", lines.size()); | |
| 1925 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); | 916 | for (auto line : lines) |
| 1926 | std::smatch relation_data; | ||
| 1927 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1928 | { | 917 | { |
| 1929 | continue; | 918 | ppgs.update(); |
| 1930 | } | ||
| 1931 | |||
| 1932 | int synset_id_1 = stoi(relation_data[1]); | ||
| 1933 | int synset_id_2 = stoi(relation_data[2]); | ||
| 1934 | std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
| 1935 | 919 | ||
| 1936 | for (auto mapping1 : wn[synset_id_1]) | 920 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); |
| 1937 | { | 921 | std::smatch relation_data; |
| 1938 | for (auto mapping2 : wn[synset_id_2]) | 922 | if (!std::regex_search(line, relation_data, relation)) |
| 1939 | { | 923 | { |
| 1940 | sqlite3_stmt* ppstmt; | 924 | continue; |
| 1941 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 925 | } |
| 1942 | { | 926 | |
| 1943 | db_error(ppdb, query); | 927 | int lookup1 = std::stoi(relation_data[1]); |
| 1944 | } | 928 | int lookup2 = std::stoi(relation_data[2]); |
| 929 | |||
| 930 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
| 931 | { | ||
| 932 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
| 933 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
| 1945 | 934 | ||
| 1946 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 935 | std::list<field> fields; |
| 1947 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 936 | fields.emplace_back("holonym_id", notion1.getId()); |
| 937 | fields.emplace_back("meronym_id", notion2.getId()); | ||
| 1948 | 938 | ||
| 1949 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 939 | db_.insertIntoTable("member_meronymy", std::move(fields)); |
| 1950 | { | ||
| 1951 | db_error(ppdb, query); | ||
| 1952 | } | ||
| 1953 | |||
| 1954 | sqlite3_finalize(ppstmt); | ||
| 1955 | } | 940 | } |
| 1956 | } | 941 | } |
| 1957 | } | 942 | } |
| 1958 | } | 943 | |
| 1959 | 944 | void generator::readWordNetPartMeronymy() | |
| 1960 | // mm table | ||
| 1961 | { | ||
| 1962 | std::ifstream wnmpfile(wnpref + "wn_mp.pl"); | ||
| 1963 | if (!wnmpfile.is_open()) | ||
| 1964 | { | ||
| 1965 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 1966 | print_usage(); | ||
| 1967 | } | ||
| 1968 | |||
| 1969 | std::list<std::string> lines; | ||
| 1970 | for (;;) | ||
| 1971 | { | 945 | { |
| 1972 | std::string line; | 946 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl")); |
| 1973 | if (!getline(wnmpfile, line)) | 947 | progress ppgs("Writing part meronymy...", lines.size()); |
| 948 | for (auto line : lines) | ||
| 1974 | { | 949 | { |
| 1975 | break; | 950 | ppgs.update(); |
| 951 | |||
| 952 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | ||
| 953 | std::smatch relation_data; | ||
| 954 | if (!std::regex_search(line, relation_data, relation)) | ||
| 955 | { | ||
| 956 | continue; | ||
| 957 | } | ||
| 958 | |||
| 959 | int lookup1 = std::stoi(relation_data[1]); | ||
| 960 | int lookup2 = std::stoi(relation_data[2]); | ||
| 961 | |||
| 962 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
| 963 | { | ||
| 964 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
| 965 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
| 966 | |||
| 967 | std::list<field> fields; | ||
| 968 | fields.emplace_back("holonym_id", notion1.getId()); | ||
| 969 | fields.emplace_back("meronym_id", notion2.getId()); | ||
| 970 | |||
| 971 | db_.insertIntoTable("part_meronymy", std::move(fields)); | ||
| 972 | } | ||
| 1976 | } | 973 | } |
| 974 | } | ||
| 1977 | 975 | ||
| 1978 | if (line.back() == '\r') | 976 | void generator::readWordNetSubstanceMeronymy() |
| 977 | { | ||
| 978 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl")); | ||
| 979 | progress ppgs("Writing substance meronymy...", lines.size()); | ||
| 980 | for (auto line : lines) | ||
| 1979 | { | 981 | { |
| 1980 | line.pop_back(); | 982 | ppgs.update(); |
| 1981 | } | ||
| 1982 | 983 | ||
| 1983 | lines.push_back(line); | 984 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); |
| 985 | std::smatch relation_data; | ||
| 986 | if (!std::regex_search(line, relation_data, relation)) | ||
| 987 | { | ||
| 988 | continue; | ||
| 989 | } | ||
| 990 | |||
| 991 | int lookup1 = std::stoi(relation_data[1]); | ||
| 992 | int lookup2 = std::stoi(relation_data[2]); | ||
| 993 | |||
| 994 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
| 995 | { | ||
| 996 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
| 997 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
| 998 | |||
| 999 | std::list<field> fields; | ||
| 1000 | fields.emplace_back("holonym_id", notion1.getId()); | ||
| 1001 | fields.emplace_back("meronym_id", notion2.getId()); | ||
| 1002 | |||
| 1003 | db_.insertIntoTable("substance_meronymy", std::move(fields)); | ||
| 1004 | } | ||
| 1005 | } | ||
| 1984 | } | 1006 | } |
| 1985 | 1007 | ||
| 1986 | progress ppgs("Writing part meronyms...", lines.size()); | 1008 | void generator::readWordNetPertainymy() |
| 1987 | for (auto line : lines) | ||
| 1988 | { | 1009 | { |
| 1989 | ppgs.update(); | 1010 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); |
| 1990 | 1011 | progress ppgs("Writing pertainymy and mannernymy...", lines.size()); | |
| 1991 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | 1012 | for (auto line : lines) |
| 1992 | std::smatch relation_data; | ||
| 1993 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1994 | { | 1013 | { |
| 1995 | continue; | 1014 | ppgs.update(); |
| 1996 | } | ||
| 1997 | |||
| 1998 | int synset_id_1 = stoi(relation_data[1]); | ||
| 1999 | int synset_id_2 = stoi(relation_data[2]); | ||
| 2000 | std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
| 2001 | 1015 | ||
| 2002 | for (auto mapping1 : wn[synset_id_1]) | 1016 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); |
| 2003 | { | 1017 | std::smatch relation_data; |
| 2004 | for (auto mapping2 : wn[synset_id_2]) | 1018 | if (!std::regex_search(line, relation_data, relation)) |
| 2005 | { | 1019 | { |
| 2006 | sqlite3_stmt* ppstmt; | 1020 | continue; |
| 2007 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 1021 | } |
| 2008 | { | 1022 | |
| 2009 | db_error(ppdb, query); | 1023 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); |
| 2010 | } | 1024 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); |
| 1025 | |||
| 1026 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | ||
| 1027 | { | ||
| 1028 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
| 1029 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
| 2011 | 1030 | ||
| 2012 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 1031 | if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) |
| 2013 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 1032 | { |
| 1033 | std::list<field> fields; | ||
| 1034 | fields.emplace_back("pertainym_id", word1.getId()); | ||
| 1035 | fields.emplace_back("noun_id", word2.getId()); | ||
| 2014 | 1036 | ||
| 2015 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1037 | db_.insertIntoTable("pertainymy", std::move(fields)); |
| 1038 | } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) | ||
| 2016 | { | 1039 | { |
| 2017 | db_error(ppdb, query); | 1040 | std::list<field> fields; |
| 2018 | } | 1041 | fields.emplace_back("mannernym_id", word1.getId()); |
| 1042 | fields.emplace_back("adjective_id", word2.getId()); | ||
| 2019 | 1043 | ||
| 2020 | sqlite3_finalize(ppstmt); | 1044 | db_.insertIntoTable("mannernymy", std::move(fields)); |
| 1045 | } | ||
| 2021 | } | 1046 | } |
| 2022 | } | 1047 | } |
| 2023 | } | 1048 | } |
| 2024 | } | ||
| 2025 | |||
| 2026 | // per table | ||
| 2027 | { | ||
| 2028 | std::ifstream wnperfile(wnpref + "wn_per.pl"); | ||
| 2029 | if (!wnperfile.is_open()) | ||
| 2030 | { | ||
| 2031 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 2032 | print_usage(); | ||
| 2033 | } | ||
| 2034 | |||
| 2035 | std::list<std::string> lines; | ||
| 2036 | for (;;) | ||
| 2037 | { | ||
| 2038 | std::string line; | ||
| 2039 | if (!getline(wnperfile, line)) | ||
| 2040 | { | ||
| 2041 | break; | ||
| 2042 | } | ||
| 2043 | 1049 | ||
| 2044 | if (line.back() == '\r') | 1050 | void generator::readWordNetSpecification() |
| 1051 | { | ||
| 1052 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl")); | ||
| 1053 | progress ppgs("Writing specifications...", lines.size()); | ||
| 1054 | for (auto line : lines) | ||
| 2045 | { | 1055 | { |
| 2046 | line.pop_back(); | 1056 | ppgs.update(); |
| 1057 | |||
| 1058 | std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\."); | ||
| 1059 | std::smatch relation_data; | ||
| 1060 | if (!std::regex_search(line, relation_data, relation)) | ||
| 1061 | { | ||
| 1062 | continue; | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | ||
| 1066 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | ||
| 1067 | |||
| 1068 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | ||
| 1069 | { | ||
| 1070 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
| 1071 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
| 1072 | |||
| 1073 | std::list<field> fields; | ||
| 1074 | fields.emplace_back("general_id", word1.getId()); | ||
| 1075 | fields.emplace_back("specific_id", word2.getId()); | ||
| 1076 | |||
| 1077 | db_.insertIntoTable("specification", std::move(fields)); | ||
| 1078 | } | ||
| 2047 | } | 1079 | } |
| 2048 | |||
| 2049 | lines.push_back(line); | ||
| 2050 | } | 1080 | } |
| 2051 | 1081 | ||
| 2052 | progress ppgs("Writing pertainyms and mannernyms...", lines.size()); | 1082 | void generator::readWordNetSimilarity() |
| 2053 | for (auto line : lines) | ||
| 2054 | { | 1083 | { |
| 2055 | ppgs.update(); | 1084 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl")); |
| 2056 | 1085 | progress ppgs("Writing adjective similarity...", lines.size()); | |
| 2057 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); | 1086 | for (auto line : lines) |
| 2058 | std::smatch relation_data; | ||
| 2059 | if (!std::regex_search(line, relation_data, relation)) | ||
| 2060 | { | 1087 | { |
| 2061 | continue; | 1088 | ppgs.update(); |
| 2062 | } | ||
| 2063 | 1089 | ||
| 2064 | int synset_id_1 = stoi(relation_data[1]); | 1090 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); |
| 2065 | int wnum_1 = stoi(relation_data[2]); | 1091 | std::smatch relation_data; |
| 2066 | int synset_id_2 = stoi(relation_data[3]); | 1092 | if (!std::regex_search(line, relation_data, relation)) |
| 2067 | int wnum_2 = stoi(relation_data[4]); | ||
| 2068 | std::string query; | ||
| 2069 | switch (synset_id_1 / 100000000) | ||
| 2070 | { | ||
| 2071 | case 3: // Adjective | ||
| 2072 | { | 1093 | { |
| 2073 | // This is a pertainym, the second word should be a noun | 1094 | continue; |
| 2074 | // Technically it can be an adjective but we're ignoring that | ||
| 2075 | if (synset_id_2 / 100000000 != 1) | ||
| 2076 | { | ||
| 2077 | continue; | ||
| 2078 | } | ||
| 2079 | |||
| 2080 | query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; | ||
| 2081 | |||
| 2082 | break; | ||
| 2083 | } | 1095 | } |
| 1096 | |||
| 1097 | int lookup1 = std::stoi(relation_data[1]); | ||
| 1098 | int lookup2 = std::stoi(relation_data[2]); | ||
| 2084 | 1099 | ||
| 2085 | case 4: // Adverb | 1100 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) |
| 2086 | { | 1101 | { |
| 2087 | // This is a mannernym, the second word should be an adjective | 1102 | notion& notion1 = *notionByWnid_.at(lookup1); |
| 2088 | if (synset_id_2 / 100000000 != 3) | 1103 | notion& notion2 = *notionByWnid_.at(lookup2); |
| 2089 | { | ||
| 2090 | continue; | ||
| 2091 | } | ||
| 2092 | 1104 | ||
| 2093 | query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; | 1105 | std::list<field> fields; |
| 1106 | fields.emplace_back("adjective_1_id", notion1.getId()); | ||
| 1107 | fields.emplace_back("adjective_2_id", notion2.getId()); | ||
| 2094 | 1108 | ||
| 2095 | break; | 1109 | db_.insertIntoTable("similarity", std::move(fields)); |
| 2096 | } | 1110 | } |
| 2097 | } | 1111 | } |
| 2098 | 1112 | } | |
| 2099 | sqlite3_stmt* ppstmt; | ||
| 2100 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 2101 | { | ||
| 2102 | db_error(ppdb, query); | ||
| 2103 | } | ||
| 2104 | |||
| 2105 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
| 2106 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
| 2107 | 1113 | ||
| 2108 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1114 | std::list<std::string> generator::readFile(std::string path) |
| 1115 | { | ||
| 1116 | std::ifstream file(path); | ||
| 1117 | if (!file) | ||
| 2109 | { | 1118 | { |
| 2110 | db_error(ppdb, query); | 1119 | throw std::invalid_argument("Could not find file " + path); |
| 2111 | } | 1120 | } |
| 2112 | |||
| 2113 | sqlite3_finalize(ppstmt); | ||
| 2114 | } | ||
| 2115 | } | ||
| 2116 | 1121 | ||
| 2117 | // sa table | 1122 | std::list<std::string> lines; |
| 2118 | { | ||
| 2119 | std::ifstream wnsafile(wnpref + "wn_sa.pl"); | ||
| 2120 | if (!wnsafile.is_open()) | ||
| 2121 | { | ||
| 2122 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 2123 | print_usage(); | ||
| 2124 | } | ||
| 2125 | |||
| 2126 | std::list<std::string> lines; | ||
| 2127 | for (;;) | ||
| 2128 | { | ||
| 2129 | std::string line; | 1123 | std::string line; |
| 2130 | if (!getline(wnsafile, line)) | 1124 | while (std::getline(file, line)) |
| 2131 | { | ||
| 2132 | break; | ||
| 2133 | } | ||
| 2134 | |||
| 2135 | if (line.back() == '\r') | ||
| 2136 | { | 1125 | { |
| 2137 | line.pop_back(); | 1126 | if (line.back() == '\r') |
| 1127 | { | ||
| 1128 | line.pop_back(); | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | lines.push_back(line); | ||
| 2138 | } | 1132 | } |
| 2139 | 1133 | ||
| 2140 | lines.push_back(line); | 1134 | return lines; |
| 2141 | } | 1135 | } |
| 2142 | 1136 | ||
| 2143 | progress ppgs("Writing specifications...", lines.size()); | 1137 | part_of_speech generator::partOfSpeechByWnid(int wnid) |
| 2144 | for (auto line : lines) | ||
| 2145 | { | 1138 | { |
| 2146 | ppgs.update(); | 1139 | switch (wnid / 100000000) |
| 2147 | |||
| 2148 | std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); | ||
| 2149 | std::smatch relation_data; | ||
| 2150 | if (!std::regex_search(line, relation_data, relation)) | ||
| 2151 | { | ||
| 2152 | continue; | ||
| 2153 | } | ||
| 2154 | |||
| 2155 | int synset_id_1 = stoi(relation_data[1]); | ||
| 2156 | int wnum_1 = stoi(relation_data[2]); | ||
| 2157 | int synset_id_2 = stoi(relation_data[3]); | ||
| 2158 | int wnum_2 = stoi(relation_data[4]); | ||
| 2159 | std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); | ||
| 2160 | |||
| 2161 | sqlite3_stmt* ppstmt; | ||
| 2162 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
| 2163 | { | 1140 | { |
| 2164 | db_error(ppdb, query); | 1141 | case 1: return part_of_speech::noun; |
| 1142 | case 2: return part_of_speech::verb; | ||
| 1143 | case 3: return part_of_speech::adjective; | ||
| 1144 | case 4: return part_of_speech::adverb; | ||
| 1145 | default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); | ||
| 2165 | } | 1146 | } |
| 1147 | } | ||
| 2166 | 1148 | ||
| 2167 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | 1149 | notion& generator::createNotion(part_of_speech partOfSpeech) |
| 2168 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | 1150 | { |
| 1151 | notions_.emplace_back(partOfSpeech); | ||
| 1152 | |||
| 1153 | return notions_.back(); | ||
| 1154 | } | ||
| 2169 | 1155 | ||
| 2170 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1156 | notion& generator::lookupOrCreateNotion(int wnid) |
| 1157 | { | ||
| 1158 | if (!notionByWnid_.count(wnid)) | ||
| 2171 | { | 1159 | { |
| 2172 | db_error(ppdb, query); | 1160 | notions_.emplace_back(partOfSpeechByWnid(wnid), wnid); |
| 1161 | notionByWnid_[wnid] = ¬ions_.back(); | ||
| 2173 | } | 1162 | } |
| 2174 | 1163 | ||
| 2175 | sqlite3_finalize(ppstmt); | 1164 | return *notionByWnid_.at(wnid); |
| 2176 | } | ||
| 2177 | } | ||
| 2178 | |||
| 2179 | // sim table | ||
| 2180 | { | ||
| 2181 | std::ifstream wnsimfile(wnpref + "wn_sim.pl"); | ||
| 2182 | if (!wnsimfile.is_open()) | ||
| 2183 | { | ||
| 2184 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 2185 | print_usage(); | ||
| 2186 | } | 1165 | } |
| 2187 | 1166 | ||
| 2188 | std::list<std::string> lines; | 1167 | lemma& generator::lookupOrCreateLemma(std::string base_form) |
| 2189 | for (;;) | ||
| 2190 | { | 1168 | { |
| 2191 | std::string line; | 1169 | if (!lemmaByBaseForm_.count(base_form)) |
| 2192 | if (!getline(wnsimfile, line)) | ||
| 2193 | { | 1170 | { |
| 2194 | break; | 1171 | lemmas_.emplace_back(lookupOrCreateForm(base_form)); |
| 1172 | lemmaByBaseForm_[base_form] = &lemmas_.back(); | ||
| 2195 | } | 1173 | } |
| 1174 | |||
| 1175 | return *lemmaByBaseForm_.at(base_form); | ||
| 1176 | } | ||
| 2196 | 1177 | ||
| 2197 | if (line.back() == '\r') | 1178 | form& generator::lookupOrCreateForm(std::string text) |
| 1179 | { | ||
| 1180 | if (!formByText_.count(text)) | ||
| 2198 | { | 1181 | { |
| 2199 | line.pop_back(); | 1182 | forms_.emplace_back(text); |
| 1183 | formByText_[text] = &forms_.back(); | ||
| 2200 | } | 1184 | } |
| 2201 | 1185 | ||
| 2202 | lines.push_back(line); | 1186 | return *formByText_[text]; |
| 2203 | } | 1187 | } |
| 2204 | 1188 | ||
| 2205 | progress ppgs("Writing sense synonyms...", lines.size()); | 1189 | template <typename... Args> word& generator::createWord(Args&&... args) |
| 2206 | for (auto line : lines) | ||
| 2207 | { | 1190 | { |
| 2208 | ppgs.update(); | 1191 | words_.emplace_back(std::forward<Args>(args)...); |
| 1192 | word& w = words_.back(); | ||
| 2209 | 1193 | ||
| 2210 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); | 1194 | wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w); |
| 2211 | std::smatch relation_data; | 1195 | |
| 2212 | if (!std::regex_search(line, relation_data, relation)) | 1196 | if (w.getNotion().hasWnid()) |
| 2213 | { | 1197 | { |
| 2214 | continue; | 1198 | wordsByWnid_[w.getNotion().getWnid()].insert(&w); |
| 2215 | } | 1199 | } |
| 2216 | 1200 | ||
| 2217 | int synset_id_1 = stoi(relation_data[1]); | 1201 | return w; |
| 2218 | int synset_id_2 = stoi(relation_data[2]); | 1202 | } |
| 2219 | std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); | 1203 | |
| 1204 | group& generator::createGroup(xmlNodePtr top) | ||
| 1205 | { | ||
| 1206 | groups_.emplace_back(); | ||
| 1207 | group& grp = groups_.back(); | ||
| 2220 | 1208 | ||
| 2221 | for (auto mapping1 : wn[synset_id_1]) | 1209 | xmlChar* key; |
| 1210 | |||
| 1211 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
| 2222 | { | 1212 | { |
| 2223 | for (auto mapping2 : wn[synset_id_2]) | 1213 | if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES"))) |
| 2224 | { | 1214 | { |
| 2225 | sqlite3_stmt* ppstmt; | 1215 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) |
| 2226 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
| 2227 | { | 1216 | { |
| 2228 | db_error(ppdb, query); | 1217 | if (!xmlStrcmp(subclass->name, reinterpret_cast<const xmlChar*>("VNSUBCLASS"))) |
| 1218 | { | ||
| 1219 | try | ||
| 1220 | { | ||
| 1221 | group& subgrp = createGroup(subclass); | ||
| 1222 | subgrp.setParent(grp); | ||
| 1223 | } catch (const std::exception& e) | ||
| 1224 | { | ||
| 1225 | key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID")); | ||
| 1226 | |||
| 1227 | if (key == nullptr) | ||
| 1228 | { | ||
| 1229 | std::throw_with_nested(std::logic_error("Error parsing IDless subgroup")); | ||
| 1230 | } else { | ||
| 1231 | std::string subgroupId(reinterpret_cast<const char*>(key)); | ||
| 1232 | xmlFree(key); | ||
| 1233 | |||
| 1234 | std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId)); | ||
| 1235 | } | ||
| 1236 | } | ||
| 1237 | } | ||
| 2229 | } | 1238 | } |
| 2230 | 1239 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("MEMBERS"))) | |
| 2231 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 1240 | { |
| 2232 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 1241 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) |
| 2233 | |||
| 2234 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 2235 | { | 1242 | { |
| 2236 | db_error(ppdb, query); | 1243 | if (!xmlStrcmp(member->name, reinterpret_cast<const xmlChar*>("MEMBER"))) |
| 1244 | { | ||
| 1245 | key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn")); | ||
| 1246 | std::string wnSenses(reinterpret_cast<const char*>(key)); | ||
| 1247 | xmlFree(key); | ||
| 1248 | |||
| 1249 | auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " "); | ||
| 1250 | if (!wnSenseKeys.empty()) | ||
| 1251 | { | ||
| 1252 | std::list<std::string> tempKeys; | ||
| 1253 | |||
| 1254 | std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { | ||
| 1255 | return sense + "::"; | ||
| 1256 | }); | ||
| 1257 | |||
| 1258 | std::list<std::string> filteredKeys; | ||
| 1259 | |||
| 1260 | std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { | ||
| 1261 | return !wnSenseKeys_.count(sense); | ||
| 1262 | }); | ||
| 1263 | |||
| 1264 | wnSenseKeys = std::move(filteredKeys); | ||
| 1265 | } | ||
| 1266 | |||
| 1267 | if (!wnSenseKeys.empty()) | ||
| 1268 | { | ||
| 1269 | for (std::string sense : wnSenseKeys) | ||
| 1270 | { | ||
| 1271 | word& wordSense = *wnSenseKeys_[sense]; | ||
| 1272 | wordSense.setVerbGroup(grp); | ||
| 1273 | } | ||
| 1274 | } else { | ||
| 1275 | key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name")); | ||
| 1276 | std::string memberName(reinterpret_cast<const char*>(key)); | ||
| 1277 | xmlFree(key); | ||
| 1278 | |||
| 1279 | notion& n = createNotion(part_of_speech::verb); | ||
| 1280 | lemma& l = lookupOrCreateLemma(memberName); | ||
| 1281 | word& w = createWord(n, l); | ||
| 1282 | |||
| 1283 | w.setVerbGroup(grp); | ||
| 1284 | } | ||
| 1285 | } | ||
| 2237 | } | 1286 | } |
| 2238 | 1287 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("THEMROLES"))) | |
| 2239 | sqlite3_reset(ppstmt); | 1288 | { |
| 2240 | sqlite3_clear_bindings(ppstmt); | 1289 | for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next) |
| 2241 | |||
| 2242 | sqlite3_bind_int(ppstmt, 1, mapping2.second); | ||
| 2243 | sqlite3_bind_int(ppstmt, 2, mapping1.second); | ||
| 2244 | |||
| 2245 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
| 2246 | { | 1290 | { |
| 2247 | db_error(ppdb, query); | 1291 | if (!xmlStrcmp(roletopnode->name, reinterpret_cast<const xmlChar*>("THEMROLE"))) |
| 1292 | { | ||
| 1293 | role r; | ||
| 1294 | |||
| 1295 | key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type")); | ||
| 1296 | std::string roleName = reinterpret_cast<const char*>(key); | ||
| 1297 | xmlFree(key); | ||
| 1298 | |||
| 1299 | for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
| 1300 | { | ||
| 1301 | if (!xmlStrcmp(rolenode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
| 1302 | { | ||
| 1303 | r.setSelrestrs(parseSelrestr(rolenode)); | ||
| 1304 | } | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | grp.addRole(roleName, std::move(r)); | ||
| 1308 | } | ||
| 2248 | } | 1309 | } |
| 1310 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("FRAMES"))) | ||
| 1311 | { | ||
| 1312 | for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next) | ||
| 1313 | { | ||
| 1314 | if (!xmlStrcmp(frametopnode->name, reinterpret_cast<const xmlChar*>("FRAME"))) | ||
| 1315 | { | ||
| 1316 | frames_.emplace_back(); | ||
| 1317 | frame& fr = frames_.back(); | ||
| 2249 | 1318 | ||
| 2250 | sqlite3_finalize(ppstmt); | 1319 | for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) |
| 1320 | { | ||
| 1321 | if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX"))) | ||
| 1322 | { | ||
| 1323 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
| 1324 | { | ||
| 1325 | if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP"))) | ||
| 1326 | { | ||
| 1327 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
| 1328 | std::string partRole = reinterpret_cast<const char*>(key); | ||
| 1329 | xmlFree(key); | ||
| 1330 | |||
| 1331 | selrestr partSelrestrs; | ||
| 1332 | std::set<std::string> partSynrestrs; | ||
| 1333 | |||
| 1334 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
| 1335 | { | ||
| 1336 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SYNRESTRS"))) | ||
| 1337 | { | ||
| 1338 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
| 1339 | { | ||
| 1340 | if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SYNRESTR"))) | ||
| 1341 | { | ||
| 1342 | key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type")); | ||
| 1343 | partSynrestrs.insert(reinterpret_cast<const char*>(key)); | ||
| 1344 | xmlFree(key); | ||
| 1345 | } | ||
| 1346 | } | ||
| 1347 | } | ||
| 1348 | |||
| 1349 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
| 1350 | { | ||
| 1351 | partSelrestrs = parseSelrestr(npnode); | ||
| 1352 | } | ||
| 1353 | } | ||
| 1354 | |||
| 1355 | fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs))); | ||
| 1356 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB"))) | ||
| 1357 | { | ||
| 1358 | fr.push_back(part::createVerb()); | ||
| 1359 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("PREP"))) | ||
| 1360 | { | ||
| 1361 | std::set<std::string> partChoices; | ||
| 1362 | bool partLiteral; | ||
| 1363 | |||
| 1364 | if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"))) | ||
| 1365 | { | ||
| 1366 | partLiteral = true; | ||
| 1367 | |||
| 1368 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
| 1369 | std::string choicesStr = reinterpret_cast<const char*>(key); | ||
| 1370 | xmlFree(key); | ||
| 1371 | |||
| 1372 | split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices))); | ||
| 1373 | } else { | ||
| 1374 | partLiteral = false; | ||
| 1375 | |||
| 1376 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
| 1377 | { | ||
| 1378 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
| 1379 | { | ||
| 1380 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
| 1381 | { | ||
| 1382 | if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
| 1383 | { | ||
| 1384 | key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type")); | ||
| 1385 | partChoices.insert(reinterpret_cast<const char*>(key)); | ||
| 1386 | xmlFree(key); | ||
| 1387 | } | ||
| 1388 | } | ||
| 1389 | } | ||
| 1390 | } | ||
| 1391 | } | ||
| 1392 | |||
| 1393 | fr.push_back(part::createPreposition(std::move(partChoices), partLiteral)); | ||
| 1394 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ"))) | ||
| 1395 | { | ||
| 1396 | fr.push_back(part::createAdjective()); | ||
| 1397 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADV"))) | ||
| 1398 | { | ||
| 1399 | fr.push_back(part::createAdverb()); | ||
| 1400 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("LEX"))) | ||
| 1401 | { | ||
| 1402 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
| 1403 | std::string literalValue = reinterpret_cast<const char*>(key); | ||
| 1404 | xmlFree(key); | ||
| 1405 | |||
| 1406 | fr.push_back(part::createLiteral(literalValue)); | ||
| 1407 | } else { | ||
| 1408 | continue; | ||
| 1409 | } | ||
| 1410 | } | ||
| 1411 | |||
| 1412 | grp.addFrame(fr); | ||
| 1413 | } | ||
| 1414 | } | ||
| 1415 | } | ||
| 1416 | } | ||
| 2251 | } | 1417 | } |
| 2252 | } | 1418 | } |
| 2253 | } | ||
| 2254 | } | ||
| 2255 | |||
| 2256 | // syntax table | ||
| 2257 | { | ||
| 2258 | std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); | ||
| 2259 | if (!wnsyntaxfile.is_open()) | ||
| 2260 | { | ||
| 2261 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
| 2262 | print_usage(); | ||
| 2263 | } | ||
| 2264 | 1419 | ||
| 2265 | std::list<std::string> lines; | 1420 | return grp; |
| 2266 | for (;;) | ||
| 2267 | { | ||
| 2268 | std::string line; | ||
| 2269 | if (!getline(wnsyntaxfile, line)) | ||
| 2270 | { | ||
| 2271 | break; | ||
| 2272 | } | ||
| 2273 | |||
| 2274 | if (line.back() == '\r') | ||
| 2275 | { | ||
| 2276 | line.pop_back(); | ||
| 2277 | } | ||
| 2278 | |||
| 2279 | lines.push_back(line); | ||
| 2280 | } | 1421 | } |
| 2281 | 1422 | ||
| 2282 | progress ppgs("Writing adjective syntax markers...", lines.size()); | 1423 | selrestr generator::parseSelrestr(xmlNodePtr top) |
| 2283 | for (auto line : lines) | ||
| 2284 | { | 1424 | { |
| 2285 | ppgs.update(); | 1425 | xmlChar* key; |
| 2286 | 1426 | ||
| 2287 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); | 1427 | if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) |
| 2288 | std::smatch relation_data; | ||
| 2289 | if (!std::regex_search(line, relation_data, relation)) | ||
| 2290 | { | ||
| 2291 | continue; | ||
| 2292 | } | ||
| 2293 | |||
| 2294 | int synset_id = stoi(relation_data[1]); | ||
| 2295 | int wnum = stoi(relation_data[2]); | ||
| 2296 | std::string syn = relation_data[3]; | ||
| 2297 | std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); | ||
| 2298 | |||
| 2299 | sqlite3_stmt* ppstmt; | ||
| 2300 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
| 2301 | { | 1428 | { |
| 2302 | db_error(ppdb, query); | 1429 | if (xmlChildElementCount(top) == 0) |
| 2303 | } | 1430 | { |
| 2304 | 1431 | return {}; | |
| 2305 | sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); | 1432 | } else if (xmlChildElementCount(top) == 1) |
| 2306 | sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); | 1433 | { |
| 2307 | 1434 | return parseSelrestr(xmlFirstElementChild(top)); | |
| 2308 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1435 | } else { |
| 1436 | bool orlogic = false; | ||
| 1437 | if (xmlHasProp(top, reinterpret_cast<const xmlChar*>("logic"))) | ||
| 1438 | { | ||
| 1439 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("logic")); | ||
| 1440 | if (!xmlStrcmp(key, reinterpret_cast<const xmlChar*>("or"))) | ||
| 1441 | { | ||
| 1442 | orlogic = true; | ||
| 1443 | } | ||
| 1444 | |||
| 1445 | xmlFree(key); | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | std::list<selrestr> children; | ||
| 1449 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | ||
| 1450 | { | ||
| 1451 | if (!xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTRS")) | ||
| 1452 | || !xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
| 1453 | { | ||
| 1454 | children.push_back(parseSelrestr(selrestr)); | ||
| 1455 | } | ||
| 1456 | } | ||
| 1457 | |||
| 1458 | return selrestr(children, orlogic); | ||
| 1459 | } | ||
| 1460 | } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
| 2309 | { | 1461 | { |
| 2310 | db_error(ppdb, query); | 1462 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("Value")); |
| 1463 | bool selPos = (std::string(reinterpret_cast<const char*>(key)) == "+"); | ||
| 1464 | xmlFree(key); | ||
| 1465 | |||
| 1466 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type")); | ||
| 1467 | std::string selRestriction = reinterpret_cast<const char*>(key); | ||
| 1468 | xmlFree(key); | ||
| 1469 | |||
| 1470 | return selrestr(selRestriction, selPos); | ||
| 1471 | } else { | ||
| 1472 | throw std::logic_error("Badly formatted selrestr"); | ||
| 2311 | } | 1473 | } |
| 2312 | |||
| 2313 | sqlite3_finalize(ppstmt); | ||
| 2314 | } | 1474 | } |
| 2315 | } | 1475 | |
| 2316 | 1476 | }; | |
| 2317 | sqlite3_close_v2(ppdb); | 1477 | }; |
| 2318 | |||
| 2319 | std::cout << "Done." << std::endl; | ||
| 2320 | } | ||
| diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..e2a7404 --- /dev/null +++ b/generator/generator.h | |||
| @@ -0,0 +1,151 @@ | |||
| 1 | #ifndef GENERATOR_H_5B61CBC5 | ||
| 2 | #define GENERATOR_H_5B61CBC5 | ||
| 3 | |||
| 4 | #include <string> | ||
| 5 | #include <map> | ||
| 6 | #include <list> | ||
| 7 | #include <set> | ||
| 8 | #include <libxml/parser.h> | ||
| 9 | #include "database.h" | ||
| 10 | #include "notion.h" | ||
| 11 | #include "word.h" | ||
| 12 | #include "lemma.h" | ||
| 13 | #include "form.h" | ||
| 14 | #include "pronunciation.h" | ||
| 15 | #include "group.h" | ||
| 16 | #include "frame.h" | ||
| 17 | |||
| 18 | namespace verbly { | ||
| 19 | namespace generator { | ||
| 20 | |||
| 21 | enum class part_of_speech; | ||
| 22 | class selrestr; | ||
| 23 | |||
| 24 | class generator { | ||
| 25 | public: | ||
| 26 | |||
| 27 | // Constructor | ||
| 28 | |||
| 29 | generator( | ||
| 30 | std::string verbNetPath, | ||
| 31 | std::string agidPath, | ||
| 32 | std::string wordNetPath, | ||
| 33 | std::string cmudictPath, | ||
| 34 | std::string imageNetPath, | ||
| 35 | std::string outputPath); | ||
| 36 | |||
| 37 | // Action | ||
| 38 | |||
| 39 | void run(); | ||
| 40 | |||
| 41 | private: | ||
| 42 | |||
| 43 | // Subroutines | ||
| 44 | |||
| 45 | void readWordNetSynsets(); | ||
| 46 | |||
| 47 | void readAdjectivePositioning(); | ||
| 48 | |||
| 49 | void readImageNetUrls(); | ||
| 50 | |||
| 51 | void readWordNetSenseKeys(); | ||
| 52 | |||
| 53 | void readVerbNet(); | ||
| 54 | |||
| 55 | void readAgidInflections(); | ||
| 56 | |||
| 57 | void readPrepositions(); | ||
| 58 | |||
| 59 | void readCmudictPronunciations(); | ||
| 60 | |||
| 61 | void writeSchema(); | ||
| 62 | |||
| 63 | void dumpObjects(); | ||
| 64 | |||
| 65 | void readWordNetAntonymy(); | ||
| 66 | |||
| 67 | void readWordNetVariation(); | ||
| 68 | |||
| 69 | void readWordNetClasses(); | ||
| 70 | |||
| 71 | void readWordNetCausality(); | ||
| 72 | |||
| 73 | void readWordNetEntailment(); | ||
| 74 | |||
| 75 | void readWordNetHypernymy(); | ||
| 76 | |||
| 77 | void readWordNetInstantiation(); | ||
| 78 | |||
| 79 | void readWordNetMemberMeronymy(); | ||
| 80 | |||
| 81 | void readWordNetPartMeronymy(); | ||
| 82 | |||
| 83 | void readWordNetSubstanceMeronymy(); | ||
| 84 | |||
| 85 | void readWordNetPertainymy(); | ||
| 86 | |||
| 87 | void readWordNetSpecification(); | ||
| 88 | |||
| 89 | void readWordNetSimilarity(); | ||
| 90 | |||
| 91 | // Helpers | ||
| 92 | |||
| 93 | std::list<std::string> readFile(std::string path); | ||
| 94 | |||
| 95 | inline part_of_speech partOfSpeechByWnid(int wnid); | ||
| 96 | |||
| 97 | notion& createNotion(part_of_speech partOfSpeech); | ||
| 98 | |||
| 99 | notion& lookupOrCreateNotion(int wnid); | ||
| 100 | |||
| 101 | lemma& lookupOrCreateLemma(std::string base_form); | ||
| 102 | |||
| 103 | form& lookupOrCreateForm(std::string text); | ||
| 104 | |||
| 105 | template <typename... Args> word& createWord(Args&&... args); | ||
| 106 | |||
| 107 | group& createGroup(xmlNodePtr top); | ||
| 108 | |||
| 109 | selrestr parseSelrestr(xmlNodePtr top); | ||
| 110 | |||
| 111 | // Input | ||
| 112 | |||
| 113 | std::string verbNetPath_; | ||
| 114 | std::string agidPath_; | ||
| 115 | std::string wordNetPath_; | ||
| 116 | std::string cmudictPath_; | ||
| 117 | std::string imageNetPath_; | ||
| 118 | |||
| 119 | // Output | ||
| 120 | |||
| 121 | database db_; | ||
| 122 | |||
| 123 | // Data | ||
| 124 | |||
| 125 | std::list<notion> notions_; | ||
| 126 | std::list<word> words_; | ||
| 127 | std::list<lemma> lemmas_; | ||
| 128 | std::list<form> forms_; | ||
| 129 | std::list<pronunciation> pronunciations_; | ||
| 130 | std::list<frame> frames_; | ||
| 131 | std::list<group> groups_; | ||
| 132 | |||
| 133 | // Indexes | ||
| 134 | |||
| 135 | std::map<int, notion*> notionByWnid_; | ||
| 136 | std::map<int, std::set<word*>> wordsByWnid_; | ||
| 137 | std::map<std::pair<int, int>, word*> wordByWnidAndWnum_; | ||
| 138 | std::map<std::string, std::set<word*>> wordsByBaseForm_; | ||
| 139 | std::map<std::string, lemma*> lemmaByBaseForm_; | ||
| 140 | std::map<std::string, form*> formByText_; | ||
| 141 | |||
| 142 | // Caches | ||
| 143 | |||
| 144 | std::map<std::string, word*> wnSenseKeys_; | ||
| 145 | |||
| 146 | }; | ||
| 147 | |||
| 148 | }; | ||
| 149 | }; | ||
| 150 | |||
| 151 | #endif /* end of include guard: GENERATOR_H_5B61CBC5 */ | ||
| diff --git a/generator/group.cpp b/generator/group.cpp new file mode 100644 index 0000000..7cbd4c8 --- /dev/null +++ b/generator/group.cpp | |||
| @@ -0,0 +1,119 @@ | |||
| 1 | #include "group.h" | ||
| 2 | #include <stdexcept> | ||
| 3 | #include <list> | ||
| 4 | #include <json.hpp> | ||
| 5 | #include "database.h" | ||
| 6 | #include "field.h" | ||
| 7 | #include "frame.h" | ||
| 8 | |||
| 9 | namespace verbly { | ||
| 10 | namespace generator { | ||
| 11 | |||
| 12 | int group::nextId_ = 0; | ||
| 13 | |||
| 14 | group::group() : id_(nextId_++) | ||
| 15 | { | ||
| 16 | } | ||
| 17 | |||
| 18 | void group::setParent(const group& parent) | ||
| 19 | { | ||
| 20 | // Adding a group to itself is nonsensical. | ||
| 21 | assert(&parent != this); | ||
| 22 | |||
| 23 | parent_ = &parent; | ||
| 24 | } | ||
| 25 | |||
| 26 | void group::addRole(std::string name, role r) | ||
| 27 | { | ||
| 28 | roleNames_.insert(name); | ||
| 29 | roles_[name] = std::move(r); | ||
| 30 | } | ||
| 31 | |||
| 32 | void group::addFrame(const frame& f) | ||
| 33 | { | ||
| 34 | frames_.insert(&f); | ||
| 35 | } | ||
| 36 | |||
| 37 | std::set<std::string> group::getRoles() const | ||
| 38 | { | ||
| 39 | std::set<std::string> fullRoles = roleNames_; | ||
| 40 | |||
| 41 | if (hasParent()) | ||
| 42 | { | ||
| 43 | for (std::string name : getParent().getRoles()) | ||
| 44 | { | ||
| 45 | fullRoles.insert(name); | ||
| 46 | } | ||
| 47 | } | ||
| 48 | |||
| 49 | return fullRoles; | ||
| 50 | } | ||
| 51 | |||
| 52 | const role& group::getRole(std::string name) const | ||
| 53 | { | ||
| 54 | if (roles_.count(name)) | ||
| 55 | { | ||
| 56 | return roles_.at(name); | ||
| 57 | } else if (hasParent()) | ||
| 58 | { | ||
| 59 | return getParent().getRole(name); | ||
| 60 | } else { | ||
| 61 | throw std::invalid_argument("Specified role not found in verb group"); | ||
| 62 | } | ||
| 63 | } | ||
| 64 | |||
| 65 | std::set<const frame*> group::getFrames() const | ||
| 66 | { | ||
| 67 | std::set<const frame*> fullFrames = frames_; | ||
| 68 | |||
| 69 | if (hasParent()) | ||
| 70 | { | ||
| 71 | for (const frame* f : getParent().getFrames()) | ||
| 72 | { | ||
| 73 | fullFrames.insert(f); | ||
| 74 | } | ||
| 75 | } | ||
| 76 | |||
| 77 | return fullFrames; | ||
| 78 | } | ||
| 79 | |||
| 80 | database& operator<<(database& db, const group& arg) | ||
| 81 | { | ||
| 82 | // Serialize the group first | ||
| 83 | { | ||
| 84 | std::list<field> fields; | ||
| 85 | fields.emplace_back("group_id", arg.getId()); | ||
| 86 | |||
| 87 | nlohmann::json jsonRoles; | ||
| 88 | for (std::string name : arg.getRoles()) | ||
| 89 | { | ||
| 90 | const role& r = arg.getRole(name); | ||
| 91 | |||
| 92 | nlohmann::json jsonRole; | ||
| 93 | jsonRole["type"] = name; | ||
| 94 | jsonRole["selrestrs"] = r.getSelrestrs().toJson(); | ||
| 95 | |||
| 96 | jsonRoles.emplace_back(std::move(jsonRole)); | ||
| 97 | } | ||
| 98 | |||
| 99 | fields.emplace_back("data", jsonRoles.dump()); | ||
| 100 | |||
| 101 | db.insertIntoTable("groups", std::move(fields)); | ||
| 102 | } | ||
| 103 | |||
| 104 | // Then, serialize the group/frame relationship | ||
| 105 | for (const frame* f : arg.getFrames()) | ||
| 106 | { | ||
| 107 | std::list<field> fields; | ||
| 108 | |||
| 109 | fields.emplace_back("group_id", arg.getId()); | ||
| 110 | fields.emplace_back("frame_id", f->getId()); | ||
| 111 | |||
| 112 | db.insertIntoTable("groups_frames", std::move(fields)); | ||
| 113 | } | ||
| 114 | |||
| 115 | return db; | ||
| 116 | } | ||
| 117 | |||
| 118 | }; | ||
| 119 | }; | ||
| diff --git a/generator/group.h b/generator/group.h new file mode 100644 index 0000000..efb8c5d --- /dev/null +++ b/generator/group.h | |||
| @@ -0,0 +1,80 @@ | |||
#ifndef GROUP_H_EDAFB5DC
#define GROUP_H_EDAFB5DC

#include <map>
#include <set>
#include <string>
#include <cassert>
#include "role.h"

namespace verbly {
  namespace generator {

    class frame;
    class database;

    // A VerbNet-style verb class: a set of named thematic roles plus the
    // syntactic frames they can appear in. Groups form an inheritance
    // hierarchy; a child group exposes its parent's roles and frames in
    // addition to its own.
    class group {
    public:

      // Constructor

      group();

      // Mutators

      // Sets the parent group. Only a pointer is stored, so the parent must
      // outlive this group.
      void setParent(const group& parent);

      void addRole(std::string name, role r);

      // Registers a frame. Only a pointer is stored, so the frame must
      // outlive this group.
      void addFrame(const frame& f);

      // Accessors

      // Unique sequential identifier assigned at construction.
      int getId() const
      {
        return id_;
      }

      bool hasParent() const
      {
        return (parent_ != nullptr);
      }

      const group& getParent() const
      {
        // Calling code should always call hasParent first
        assert(parent_ != nullptr);

        return *parent_;
      }

      // Returns the names of all roles, including those inherited from the
      // parent chain.
      std::set<std::string> getRoles() const;

      // Returns the role with the given name, searching the parent chain;
      // throws std::invalid_argument if the role does not exist.
      const role& getRole(std::string name) const;

      // Returns all frames, including those inherited from the parent chain.
      std::set<const frame*> getFrames() const;

    private:

      static int nextId_;

      const int id_;

      const group* parent_ = nullptr;
      std::map<std::string, role> roles_;
      std::set<const frame*> frames_;

      // Caches

      // Names of the roles declared directly on this group; presumably kept
      // in sync with roles_ by addRole — confirm in group.cpp.
      std::set<std::string> roleNames_;

    };

    // Serializer

    database& operator<<(database& db, const group& arg);

  };
};

#endif /* end of include guard: GROUP_H_EDAFB5DC */
| diff --git a/generator/lemma.cpp b/generator/lemma.cpp new file mode 100644 index 0000000..e66b153 --- /dev/null +++ b/generator/lemma.cpp | |||
| @@ -0,0 +1,65 @@ | |||
| 1 | #include "lemma.h" | ||
| 2 | #include <list> | ||
| 3 | #include <cassert> | ||
| 4 | #include "field.h" | ||
| 5 | #include "database.h" | ||
| 6 | #include "form.h" | ||
| 7 | |||
| 8 | namespace verbly { | ||
| 9 | namespace generator { | ||
| 10 | |||
| 11 | int lemma::nextId_ = 0; | ||
| 12 | |||
| 13 | lemma::lemma(const form& baseForm) : | ||
| 14 | id_(nextId_++), | ||
| 15 | baseForm_(baseForm) | ||
| 16 | { | ||
| 17 | inflections_[inflection::base] = {&baseForm}; | ||
| 18 | } | ||
| 19 | |||
| 20 | void lemma::addInflection(inflection type, const form& f) | ||
| 21 | { | ||
| 22 | // There can only be one base form. | ||
| 23 | assert(type != inflection::base); | ||
| 24 | |||
| 25 | inflections_[type].insert(&f); | ||
| 26 | } | ||
| 27 | |||
| 28 | std::set<const form*> lemma::getInflections(inflection type) const | ||
| 29 | { | ||
| 30 | if (inflections_.count(type)) | ||
| 31 | { | ||
| 32 | return inflections_.at(type); | ||
| 33 | } else { | ||
| 34 | return {}; | ||
| 35 | } | ||
| 36 | } | ||
| 37 | |||
| 38 | database& operator<<(database& db, const lemma& arg) | ||
| 39 | { | ||
| 40 | for (inflection type : { | ||
| 41 | inflection::base, | ||
| 42 | inflection::plural, | ||
| 43 | inflection::comparative, | ||
| 44 | inflection::superlative, | ||
| 45 | inflection::past_tense, | ||
| 46 | inflection::past_participle, | ||
| 47 | inflection::ing_form, | ||
| 48 | inflection::s_form}) | ||
| 49 | { | ||
| 50 | for (const form* f : arg.getInflections(type)) | ||
| 51 | { | ||
| 52 | std::list<field> fields; | ||
| 53 | fields.emplace_back("lemma_id", arg.getId()); | ||
| 54 | fields.emplace_back("form_id", f->getId()); | ||
| 55 | fields.emplace_back("category", static_cast<int>(type)); | ||
| 56 | |||
| 57 | db.insertIntoTable("lemmas_forms", std::move(fields)); | ||
| 58 | } | ||
| 59 | } | ||
| 60 | |||
| 61 | return db; | ||
| 62 | } | ||
| 63 | |||
| 64 | }; | ||
| 65 | }; | ||
| diff --git a/generator/lemma.h b/generator/lemma.h new file mode 100644 index 0000000..6452e08 --- /dev/null +++ b/generator/lemma.h | |||
| @@ -0,0 +1,58 @@ | |||
#ifndef LEMMA_H_D73105A7
#define LEMMA_H_D73105A7

#include <string>
#include <map>
#include <set>
#include "enums.h"

namespace verbly {
  namespace generator {

    class database;
    class form;

    // A dictionary headword: a base form plus its inflected forms, keyed by
    // inflection category (plural, past tense, etc.).
    class lemma {
    public:

      // Constructors

      // Only pointers to forms are stored; every form passed in must outlive
      // this lemma.
      explicit lemma(const form& baseForm);

      // Mutators

      // Attaches an inflected form. type must not be inflection::base (the
      // base form is fixed at construction).
      void addInflection(inflection type, const form& f);

      // Accessors

      // Unique sequential identifier assigned at construction.
      int getId() const
      {
        return id_;
      }

      const form& getBaseForm() const
      {
        return baseForm_;
      }

      // Returns the forms registered for the given inflection, or an empty
      // set if there are none.
      std::set<const form*> getInflections(inflection type) const;

    private:

      static int nextId_;

      const int id_;
      const form& baseForm_;

      std::map<inflection, std::set<const form*>> inflections_;

    };

    // Serializer

    database& operator<<(database& db, const lemma& arg);

  };
};

#endif /* end of include guard: LEMMA_H_D73105A7 */
| diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..827c963 --- /dev/null +++ b/generator/main.cpp | |||
| @@ -0,0 +1,40 @@ | |||
| 1 | #include <iostream> | ||
| 2 | #include <exception> | ||
| 3 | #include "generator.h" | ||
| 4 | |||
| 5 | void printUsage() | ||
| 6 | { | ||
| 7 | std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output" << std::endl; | ||
| 8 | std::cout << "verbnet :: path to a VerbNet data directory" << std::endl; | ||
| 9 | std::cout << "agid :: path to an AGID infl.txt file" << std::endl; | ||
| 10 | std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; | ||
| 11 | std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; | ||
| 12 | std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl; | ||
| 13 | std::cout << "output :: datafile output path" << std::endl; | ||
| 14 | } | ||
| 15 | |||
| 16 | int main(int argc, char** argv) | ||
| 17 | { | ||
| 18 | if (argc == 7) | ||
| 19 | { | ||
| 20 | try | ||
| 21 | { | ||
| 22 | verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); | ||
| 23 | |||
| 24 | try | ||
| 25 | { | ||
| 26 | app.run(); | ||
| 27 | } catch (const std::exception& e) | ||
| 28 | { | ||
| 29 | std::cout << e.what() << std::endl; | ||
| 30 | } | ||
| 31 | } catch (const std::exception& e) | ||
| 32 | { | ||
| 33 | std::cout << e.what() << std::endl; | ||
| 34 | printUsage(); | ||
| 35 | } | ||
| 36 | } else { | ||
| 37 | std::cout << "verbly datafile generator" << std::endl; | ||
| 38 | printUsage(); | ||
| 39 | } | ||
| 40 | } | ||
| diff --git a/generator/notion.cpp b/generator/notion.cpp new file mode 100644 index 0000000..290d982 --- /dev/null +++ b/generator/notion.cpp | |||
| @@ -0,0 +1,85 @@ | |||
| 1 | #include "notion.h" | ||
| 2 | #include <string> | ||
| 3 | #include <list> | ||
| 4 | #include "database.h" | ||
| 5 | #include "field.h" | ||
| 6 | |||
| 7 | namespace verbly { | ||
| 8 | namespace generator { | ||
| 9 | |||
| 10 | int notion::nextId_ = 0; | ||
| 11 | |||
| 12 | notion::notion( | ||
| 13 | part_of_speech partOfSpeech) : | ||
| 14 | id_(nextId_++), | ||
| 15 | partOfSpeech_(partOfSpeech) | ||
| 16 | { | ||
| 17 | } | ||
| 18 | |||
| 19 | notion::notion( | ||
| 20 | part_of_speech partOfSpeech, | ||
| 21 | int wnid) : | ||
| 22 | id_(nextId_++), | ||
| 23 | partOfSpeech_(partOfSpeech), | ||
| 24 | wnid_(wnid), | ||
| 25 | hasWnid_(true) | ||
| 26 | { | ||
| 27 | } | ||
| 28 | |||
| 29 | void notion::incrementNumOfImages() | ||
| 30 | { | ||
| 31 | // Calling code should always call hasWnid and check that the notion is a noun first. | ||
| 32 | assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun)); | ||
| 33 | |||
| 34 | numOfImages_++; | ||
| 35 | } | ||
| 36 | |||
| 37 | void notion::setPrepositionGroups(std::list<std::string> groups) | ||
| 38 | { | ||
| 39 | // Calling code should always check that the notion is a preposition first. | ||
| 40 | assert(partOfSpeech_ == part_of_speech::preposition); | ||
| 41 | |||
| 42 | prepositionGroups_ = groups; | ||
| 43 | } | ||
| 44 | |||
| 45 | database& operator<<(database& db, const notion& arg) | ||
| 46 | { | ||
| 47 | // First, serialize the notion | ||
| 48 | { | ||
| 49 | std::list<field> fields; | ||
| 50 | |||
| 51 | fields.emplace_back("notion_id", arg.getId()); | ||
| 52 | fields.emplace_back("part_of_speech", static_cast<int>(arg.getPartOfSpeech())); | ||
| 53 | |||
| 54 | if (arg.hasWnid()) | ||
| 55 | { | ||
| 56 | fields.emplace_back("wnid", arg.getWnid()); | ||
| 57 | |||
| 58 | if (arg.getPartOfSpeech() == part_of_speech::noun) | ||
| 59 | { | ||
| 60 | fields.emplace_back("images", arg.getNumOfImages()); | ||
| 61 | } | ||
| 62 | } | ||
| 63 | |||
| 64 | db.insertIntoTable("notions", std::move(fields)); | ||
| 65 | } | ||
| 66 | |||
| 67 | // Next, serialize the is_a relationship if this is a preposition | ||
| 68 | if (arg.getPartOfSpeech() == part_of_speech::preposition) | ||
| 69 | { | ||
| 70 | for (std::string group : arg.getPrepositionGroups()) | ||
| 71 | { | ||
| 72 | std::list<field> fields; | ||
| 73 | |||
| 74 | fields.emplace_back("notion_id", arg.getId()); | ||
| 75 | fields.emplace_back("groupname", group); | ||
| 76 | |||
| 77 | db.insertIntoTable("is_a", std::move(fields)); | ||
| 78 | } | ||
| 79 | } | ||
| 80 | |||
| 81 | return db; | ||
| 82 | } | ||
| 83 | |||
| 84 | }; | ||
| 85 | }; | ||
| diff --git a/generator/notion.h b/generator/notion.h new file mode 100644 index 0000000..76210de --- /dev/null +++ b/generator/notion.h | |||
| @@ -0,0 +1,91 @@ | |||
#ifndef NOTION_H_221DE2BC
#define NOTION_H_221DE2BC

#include <cassert>
#include <list>
#include <string>
#include "enums.h"

namespace verbly {
  namespace generator {

    class database;

    // A word sense: a part of speech optionally tied to a WordNet synset id.
    // Nouns additionally track an ImageNet image count; prepositions track
    // the preposition groups they belong to.
    class notion {
    public:

      // Constructors

      // Notion with no WordNet synset (hasWnid() will be false).
      explicit notion(part_of_speech partOfSpeech);

      // Notion backed by a WordNet synset id.
      notion(part_of_speech partOfSpeech, int wnid);

      // Mutators

      // Increments the ImageNet image counter; only valid for WordNet nouns.
      void incrementNumOfImages();

      // Replaces the preposition group list; only valid for prepositions.
      void setPrepositionGroups(std::list<std::string> groups);

      // Accessors

      // Unique sequential identifier assigned at construction.
      int getId() const
      {
        return id_;
      }

      part_of_speech getPartOfSpeech() const
      {
        return partOfSpeech_;
      }

      bool hasWnid() const
      {
        return hasWnid_;
      }

      int getWnid() const
      {
        // Calling code should always call hasWnid first.
        assert(hasWnid_);

        return wnid_;
      }

      int getNumOfImages() const
      {
        // Calling code should always call hasWnid and check that the notion is a noun first.
        assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun));

        return numOfImages_;
      }

      std::list<std::string> getPrepositionGroups() const
      {
        // Calling code should always check that the notion is a preposition first.
        assert(partOfSpeech_ == part_of_speech::preposition);

        return prepositionGroups_;
      }

    private:

      static int nextId_;

      const int id_;
      const part_of_speech partOfSpeech_;
      const int wnid_ = 0;
      const bool hasWnid_ = false;

      int numOfImages_ = 0;
      std::list<std::string> prepositionGroups_;

    };

    // Serializer

    database& operator<<(database& db, const notion& arg);

  };
};

#endif /* end of include guard: NOTION_H_221DE2BC */
| diff --git a/generator/part.cpp b/generator/part.cpp new file mode 100644 index 0000000..dbd4e11 --- /dev/null +++ b/generator/part.cpp | |||
| @@ -0,0 +1,336 @@ | |||
| 1 | #include "part.h" | ||
| 2 | #include <stdexcept> | ||
| 3 | #include "selrestr.h" | ||
| 4 | |||
| 5 | namespace verbly { | ||
| 6 | namespace generator { | ||
| 7 | |||
| 8 | part part::createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs) | ||
| 9 | { | ||
| 10 | part p(type::noun_phrase); | ||
| 11 | |||
| 12 | new(&p.noun_phrase_.role) std::string(std::move(role)); | ||
| 13 | new(&p.noun_phrase_.selrestrs) selrestr(std::move(selrestrs)); | ||
| 14 | new(&p.noun_phrase_.synrestrs) std::set<std::string>(std::move(synrestrs)); | ||
| 15 | |||
| 16 | return p; | ||
| 17 | } | ||
| 18 | |||
| 19 | part part::createVerb() | ||
| 20 | { | ||
| 21 | return part(type::verb); | ||
| 22 | } | ||
| 23 | |||
| 24 | part part::createPreposition(std::set<std::string> choices, bool literal) | ||
| 25 | { | ||
| 26 | part p(type::preposition); | ||
| 27 | |||
| 28 | new(&p.preposition_.choices) std::set<std::string>(std::move(choices)); | ||
| 29 | p.preposition_.literal = literal; | ||
| 30 | |||
| 31 | return p; | ||
| 32 | } | ||
| 33 | |||
| 34 | part part::createAdjective() | ||
| 35 | { | ||
| 36 | return part(type::adjective); | ||
| 37 | } | ||
| 38 | |||
| 39 | part part::createAdverb() | ||
| 40 | { | ||
| 41 | return part(type::adverb); | ||
| 42 | } | ||
| 43 | |||
| 44 | part part::createLiteral(std::string value) | ||
| 45 | { | ||
| 46 | part p(type::literal); | ||
| 47 | |||
| 48 | new(&p.literal_) std::string(std::move(value)); | ||
| 49 | |||
| 50 | return p; | ||
| 51 | } | ||
| 52 | |||
| 53 | part::part(const part& other) | ||
| 54 | { | ||
| 55 | type_ = other.type_; | ||
| 56 | |||
| 57 | switch (type_) | ||
| 58 | { | ||
| 59 | case type::noun_phrase: | ||
| 60 | { | ||
| 61 | new(&noun_phrase_.role) std::string(other.noun_phrase_.role); | ||
| 62 | new(&noun_phrase_.selrestrs) selrestr(other.noun_phrase_.selrestrs); | ||
| 63 | new(&noun_phrase_.synrestrs) std::set<std::string>(other.noun_phrase_.synrestrs); | ||
| 64 | |||
| 65 | break; | ||
| 66 | } | ||
| 67 | |||
| 68 | case type::preposition: | ||
| 69 | { | ||
| 70 | new(&preposition_.choices) std::set<std::string>(other.preposition_.choices); | ||
| 71 | preposition_.literal = other.preposition_.literal; | ||
| 72 | |||
| 73 | break; | ||
| 74 | } | ||
| 75 | |||
| 76 | case type::literal: | ||
| 77 | { | ||
| 78 | new(&literal_) std::string(other.literal_); | ||
| 79 | |||
| 80 | break; | ||
| 81 | } | ||
| 82 | |||
| 83 | case type::verb: | ||
| 84 | case type::adjective: | ||
| 85 | case type::adverb: | ||
| 86 | case type::invalid: | ||
| 87 | { | ||
| 88 | break; | ||
| 89 | } | ||
| 90 | } | ||
| 91 | } | ||
| 92 | |||
| 93 | part::part(part&& other) : part() | ||
| 94 | { | ||
| 95 | swap(*this, other); | ||
| 96 | } | ||
| 97 | |||
| 98 | part& part::operator=(part other) | ||
| 99 | { | ||
| 100 | swap(*this, other); | ||
| 101 | |||
| 102 | return *this; | ||
| 103 | } | ||
| 104 | |||
| 105 | void swap(part& first, part& second) | ||
| 106 | { | ||
| 107 | using type = part::type; | ||
| 108 | |||
| 109 | type tempType = first.type_; | ||
| 110 | std::string tempRole; | ||
| 111 | selrestr tempSelrestrs; | ||
| 112 | std::set<std::string> tempSynrestrs; | ||
| 113 | std::set<std::string> tempChoices; | ||
| 114 | bool tempPrepLiteral; | ||
| 115 | std::string tempLiteralValue; | ||
| 116 | |||
| 117 | switch (tempType) | ||
| 118 | { | ||
| 119 | case type::noun_phrase: | ||
| 120 | { | ||
| 121 | tempRole = std::move(first.noun_phrase_.role); | ||
| 122 | tempSelrestrs = std::move(first.noun_phrase_.selrestrs); | ||
| 123 | tempSynrestrs = std::move(first.noun_phrase_.synrestrs); | ||
| 124 | |||
| 125 | break; | ||
| 126 | } | ||
| 127 | |||
| 128 | case type::preposition: | ||
| 129 | { | ||
| 130 | tempChoices = std::move(first.preposition_.choices); | ||
| 131 | tempPrepLiteral = first.preposition_.literal; | ||
| 132 | |||
| 133 | break; | ||
| 134 | } | ||
| 135 | |||
| 136 | case type::literal: | ||
| 137 | { | ||
| 138 | tempLiteralValue = std::move(first.literal_); | ||
| 139 | |||
| 140 | break; | ||
| 141 | } | ||
| 142 | |||
| 143 | case type::verb: | ||
| 144 | case type::adjective: | ||
| 145 | case type::adverb: | ||
| 146 | case type::invalid: | ||
| 147 | { | ||
| 148 | break; | ||
| 149 | } | ||
| 150 | } | ||
| 151 | |||
| 152 | first.~part(); | ||
| 153 | |||
| 154 | first.type_ = second.type_; | ||
| 155 | |||
| 156 | switch (first.type_) | ||
| 157 | { | ||
| 158 | case type::noun_phrase: | ||
| 159 | { | ||
| 160 | new(&first.noun_phrase_.role) std::string(std::move(second.noun_phrase_.role)); | ||
| 161 | new(&first.noun_phrase_.selrestrs) selrestr(std::move(second.noun_phrase_.selrestrs)); | ||
| 162 | new(&first.noun_phrase_.synrestrs) std::set<std::string>(std::move(second.noun_phrase_.synrestrs)); | ||
| 163 | |||
| 164 | break; | ||
| 165 | } | ||
| 166 | |||
| 167 | case type::preposition: | ||
| 168 | { | ||
| 169 | new(&first.preposition_.choices) std::set<std::string>(std::move(second.preposition_.choices)); | ||
| 170 | first.preposition_.literal = second.preposition_.literal; | ||
| 171 | |||
| 172 | break; | ||
| 173 | } | ||
| 174 | |||
| 175 | case type::literal: | ||
| 176 | { | ||
| 177 | new(&first.literal_) std::string(std::move(second.literal_)); | ||
| 178 | |||
| 179 | break; | ||
| 180 | } | ||
| 181 | |||
| 182 | case type::verb: | ||
| 183 | case type::adjective: | ||
| 184 | case type::adverb: | ||
| 185 | case type::invalid: | ||
| 186 | { | ||
| 187 | break; | ||
| 188 | } | ||
| 189 | } | ||
| 190 | |||
| 191 | second.~part(); | ||
| 192 | |||
| 193 | second.type_ = tempType; | ||
| 194 | |||
| 195 | switch (second.type_) | ||
| 196 | { | ||
| 197 | case type::noun_phrase: | ||
| 198 | { | ||
| 199 | new(&second.noun_phrase_.role) std::string(std::move(tempRole)); | ||
| 200 | new(&second.noun_phrase_.selrestrs) selrestr(std::move(tempSelrestrs)); | ||
| 201 | new(&second.noun_phrase_.synrestrs) std::set<std::string>(std::move(tempSynrestrs)); | ||
| 202 | |||
| 203 | break; | ||
| 204 | } | ||
| 205 | |||
| 206 | case type::preposition: | ||
| 207 | { | ||
| 208 | new(&second.preposition_.choices) std::set<std::string>(std::move(tempChoices)); | ||
| 209 | second.preposition_.literal = tempPrepLiteral; | ||
| 210 | |||
| 211 | break; | ||
| 212 | } | ||
| 213 | |||
| 214 | case type::literal: | ||
| 215 | { | ||
| 216 | new(&second.literal_) std::string(std::move(tempLiteralValue)); | ||
| 217 | |||
| 218 | break; | ||
| 219 | } | ||
| 220 | |||
| 221 | case type::verb: | ||
| 222 | case type::adjective: | ||
| 223 | case type::adverb: | ||
| 224 | case type::invalid: | ||
| 225 | { | ||
| 226 | break; | ||
| 227 | } | ||
| 228 | } | ||
| 229 | } | ||
| 230 | |||
| 231 | part::~part() | ||
| 232 | { | ||
| 233 | switch (type_) | ||
| 234 | { | ||
| 235 | case type::noun_phrase: | ||
| 236 | { | ||
| 237 | using string_type = std::string; | ||
| 238 | using set_type = std::set<std::string>; | ||
| 239 | |||
| 240 | noun_phrase_.role.~string_type(); | ||
| 241 | noun_phrase_.selrestrs.~selrestr(); | ||
| 242 | noun_phrase_.synrestrs.~set_type(); | ||
| 243 | |||
| 244 | break; | ||
| 245 | } | ||
| 246 | |||
| 247 | case type::preposition: | ||
| 248 | { | ||
| 249 | using set_type = std::set<std::string>; | ||
| 250 | |||
| 251 | preposition_.choices.~set_type(); | ||
| 252 | |||
| 253 | break; | ||
| 254 | } | ||
| 255 | |||
| 256 | case type::literal: | ||
| 257 | { | ||
| 258 | using string_type = std::string; | ||
| 259 | |||
| 260 | literal_.~string_type(); | ||
| 261 | |||
| 262 | break; | ||
| 263 | } | ||
| 264 | |||
| 265 | case type::verb: | ||
| 266 | case type::adjective: | ||
| 267 | case type::adverb: | ||
| 268 | case type::invalid: | ||
| 269 | { | ||
| 270 | break; | ||
| 271 | } | ||
| 272 | } | ||
| 273 | } | ||
| 274 | |||
| 275 | std::string part::getNounRole() const | ||
| 276 | { | ||
| 277 | if (type_ == type::noun_phrase) | ||
| 278 | { | ||
| 279 | return noun_phrase_.role; | ||
| 280 | } else { | ||
| 281 | throw std::domain_error("part::getNounRole is only valid for noun phrase parts"); | ||
| 282 | } | ||
| 283 | } | ||
| 284 | |||
| 285 | selrestr part::getNounSelrestrs() const | ||
| 286 | { | ||
| 287 | if (type_ == type::noun_phrase) | ||
| 288 | { | ||
| 289 | return noun_phrase_.selrestrs; | ||
| 290 | } else { | ||
| 291 | throw std::domain_error("part::getNounSelrestrs is only valid for noun phrase parts"); | ||
| 292 | } | ||
| 293 | } | ||
| 294 | |||
| 295 | std::set<std::string> part::getNounSynrestrs() const | ||
| 296 | { | ||
| 297 | if (type_ == type::noun_phrase) | ||
| 298 | { | ||
| 299 | return noun_phrase_.synrestrs; | ||
| 300 | } else { | ||
| 301 | throw std::domain_error("part::getNounSynrestrs is only valid for noun phrase parts"); | ||
| 302 | } | ||
| 303 | } | ||
| 304 | |||
| 305 | std::set<std::string> part::getPrepositionChoices() const | ||
| 306 | { | ||
| 307 | if (type_ == type::preposition) | ||
| 308 | { | ||
| 309 | return preposition_.choices; | ||
| 310 | } else { | ||
| 311 | throw std::domain_error("part::getPrepositionChoices is only valid for preposition parts"); | ||
| 312 | } | ||
| 313 | } | ||
| 314 | |||
| 315 | bool part::isPrepositionLiteral() const | ||
| 316 | { | ||
| 317 | if (type_ == type::preposition) | ||
| 318 | { | ||
| 319 | return preposition_.literal; | ||
| 320 | } else { | ||
| 321 | throw std::domain_error("part::isPrepositionLiteral is only valid for preposition parts"); | ||
| 322 | } | ||
| 323 | } | ||
| 324 | |||
| 325 | std::string part::getLiteralValue() const | ||
| 326 | { | ||
| 327 | if (type_ == type::literal) | ||
| 328 | { | ||
| 329 | return literal_; | ||
| 330 | } else { | ||
| 331 | throw std::domain_error("part::getLiteralValue is only valid for literal parts"); | ||
| 332 | } | ||
| 333 | } | ||
| 334 | |||
| 335 | }; | ||
| 336 | }; | ||
| diff --git a/generator/part.h b/generator/part.h new file mode 100644 index 0000000..d044630 --- /dev/null +++ b/generator/part.h | |||
| @@ -0,0 +1,114 @@ | |||
#ifndef PART_H_FB54F361
#define PART_H_FB54F361

#include <string>
#include <set>
#include "selrestr.h"

namespace verbly {
  namespace generator {

    // One element of a syntactic frame, implemented as a hand-rolled tagged
    // union: the active alternative is tracked by type_, and the union
    // members' lifetimes are managed manually in part.cpp (placement new /
    // explicit destructor calls).
    class part {
    public:
      enum class type {
        invalid = -1,
        noun_phrase = 0,
        verb = 1,
        preposition = 2,
        adjective = 3,
        adverb = 4,
        literal = 5
      };

      // Static factories
      // Construction goes through these rather than public constructors so
      // the correct union member is initialized for each tag.

      static part createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs);

      static part createVerb();

      static part createPreposition(std::set<std::string> choices, bool literal);

      static part createAdjective();

      static part createAdverb();

      static part createLiteral(std::string value);

      // Copy and move constructors

      part(const part& other);

      part(part&& other);

      // Assignment
      // By-value parameter: unified copy-and-swap assignment.

      part& operator=(part other);

      // Swap

      friend void swap(part& first, part& second);

      // Destructor
      // Non-trivial: destroys whichever union member is active.

      ~part();

      // General accessors

      type getType() const
      {
        return type_;
      }

      // Noun phrase accessors
      // Throw std::domain_error unless getType() == type::noun_phrase.

      std::string getNounRole() const;

      selrestr getNounSelrestrs() const;

      std::set<std::string> getNounSynrestrs() const;

      // Preposition accessors
      // Throw std::domain_error unless getType() == type::preposition.

      std::set<std::string> getPrepositionChoices() const;

      bool isPrepositionLiteral() const;

      // Literal accessors
      // Throws std::domain_error unless getType() == type::literal.

      std::string getLiteralValue() const;

    private:

      // Private constructors

      // Default: tag stays invalid, no union member is constructed.
      part()
      {
      }

      // Sets the tag only; factories construct the matching payload.
      part(type t) : type_(t)
      {
      }

      // Data

      union {
        struct {
          std::string role;
          selrestr selrestrs;
          std::set<std::string> synrestrs;
        } noun_phrase_;
        struct {
          std::set<std::string> choices;
          bool literal;
        } preposition_;
        std::string literal_;
      };

      type type_ = type::invalid;

    };

  };
};

#endif /* end of include guard: PART_H_FB54F361 */
| diff --git a/generator/progress.h b/generator/progress.h index 81f07a3..fcb680d 100644 --- a/generator/progress.h +++ b/generator/progress.h | |||
| @@ -3,48 +3,54 @@ | |||
| 3 | 3 | ||
| 4 | #include <string> | 4 | #include <string> |
| 5 | 5 | ||
| 6 | class progress { | 6 | namespace verbly { |
| 7 | private: | 7 | namespace generator { |
| 8 | std::string message; | ||
| 9 | int total; | ||
| 10 | int cur = 0; | ||
| 11 | int lprint = 0; | ||
| 12 | 8 | ||
| 13 | public: | 9 | class progress { |
| 14 | progress(std::string message, int total) : message(message), total(total) | 10 | private: |
| 15 | { | 11 | std::string message; |
| 16 | std::cout << message << " 0%" << std::flush; | 12 | int total; |
| 17 | } | 13 | int cur = 0; |
| 14 | int lprint = 0; | ||
| 18 | 15 | ||
| 19 | void update(int val) | 16 | public: |
| 20 | { | 17 | progress(std::string message, int total) : message(message), total(total) |
| 21 | if (val <= total) | 18 | { |
| 22 | { | 19 | std::cout << message << " 0%" << std::flush; |
| 23 | cur = val; | 20 | } |
| 24 | } else { | 21 | |
| 25 | cur = total; | 22 | void update(int val) |
| 26 | } | 23 | { |
| 24 | if (val <= total) | ||
| 25 | { | ||
| 26 | cur = val; | ||
| 27 | } else { | ||
| 28 | cur = total; | ||
| 29 | } | ||
| 27 | 30 | ||
| 28 | int pp = cur * 100 / total; | 31 | int pp = cur * 100 / total; |
| 29 | if (pp != lprint) | 32 | if (pp != lprint) |
| 30 | { | 33 | { |
| 31 | lprint = pp; | 34 | lprint = pp; |
| 32 | 35 | ||
| 33 | std::cout << "\b\b\b\b" << std::right; | 36 | std::cout << "\b\b\b\b" << std::right; |
| 34 | std::cout.width(3); | 37 | std::cout.width(3); |
| 35 | std::cout << pp << "%" << std::flush; | 38 | std::cout << pp << "%" << std::flush; |
| 36 | } | 39 | } |
| 37 | } | 40 | } |
| 41 | |||
| 42 | void update() | ||
| 43 | { | ||
| 44 | update(cur+1); | ||
| 45 | } | ||
| 38 | 46 | ||
| 39 | void update() | 47 | ~progress() |
| 40 | { | 48 | { |
| 41 | update(cur+1); | 49 | std::cout << "\b\b\b\b100%" << std::endl; |
| 42 | } | 50 | } |
| 51 | }; | ||
| 43 | 52 | ||
| 44 | ~progress() | 53 | }; |
| 45 | { | ||
| 46 | std::cout << "\b\b\b\b100%" << std::endl; | ||
| 47 | } | ||
| 48 | }; | 54 | }; |
| 49 | 55 | ||
| 50 | #endif /* end of include guard: PROGRESS_H_A34EF856 */ | 56 | #endif /* end of include guard: PROGRESS_H_A34EF856 */ |
| diff --git a/generator/pronunciation.cpp b/generator/pronunciation.cpp new file mode 100644 index 0000000..eb07607 --- /dev/null +++ b/generator/pronunciation.cpp | |||
| @@ -0,0 +1,87 @@ | |||
| 1 | #include "pronunciation.h" | ||
| 2 | #include <list> | ||
| 3 | #include <algorithm> | ||
| 4 | #include <cctype> | ||
| 5 | #include <iterator> | ||
| 6 | #include "database.h" | ||
| 7 | #include "field.h" | ||
| 8 | #include "../lib/util.h" | ||
| 9 | |||
| 10 | namespace verbly { | ||
| 11 | namespace generator { | ||
| 12 | |||
| 13 | int pronunciation::nextId_ = 0; | ||
| 14 | |||
| 15 | pronunciation::pronunciation(std::string phonemes) : | ||
| 16 | id_(nextId_++), | ||
| 17 | phonemes_(phonemes) | ||
| 18 | { | ||
| 19 | auto phonemeList = split<std::list<std::string>>(phonemes, " "); | ||
| 20 | |||
| 21 | auto rhymeStart = std::find_if(std::begin(phonemeList), std::end(phonemeList), [] (std::string phoneme) { | ||
| 22 | return phoneme.find("1") != std::string::npos; | ||
| 23 | }); | ||
| 24 | |||
| 25 | // Rhyme detection | ||
| 26 | if (rhymeStart != std::end(phonemeList)) | ||
| 27 | { | ||
| 28 | std::list<std::string> rhymePhonemes; | ||
| 29 | |||
| 30 | std::transform(rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), [] (std::string phoneme) { | ||
| 31 | std::string naked; | ||
| 32 | |||
| 33 | std::remove_copy_if(std::begin(phoneme), std::end(phoneme), std::back_inserter(naked), [] (char ch) { | ||
| 34 | return std::isdigit(ch); | ||
| 35 | }); | ||
| 36 | |||
| 37 | return naked; | ||
| 38 | }); | ||
| 39 | |||
| 40 | rhyme_ = implode(std::begin(rhymePhonemes), std::end(rhymePhonemes), " "); | ||
| 41 | |||
| 42 | if (rhymeStart != std::begin(phonemeList)) | ||
| 43 | { | ||
| 44 | prerhyme_ = *std::prev(rhymeStart); | ||
| 45 | } | ||
| 46 | } | ||
| 47 | |||
| 48 | // Syllable/stress | ||
| 49 | for (std::string phoneme : phonemeList) | ||
| 50 | { | ||
| 51 | if (std::isdigit(phoneme.back())) | ||
| 52 | { | ||
| 53 | // It's a vowel! | ||
| 54 | syllables_++; | ||
| 55 | |||
| 56 | if (phoneme.back() == '1') | ||
| 57 | { | ||
| 58 | stress_.push_back('1'); | ||
| 59 | } else { | ||
| 60 | stress_.push_back('0'); | ||
| 61 | } | ||
| 62 | } | ||
| 63 | } | ||
| 64 | } | ||
| 65 | |||
| 66 | database& operator<<(database& db, const pronunciation& arg) | ||
| 67 | { | ||
| 68 | std::list<field> fields; | ||
| 69 | |||
| 70 | fields.emplace_back("pronunciation_id", arg.getId()); | ||
| 71 | fields.emplace_back("phonemes", arg.getPhonemes()); | ||
| 72 | fields.emplace_back("syllables", arg.getSyllables()); | ||
| 73 | fields.emplace_back("stress", arg.getStress()); | ||
| 74 | |||
| 75 | if (arg.hasRhyme()) | ||
| 76 | { | ||
| 77 | fields.emplace_back("rhyme", arg.getRhymePhonemes()); | ||
| 78 | fields.emplace_back("prerhyme", arg.getPrerhyme()); | ||
| 79 | } | ||
| 80 | |||
| 81 | db.insertIntoTable("pronunciations", std::move(fields)); | ||
| 82 | |||
| 83 | return db; | ||
| 84 | } | ||
| 85 | |||
| 86 | }; | ||
| 87 | }; | ||
| diff --git a/generator/pronunciation.h b/generator/pronunciation.h new file mode 100644 index 0000000..81be6c4 --- /dev/null +++ b/generator/pronunciation.h | |||
| @@ -0,0 +1,82 @@ | |||
| 1 | #ifndef PRONUNCIATION_H_584A08DD | ||
| 2 | #define PRONUNCIATION_H_584A08DD | ||
| 3 | |||
| 4 | #include <string> | ||
| 5 | #include <cassert> | ||
| 6 | |||
| 7 | namespace verbly { | ||
| 8 | namespace generator { | ||
| 9 | |||
| 10 | class database; | ||
| 11 | |||
// A single pronunciation of a word form, parsed from a space-delimited
// phoneme string. The constructor (defined in pronunciation.cpp) derives
// the rhyming part, the phoneme preceding it, the syllable count, and the
// stress pattern.
class pronunciation {
public:

  // Constructor

  explicit pronunciation(std::string phonemes);

  // Accessors

  // Unique sequential identifier assigned at construction.
  int getId() const
  {
    return id_;
  }

  // The full phoneme string this object was constructed with.
  std::string getPhonemes() const
  {
    return phonemes_;
  }

  // True when a rhyming part was extracted from the phonemes.
  bool hasRhyme() const
  {
    return !rhyme_.empty();
  }

  // The phonemes of the rhyming part, stress digits stripped.
  std::string getRhymePhonemes() const
  {
    // Calling code should always call hasRhyme first.
    assert(!rhyme_.empty());

    return rhyme_;
  }

  // The phoneme immediately before the rhyming part; may be empty when
  // the rhyme starts at the first phoneme.
  std::string getPrerhyme() const
  {
    // Calling code should always call hasRhyme first.
    assert(!rhyme_.empty());

    return prerhyme_;
  }

  // Number of syllables, counted as one per vowel phoneme.
  int getSyllables() const
  {
    return syllables_;
  }

  // One character per syllable: '1' where the vowel carries stress
  // marker 1, '0' otherwise.
  std::string getStress() const
  {
    return stress_;
  }

private:

  // Source of the sequential ids handed out by the constructor.
  // NOTE(review): not synchronized; assumes single-threaded generation.
  static int nextId_;

  const int id_;
  const std::string phonemes_;
  std::string rhyme_;    // empty when no rhyme was found
  std::string prerhyme_; // may be empty even when rhyme_ is set
  int syllables_ = 0;
  std::string stress_;

};
| 74 | |||
| 75 | // Serializer | ||
| 76 | |||
| 77 | database& operator<<(database& db, const pronunciation& arg); | ||
| 78 | |||
| 79 | }; | ||
| 80 | }; | ||
| 81 | |||
| 82 | #endif /* end of include guard: PRONUNCIATION_H_584A08DD */ | ||
| diff --git a/generator/role.h b/generator/role.h new file mode 100644 index 0000000..5fa68b8 --- /dev/null +++ b/generator/role.h | |||
| @@ -0,0 +1,35 @@ | |||
| 1 | #ifndef ROLE_H_249F9A9C | ||
| 2 | #define ROLE_H_249F9A9C | ||
| 3 | |||
#include "selrestr.h"
#include <utility>
| 5 | |||
| 6 | namespace verbly { | ||
| 7 | namespace generator { | ||
| 8 | |||
| 9 | class role { | ||
| 10 | public: | ||
| 11 | |||
| 12 | // Mutators | ||
| 13 | |||
| 14 | void setSelrestrs(selrestr selrestrs) | ||
| 15 | { | ||
| 16 | selrestrs_ = selrestrs; | ||
| 17 | } | ||
| 18 | |||
| 19 | // Accessors | ||
| 20 | |||
| 21 | const selrestr& getSelrestrs() const | ||
| 22 | { | ||
| 23 | return selrestrs_; | ||
| 24 | } | ||
| 25 | |||
| 26 | private: | ||
| 27 | |||
| 28 | selrestr selrestrs_; | ||
| 29 | |||
| 30 | }; | ||
| 31 | |||
| 32 | }; | ||
| 33 | }; | ||
| 34 | |||
| 35 | #endif /* end of include guard: ROLE_H_249F9A9C */ | ||
| diff --git a/generator/schema.sql b/generator/schema.sql index 410b536..c3e54d8 100644 --- a/generator/schema.sql +++ b/generator/schema.sql | |||
| @@ -1,286 +1,204 @@ | |||
| 1 | DROP TABLE IF EXISTS `verbs`; | 1 | CREATE TABLE `notions` ( |
| 2 | CREATE TABLE `verbs` ( | 2 | `notion_id` INTEGER PRIMARY KEY, |
| 3 | `verb_id` INTEGER PRIMARY KEY, | 3 | `part_of_speech` SMALLINT NOT NULL, |
| 4 | `infinitive` VARCHAR(32) NOT NULL, | 4 | `wnid` INTEGER, |
| 5 | `past_tense` VARCHAR(32) NOT NULL, | 5 | `images` INTEGER |
| 6 | `past_participle` VARCHAR(32) NOT NULL, | ||
| 7 | `ing_form` VARCHAR(32) NOT NULL, | ||
| 8 | `s_form` VARCHAR(32) NOT NULL | ||
| 9 | ); | 6 | ); |
| 10 | 7 | ||
| 11 | DROP TABLE IF EXISTS `groups`; | 8 | CREATE UNIQUE INDEX `notion_by_wnid` ON `notions`(`wnid`); |
| 12 | CREATE TABLE `groups` ( | ||
| 13 | `group_id` INTEGER PRIMARY KEY, | ||
| 14 | `data` BLOB NOT NULL | ||
| 15 | ); | ||
| 16 | |||
| 17 | DROP TABLE IF EXISTS `frames`; | ||
| 18 | CREATE TABLE `frames` ( | ||
| 19 | `frame_id` INTEGER PRIMARY KEY, | ||
| 20 | `group_id` INTEGER NOT NULL, | ||
| 21 | `data` BLOB NOT NULL, | ||
| 22 | FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`) | ||
| 23 | ); | ||
| 24 | 9 | ||
| 25 | DROP TABLE IF EXISTS `verb_groups`; | ||
| 26 | CREATE TABLE `verb_groups` ( | ||
| 27 | `verb_id` INTEGER NOT NULL, | ||
| 28 | `group_id` INTEGER NOT NULL, | ||
| 29 | FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`), | ||
| 30 | FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`) | ||
| 31 | ); | ||
| 32 | |||
| 33 | DROP TABLE IF EXISTS `adjectives`; | ||
| 34 | CREATE TABLE `adjectives` ( | ||
| 35 | `adjective_id` INTEGER PRIMARY KEY, | ||
| 36 | `base_form` VARCHAR(32) NOT NULL, | ||
| 37 | `comparative` VARCHAR(32), | ||
| 38 | `superlative` VARCHAR(32), | ||
| 39 | `position` CHAR(1), | ||
| 40 | `complexity` INTEGER NOT NULL | ||
| 41 | ); | ||
| 42 | |||
| 43 | DROP TABLE IF EXISTS `adverbs`; | ||
| 44 | CREATE TABLE `adverbs` ( | ||
| 45 | `adverb_id` INTEGER PRIMARY KEY, | ||
| 46 | `base_form` VARCHAR(32) NOT NULL, | ||
| 47 | `comparative` VARCHAR(32), | ||
| 48 | `superlative` VARCHAR(32), | ||
| 49 | `complexity` INTEGER NOT NULL | ||
| 50 | ); | ||
| 51 | |||
| 52 | DROP TABLE IF EXISTS `nouns`; | ||
| 53 | CREATE TABLE `nouns` ( | ||
| 54 | `noun_id` INTEGER PRIMARY KEY, | ||
| 55 | `singular` VARCHAR(32) NOT NULL, | ||
| 56 | `plural` VARCHAR(32), | ||
| 57 | `proper` INTEGER(1) NOT NULL, | ||
| 58 | `complexity` INTEGER NOT NULL, | ||
| 59 | `images` INTEGER NOT NULL, | ||
| 60 | `wnid` INTEGER NOT NULL | ||
| 61 | ); | ||
| 62 | |||
| 63 | DROP TABLE IF EXISTS `hypernymy`; | ||
| 64 | CREATE TABLE `hypernymy` ( | 10 | CREATE TABLE `hypernymy` ( |
| 65 | `hypernym_id` INTEGER NOT NULL, | 11 | `hypernym_id` INTEGER NOT NULL, |
| 66 | `hyponym_id` INTEGER NOT NULL, | 12 | `hyponym_id` INTEGER NOT NULL |
| 67 | FOREIGN KEY (`hypernym_id`) REFERENCES `nouns`(`noun_id`), | ||
| 68 | FOREIGN KEY (`hyponym_id`) REFERENCES `nouns`(`noun_id`) | ||
| 69 | ); | 13 | ); |
| 70 | 14 | ||
| 71 | DROP TABLE IF EXISTS `instantiation`; | 15 | CREATE INDEX `hyponym_of` ON `hypernymy`(`hypernym_id`); |
| 16 | CREATE INDEX `hypernym_of` ON `hypernymy`(`hyponym_id`); | ||
| 17 | |||
| 72 | CREATE TABLE `instantiation` ( | 18 | CREATE TABLE `instantiation` ( |
| 73 | `class_id` INTEGER NOT NULL, | 19 | `class_id` INTEGER NOT NULL, |
| 74 | `instance_id` INTEGER NOT NULL, | 20 | `instance_id` INTEGER NOT NULL |
| 75 | FOREIGN KEY (`class_id`) REFERENCES `nouns`(`noun_id`), | ||
| 76 | FOREIGN KEY (`instance_id`) REFERENCES `nouns`(`noun_id`) | ||
| 77 | ); | 21 | ); |
| 78 | 22 | ||
| 79 | DROP TABLE IF EXISTS `member_meronymy`; | 23 | CREATE INDEX `instance_of` ON `instantiation`(`class_id`); |
| 24 | CREATE INDEX `class_of` ON `instantiation`(`instance_id`); | ||
| 25 | |||
| 80 | CREATE TABLE `member_meronymy` ( | 26 | CREATE TABLE `member_meronymy` ( |
| 81 | `meronym_id` INTEGER NOT NULL, | 27 | `meronym_id` INTEGER NOT NULL, |
| 82 | `holonym_id` INTEGER NOT NULL, | 28 | `holonym_id` INTEGER NOT NULL |
| 83 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
| 84 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
| 85 | ); | 29 | ); |
| 86 | 30 | ||
| 87 | DROP TABLE IF EXISTS `part_meronymy`; | 31 | CREATE INDEX `member_holonym_of` ON `member_meronymy`(`meronym_id`); |
| 32 | CREATE INDEX `member_meronym_of` ON `member_meronymy`(`holonym_id`); | ||
| 33 | |||
| 88 | CREATE TABLE `part_meronymy` ( | 34 | CREATE TABLE `part_meronymy` ( |
| 89 | `meronym_id` INTEGER NOT NULL, | 35 | `meronym_id` INTEGER NOT NULL, |
| 90 | `holonym_id` INTEGER NOT NULL, | 36 | `holonym_id` INTEGER NOT NULL |
| 91 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
| 92 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
| 93 | ); | 37 | ); |
| 94 | 38 | ||
| 95 | DROP TABLE IF EXISTS `substance_meronymy`; | 39 | CREATE INDEX `part_holonym_of` ON `part_meronymy`(`meronym_id`); |
| 40 | CREATE INDEX `part_meronym_of` ON `part_meronymy`(`holonym_id`); | ||
| 41 | |||
| 96 | CREATE TABLE `substance_meronymy` ( | 42 | CREATE TABLE `substance_meronymy` ( |
| 97 | `meronym_id` INTEGER NOT NULL, | 43 | `meronym_id` INTEGER NOT NULL, |
| 98 | `holonym_id` INTEGER NOT NULL, | 44 | `holonym_id` INTEGER NOT NULL |
| 99 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
| 100 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
| 101 | ); | 45 | ); |
| 102 | 46 | ||
| 103 | DROP TABLE IF EXISTS `variation`; | 47 | CREATE INDEX `substance_holonym_of` ON `substance_meronymy`(`meronym_id`); |
| 48 | CREATE INDEX `substance_meronym_of` ON `substance_meronymy`(`holonym_id`); | ||
| 49 | |||
| 104 | CREATE TABLE `variation` ( | 50 | CREATE TABLE `variation` ( |
| 105 | `noun_id` INTEGER NOT NULL, | 51 | `noun_id` INTEGER NOT NULL, |
| 106 | `adjective_id` INTEGER NOT NULL, | 52 | `adjective_id` INTEGER NOT NULL |
| 107 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
| 108 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
| 109 | ); | 53 | ); |
| 110 | 54 | ||
| 111 | DROP TABLE IF EXISTS `noun_antonymy`; | 55 | CREATE INDEX `variant_of` ON `variation`(`noun_id`); |
| 112 | CREATE TABLE `noun_antonymy` ( | 56 | CREATE INDEX `attribute_of` ON `variation`(`adjective_id`); |
| 113 | `noun_1_id` INTEGER NOT NULL, | ||
| 114 | `noun_2_id` INTEGER NOT NULL, | ||
| 115 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), | ||
| 116 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`) | ||
| 117 | ); | ||
| 118 | 57 | ||
| 119 | DROP TABLE IF EXISTS `adjective_antonymy`; | 58 | CREATE TABLE `similarity` ( |
| 120 | CREATE TABLE `adjective_antonymy` ( | ||
| 121 | `adjective_1_id` INTEGER NOT NULL, | 59 | `adjective_1_id` INTEGER NOT NULL, |
| 122 | `adjective_2_id` INTEGER NOT NULL, | 60 | `adjective_2_id` INTEGER NOT NULL |
| 123 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | 61 | ); |
| 124 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | 62 | |
| 63 | CREATE INDEX `similar_to` ON `similarity`(`adjective_1_id`); | ||
| 64 | |||
| 65 | CREATE TABLE `is_a` ( | ||
| 66 | `notion_id` INTEGER NOT NULL, | ||
| 67 | `groupname` VARCHAR(32) NOT NULL | ||
| 125 | ); | 68 | ); |
| 126 | 69 | ||
| 127 | DROP TABLE IF EXISTS `adverb_antonymy`; | 70 | CREATE TABLE `entailment` ( |
| 128 | CREATE TABLE `adverb_antonymy` ( | 71 | `given_id` INTEGER NOT NULL, |
| 129 | `adverb_1_id` INTEGER NOT NULL, | 72 | `entailment_id` INTEGER NOT NULL |
| 130 | `adverb_2_id` INTEGER NOT NULL, | 73 | ); |
| 131 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | 74 | |
| 132 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | 75 | CREATE INDEX `entailment_of` ON `entailment`(`given_id`); |
| 76 | CREATE INDEX `entailed_by` ON `entailment`(`entailment_id`); | ||
| 77 | |||
| 78 | CREATE TABLE `causality` ( | ||
| 79 | `cause_id` INTEGER NOT NULL, | ||
| 80 | `effect_id` INTEGER NOT NULL | ||
| 81 | ); | ||
| 82 | |||
| 83 | CREATE INDEX `effect_of` ON `causality`(`cause_id`); | ||
| 84 | CREATE INDEX `cause_of` ON `causality`(`effect_id`); | ||
| 85 | |||
| 86 | CREATE TABLE `words` ( | ||
| 87 | `word_id` INTEGER PRIMARY KEY, | ||
| 88 | `notion_id` INTEGER NOT NULL, | ||
| 89 | `lemma_id` INTEGER NOT NULL, | ||
| 90 | `tag_count` INTEGER, | ||
| 91 | `position` SMALLINT, | ||
| 92 | `group_id` INTEGER | ||
| 93 | ); | ||
| 94 | |||
| 95 | CREATE INDEX `notion_words` ON `words`(`notion_id`); | ||
| 96 | CREATE INDEX `lemma_words` ON `words`(`lemma_id`); | ||
| 97 | CREATE INDEX `group_words` ON `words`(`group_id`); | ||
| 98 | |||
| 99 | CREATE TABLE `antonymy` ( | ||
| 100 | `antonym_1_id` INTEGER NOT NULL, | ||
| 101 | `antonym_2_id` INTEGER NOT NULL | ||
| 133 | ); | 102 | ); |
| 134 | 103 | ||
| 135 | DROP TABLE IF EXISTS `specification`; | 104 | CREATE INDEX `antonym_of` ON `antonymy`(`antonym_1_id`); |
| 105 | |||
| 136 | CREATE TABLE `specification` ( | 106 | CREATE TABLE `specification` ( |
| 137 | `general_id` INTEGER NOT NULL, | 107 | `general_id` INTEGER NOT NULL, |
| 138 | `specific_id` INTEGER NOT NULL, | 108 | `specific_id` INTEGER NOT NULL |
| 139 | FOREIGN KEY (`general_id`) REFERENCES `adjectives`(`adjective_id`), | ||
| 140 | FOREIGN KEY (`specific_id`) REFERENCES `adjectives`(`adjective_id`) | ||
| 141 | ); | 109 | ); |
| 142 | 110 | ||
| 143 | DROP TABLE IF EXISTS `pertainymy`; | 111 | CREATE INDEX `specification_of` ON `specification`(`general_id`); |
| 112 | CREATE INDEX `generalization_of` ON `specification`(`specific_id`); | ||
| 113 | |||
| 144 | CREATE TABLE `pertainymy` ( | 114 | CREATE TABLE `pertainymy` ( |
| 145 | `noun_id` INTEGER NOT NULL, | 115 | `noun_id` INTEGER NOT NULL, |
| 146 | `pertainym_id` INTEGER NOT NULL, | 116 | `pertainym_id` INTEGER NOT NULL |
| 147 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
| 148 | FOREIGN KEY (`pertainym_id`) REFERENCES `adjectives`(`adjective_id`) | ||
| 149 | ); | 117 | ); |
| 150 | 118 | ||
| 151 | DROP TABLE IF EXISTS `mannernymy`; | 119 | CREATE INDEX `pertainym_of` ON `pertainymy`(`noun_id`); |
| 120 | CREATE INDEX `anti_pertainym_of` ON `pertainymy`(`pertainym_id`); | ||
| 121 | |||
| 152 | CREATE TABLE `mannernymy` ( | 122 | CREATE TABLE `mannernymy` ( |
| 153 | `adjective_id` INTEGER NOT NULL, | 123 | `adjective_id` INTEGER NOT NULL, |
| 154 | `mannernym_id` INTEGER NOT NULL, | 124 | `mannernym_id` INTEGER NOT NULL |
| 155 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`), | ||
| 156 | FOREIGN KEY (`mannernym_id`) REFERENCES `adverbs`(`adverb_id`) | ||
| 157 | ); | 125 | ); |
| 158 | 126 | ||
| 159 | DROP TABLE IF EXISTS `noun_synonymy`; | 127 | CREATE INDEX `mannernym_of` ON `mannernymy`(`adjective_id`); |
| 160 | CREATE TABLE `noun_synonymy` ( | 128 | CREATE INDEX `anti_mannernym_of` ON `mannernymy`(`mannernym_id`); |
| 161 | `noun_1_id` INTEGER NOT NULL, | ||
| 162 | `noun_2_id` INTEGER NOT NULL, | ||
| 163 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`nouns_id`), | ||
| 164 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`nouns_id`) | ||
| 165 | ); | ||
| 166 | 129 | ||
| 167 | DROP TABLE IF EXISTS `adjective_synonymy`; | 130 | CREATE TABLE `usage` ( |
| 168 | CREATE TABLE `adjective_synonymy` ( | 131 | `domain_id` INTEGER NOT NULL, |
| 169 | `adjective_1_id` INTEGER NOT NULL, | 132 | `term_id` INTEGER NOT NULL |
| 170 | `adjective_2_id` INTEGER NOT NULL, | ||
| 171 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | ||
| 172 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | ||
| 173 | ); | 133 | ); |
| 174 | 134 | ||
| 175 | DROP TABLE IF EXISTS `adverb_synonymy`; | 135 | CREATE INDEX `usage_term_of` ON `usage`(`domain_id`); |
| 176 | CREATE TABLE `adverb_synonymy` ( | 136 | CREATE INDEX `usage_domain_of` ON `usage`(`term_id`); |
| 177 | `adverb_1_id` INTEGER NOT NULL, | ||
| 178 | `adverb_2_id` INTEGER NOT NULL, | ||
| 179 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | ||
| 180 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | ||
| 181 | ); | ||
| 182 | 137 | ||
| 183 | DROP TABLE IF EXISTS `noun_pronunciations`; | 138 | CREATE TABLE `topicality` ( |
| 184 | CREATE TABLE `noun_pronunciations` ( | 139 | `domain_id` INTEGER NOT NULL, |
| 185 | `noun_id` INTEGER NOT NULL, | 140 | `term_id` INTEGER NOT NULL |
| 186 | `pronunciation` VARCHAR(64) NOT NULL, | ||
| 187 | `prerhyme` VARCHAR(8), | ||
| 188 | `rhyme` VARCHAR(64), | ||
| 189 | `syllables` INT NOT NULL, | ||
| 190 | `stress` VARCHAR(64) NOT NULL, | ||
| 191 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`) | ||
| 192 | ); | 141 | ); |
| 193 | 142 | ||
| 194 | DROP TABLE IF EXISTS `verb_pronunciations`; | 143 | CREATE INDEX `topical_term_of` ON `topicality`(`domain_id`); |
| 195 | CREATE TABLE `verb_pronunciations` ( | 144 | CREATE INDEX `topical_domain_of` ON `topicality`(`term_id`); |
| 196 | `verb_id` INTEGER NOT NULL, | ||
| 197 | `pronunciation` VARCHAR(64) NOT NULL, | ||
| 198 | `prerhyme` VARCHAR(8), | ||
| 199 | `rhyme` VARCHAR(64), | ||
| 200 | `syllables` INT NOT NULL, | ||
| 201 | `stress` VARCHAR(64) NOT NULL, | ||
| 202 | FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`) | ||
| 203 | ); | ||
| 204 | 145 | ||
| 205 | DROP TABLE IF EXISTS `adjective_pronunciations`; | 146 | CREATE TABLE `regionality` ( |
| 206 | CREATE TABLE `adjective_pronunciations` ( | 147 | `domain_id` INTEGER NOT NULL, |
| 207 | `adjective_id` INTEGER NOT NULL, | 148 | `term_id` INTEGER NOT NULL |
| 208 | `pronunciation` VARCHAR(64) NOT NULL, | ||
| 209 | `prerhyme` VARCHAR(8), | ||
| 210 | `rhyme` VARCHAR(64), | ||
| 211 | `syllables` INT NOT NULL, | ||
| 212 | `stress` VARCHAR(64) NOT NULL, | ||
| 213 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
| 214 | ); | 149 | ); |
| 215 | 150 | ||
| 216 | DROP TABLE IF EXISTS `adverb_pronunciations`; | 151 | CREATE INDEX `regional_term_of` ON `regionality`(`domain_id`); |
| 217 | CREATE TABLE `adverb_pronunciations` ( | 152 | CREATE INDEX `regional_domain_of` ON `regionality`(`term_id`); |
| 218 | `adverb_id` INTEGER NOT NULL, | ||
| 219 | `pronunciation` VARCHAR(64) NOT NULL, | ||
| 220 | `prerhyme` VARCHAR(8), | ||
| 221 | `rhyme` VARCHAR(64), | ||
| 222 | `syllables` INT NOT NULL, | ||
| 223 | `stress` VARCHAR(64) NOT NULL, | ||
| 224 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`) | ||
| 225 | ); | ||
| 226 | 153 | ||
| 227 | DROP TABLE IF EXISTS `noun_noun_derivation`; | 154 | CREATE TABLE `forms` ( |
| 228 | CREATE TABLE `noun_noun_derivation` ( | 155 | `form_id` INTEGER PRIMARY KEY, |
| 229 | `noun_1_id` INTEGER NOT NULL, | 156 | `form` VARCHAR(32) NOT NULL, |
| 230 | `noun_2_id` INTEGER NOT NULL, | 157 | `complexity` SMALLINT NOT NULL, |
| 231 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), | 158 | `proper` SMALLINT NOT NULL |
| 232 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`) | ||
| 233 | ); | 159 | ); |
| 234 | 160 | ||
| 235 | DROP TABLE IF EXISTS `noun_adjective_derivation`; | 161 | CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); |
| 236 | CREATE TABLE `noun_adjective_derivation` ( | ||
| 237 | `noun_id` INTEGER NOT NULL, | ||
| 238 | `adjective_id` INTEGER NOT NULL, | ||
| 239 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
| 240 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
| 241 | ); | ||
| 242 | 162 | ||
| 243 | DROP TABLE IF EXISTS `noun_adverb_derivation`; | 163 | CREATE TABLE `lemmas_forms` ( |
| 244 | CREATE TABLE `noun_adverb_derivation` ( | 164 | `lemma_id` INTEGER NOT NULL, |
| 245 | `noun_id` INTEGER NOT NULL, | 165 | `form_id` INTEGER NOT NULL, |
| 246 | `adverb_id` INTEGER NOT NULL, | 166 | `category` SMALLINT NOT NULL |
| 247 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
| 248 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`) | ||
| 249 | ); | 167 | ); |
| 250 | 168 | ||
| 251 | DROP TABLE IF EXISTS `adjective_adjective_derivation`; | 169 | CREATE INDEX `form_of` ON `lemmas_forms`(`lemma_id`); |
| 252 | CREATE TABLE `adjective_adjective_derivation` ( | 170 | CREATE INDEX `lemma_of` ON `lemmas_forms`(`form_id`); |
| 253 | `adjective_1_id` INTEGER NOT NULL, | 171 | |
| 254 | `adjective_2_id` INTEGER NOT NULL, | 172 | CREATE TABLE `pronunciations` ( |
| 255 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | 173 | `pronunciation_id` INTEGER PRIMARY KEY, |
| 256 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | 174 | `phonemes` VARCHAR(64) NOT NULL, |
| 175 | `prerhyme` VARCHAR(8), | ||
| 176 | `rhyme` VARCHAR(64), | ||
| 177 | `syllables` INTEGER NOT NULL, | ||
| 178 | `stress` VARCHAR(64) NOT NULL | ||
| 257 | ); | 179 | ); |
| 258 | 180 | ||
| 259 | DROP TABLE IF EXISTS `adjective_adverb_derivation`; | 181 | CREATE TABLE `forms_pronunciations` ( |
| 260 | CREATE TABLE `adjective_adverb_derivation` ( | 182 | `form_id` INTEGER NOT NULL, |
| 261 | `adjective_id` INTEGER NOT NULL, | 183 | `pronunciation_id` INTEGER NOT NULL |
| 262 | `adverb_id` INTEGER NOT NULL, | ||
| 263 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`), | ||
| 264 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adjective_id`) | ||
| 265 | ); | 184 | ); |
| 266 | 185 | ||
| 267 | DROP TABLE IF EXISTS `adverb_adverb_derivation`; | 186 | CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`); |
| 268 | CREATE TABLE `adverb_adverb_derivation` ( | 187 | CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`); |
| 269 | `adverb_1_id` INTEGER NOT NULL, | 188 | |
| 270 | `adverb_2_id` INTEGER NOT NULL, | 189 | CREATE TABLE `groups` ( |
| 271 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | 190 | `group_id` INTEGER PRIMARY KEY, |
| 272 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | 191 | `data` BLOB NOT NULL |
| 273 | ); | 192 | ); |
| 274 | 193 | ||
| 275 | DROP TABLE IF EXISTS `prepositions`; | 194 | CREATE TABLE `frames` ( |
| 276 | CREATE TABLE `prepositions` ( | 195 | `frame_id` INTEGER PRIMARY KEY, |
| 277 | `preposition_id` INTEGER PRIMARY KEY, | 196 | `data` BLOB NOT NULL |
| 278 | `form` VARCHAR(32) NOT NULL | ||
| 279 | ); | 197 | ); |
| 280 | 198 | ||
| 281 | DROP TABLE IF EXISTS `preposition_groups`; | 199 | CREATE TABLE `groups_frames` ( |
| 282 | CREATE TABLE `preposition_groups` ( | 200 | `group_id` INTEGER NOT NULL, |
| 283 | `preposition_id` INTEGER NOT NULL, | 201 | `frame_id` INTEGER NOT NULL |
| 284 | `groupname` VARCHAR(32) NOT NULL, | ||
| 285 | FOREIGN KEY (`preposition_id`) REFERENCES `prepositions`(`preposition_id`) | ||
| 286 | ); | 202 | ); |
| 203 | |||
| 204 | CREATE INDEX `frames_in` ON `groups_frames`(`group_id`); | ||
| diff --git a/generator/selrestr.cpp b/generator/selrestr.cpp new file mode 100644 index 0000000..8bdd3f6 --- /dev/null +++ b/generator/selrestr.cpp | |||
| @@ -0,0 +1,288 @@ | |||
| 1 | #include "selrestr.h" | ||
| 2 | |||
| 3 | namespace verbly { | ||
| 4 | namespace generator { | ||
| 5 | |||
// Copy constructor. The payload lives in a union, so the active member
// has to be constructed explicitly with placement new rather than copied
// memberwise.
selrestr::selrestr(const selrestr& other)
{
  type_ = other.type_;

  switch (type_)
  {
    case type::singleton:
    {
      singleton_.pos = other.singleton_.pos;
      new(&singleton_.restriction) std::string(other.singleton_.restriction);

      break;
    }

    case type::group:
    {
      new(&group_.children) std::list<selrestr>(other.group_.children);
      group_.orlogic = other.group_.orlogic;

      break;
    }

    case type::empty:
    {
      // No payload to copy.
      break;
    }
  }
}
| 34 | |||
// Move constructor: delegate to the empty state, then steal the other
// object's payload via the non-member swap.
selrestr::selrestr(selrestr&& other) : selrestr()
{
  swap(*this, other);
}
| 39 | |||
// Unified copy/move assignment (copy-and-swap): the parameter is taken by
// value, so the copy or move has already happened by the time we swap.
selrestr& selrestr::operator=(selrestr other)
{
  swap(*this, other);

  return *this;
}
| 46 | |||
// Non-member swap, used by the move constructor and the copy-and-swap
// assignment operator. Because the payloads live in a union, the active
// members cannot be std::swap'd directly: first's payload is moved into
// temporaries, then each object is destroyed and reconstructed in place
// with the other's payload via placement new.
void swap(selrestr& first, selrestr& second)
{
  using type = selrestr::type;

  // Move first's payload out into temporaries. tempPos/tempRestriction
  // are only set (and later read) when tempType is singleton;
  // tempChildren/tempOrlogic only when it is group.
  type tempType = first.type_;
  int tempPos;
  std::string tempRestriction;
  std::list<selrestr> tempChildren;
  bool tempOrlogic;

  switch (tempType)
  {
    case type::singleton:
    {
      tempPos = first.singleton_.pos;
      tempRestriction = std::move(first.singleton_.restriction);

      break;
    }

    case type::group:
    {
      tempChildren = std::move(first.group_.children);
      tempOrlogic = first.group_.orlogic;

      break;
    }

    case type::empty:
    {
      break;
    }
  }

  // Tear down first's active payload, then rebuild it from second's.
  first.~selrestr();

  first.type_ = second.type_;

  switch (first.type_)
  {
    case type::singleton:
    {
      first.singleton_.pos = second.singleton_.pos;
      new(&first.singleton_.restriction) std::string(std::move(second.singleton_.restriction));

      break;
    }

    case type::group:
    {
      new(&first.group_.children) std::list<selrestr>(std::move(second.group_.children));
      first.group_.orlogic = second.group_.orlogic;

      break;
    }

    case type::empty:
    {
      break;
    }
  }

  // Tear down second's active payload, then rebuild it from the saved
  // temporaries.
  second.~selrestr();

  second.type_ = tempType;

  switch (second.type_)
  {
    case type::singleton:
    {
      second.singleton_.pos = tempPos;
      new(&second.singleton_.restriction) std::string(std::move(tempRestriction));

      break;
    }

    case type::group:
    {
      new(&second.group_.children) std::list<selrestr>(std::move(tempChildren));
      second.group_.orlogic = tempOrlogic;

      break;
    }

    case type::empty:
    {
      break;
    }
  }
}
| 137 | |||
// Destructor: explicitly destroys whichever union member is active. The
// local type aliases keep the explicit destructor-call syntax simple for
// the std::string and std::list<selrestr> members.
selrestr::~selrestr()
{
  switch (type_)
  {
    case type::singleton:
    {
      using string_type = std::string;
      singleton_.restriction.~string_type();

      break;
    }

    case type::group:
    {
      using list_type = std::list<selrestr>;
      group_.children.~list_type();

      break;
    }

    case type::empty:
    {
      // No payload to destroy.
      break;
    }
  }
}
| 164 | |||
// Default constructor: an empty selrestr; no union member is active.
selrestr::selrestr() : type_(type::empty)
{
}
| 168 | |||
// Singleton constructor: a single named restriction plus its
// positive/negative flag. The string member is placement-new'd because it
// lives in the union.
selrestr::selrestr(
  std::string restriction,
  bool pos) :
  type_(type::singleton)
{
  new(&singleton_.restriction) std::string(std::move(restriction));
  singleton_.pos = pos;
}
| 177 | |||
| 178 | std::string selrestr::getRestriction() const | ||
| 179 | { | ||
| 180 | if (type_ == type::singleton) | ||
| 181 | { | ||
| 182 | return singleton_.restriction; | ||
| 183 | } else { | ||
| 184 | throw std::domain_error("Only singleton selrestrs have restrictions"); | ||
| 185 | } | ||
| 186 | } | ||
| 187 | |||
| 188 | bool selrestr::getPos() const | ||
| 189 | { | ||
| 190 | if (type_ == type::singleton) | ||
| 191 | { | ||
| 192 | return singleton_.pos; | ||
| 193 | } else { | ||
| 194 | throw std::domain_error("Only singleton selrestrs have positivity flags"); | ||
| 195 | } | ||
| 196 | } | ||
| 197 | |||
// Group constructor: a set of child selrestrs combined with "or" logic
// when orlogic is true, "and" logic otherwise. The list member is
// placement-new'd because it lives in the union.
selrestr::selrestr(
  std::list<selrestr> children,
  bool orlogic) :
  type_(type::group)
{
  new(&group_.children) std::list<selrestr>(std::move(children));
  group_.orlogic = orlogic;
}
| 206 | |||
| 207 | std::list<selrestr> selrestr::getChildren() const | ||
| 208 | { | ||
| 209 | if (type_ == type::group) | ||
| 210 | { | ||
| 211 | return group_.children; | ||
| 212 | } else { | ||
| 213 | throw std::domain_error("Only group selrestrs have children"); | ||
| 214 | } | ||
| 215 | } | ||
| 216 | |||
| 217 | std::list<selrestr>::const_iterator selrestr::begin() const | ||
| 218 | { | ||
| 219 | if (type_ == type::group) | ||
| 220 | { | ||
| 221 | return std::begin(group_.children); | ||
| 222 | } else { | ||
| 223 | throw std::domain_error("Only group selrestrs have children"); | ||
| 224 | } | ||
| 225 | } | ||
| 226 | |||
| 227 | std::list<selrestr>::const_iterator selrestr::end() const | ||
| 228 | { | ||
| 229 | if (type_ == type::group) | ||
| 230 | { | ||
| 231 | return std::end(group_.children); | ||
| 232 | } else { | ||
| 233 | throw std::domain_error("Only group selrestrs have children"); | ||
| 234 | } | ||
| 235 | } | ||
| 236 | |||
| 237 | bool selrestr::getOrlogic() const | ||
| 238 | { | ||
| 239 | if (type_ == type::group) | ||
| 240 | { | ||
| 241 | return group_.orlogic; | ||
| 242 | } else { | ||
| 243 | throw std::domain_error("Only group selrestrs have logic"); | ||
| 244 | } | ||
| 245 | } | ||
| 246 | |||
| 247 | nlohmann::json selrestr::toJson() const | ||
| 248 | { | ||
| 249 | switch (type_) | ||
| 250 | { | ||
| 251 | case type::empty: | ||
| 252 | { | ||
| 253 | return {}; | ||
| 254 | } | ||
| 255 | |||
| 256 | case type::singleton: | ||
| 257 | { | ||
| 258 | return { | ||
| 259 | {"type", singleton_.restriction}, | ||
| 260 | {"pos", singleton_.pos} | ||
| 261 | }; | ||
| 262 | } | ||
| 263 | |||
| 264 | case type::group: | ||
| 265 | { | ||
| 266 | std::string logic; | ||
| 267 | if (group_.orlogic) | ||
| 268 | { | ||
| 269 | logic = "or"; | ||
| 270 | } else { | ||
| 271 | logic = "and"; | ||
| 272 | } | ||
| 273 | |||
| 274 | std::list<nlohmann::json> children; | ||
| 275 | std::transform(std::begin(group_.children), std::end(group_.children), std::back_inserter(children), [] (const selrestr& child) { | ||
| 276 | return child.toJson(); | ||
| 277 | }); | ||
| 278 | |||
| 279 | return { | ||
| 280 | {"logic", logic}, | ||
| 281 | {"children", children} | ||
| 282 | }; | ||
| 283 | } | ||
| 284 | } | ||
| 285 | } | ||
| 286 | |||
| 287 | }; | ||
| 288 | }; | ||
| diff --git a/generator/selrestr.h b/generator/selrestr.h new file mode 100644 index 0000000..5000970 --- /dev/null +++ b/generator/selrestr.h | |||
| @@ -0,0 +1,88 @@ | |||
| 1 | #ifndef SELRESTR_H_50652FB7 | ||
| 2 | #define SELRESTR_H_50652FB7 | ||
| 3 | |||
| 4 | #include <list> | ||
| 5 | #include <string> | ||
| 6 | #include <json.hpp> | ||
| 7 | |||
| 8 | namespace verbly { | ||
| 9 | namespace generator { | ||
| 10 | |||
// A selectional restriction: either empty, a single named restriction
// with a positive/negative flag, or a group of child selrestrs combined
// with and/or logic. Implemented as a tagged union; all special member
// functions are defined out of line in selrestr.cpp (rule of five).
class selrestr {
public:
  // Discriminator for the union payload.
  enum class type {
    empty,
    singleton,
    group
  };

  // Copy and move constructors

  selrestr(const selrestr& other);
  selrestr(selrestr&& other);

  // Assignment

  // Unified copy/move assignment via copy-and-swap.
  selrestr& operator=(selrestr other);

  // Swap

  friend void swap(selrestr& first, selrestr& second);

  // Destructor

  ~selrestr();

  // Generic accessors

  // Which union member is currently active.
  type getType() const
  {
    return type_;
  }

  // Empty

  selrestr();

  // Singleton

  // A single named restriction; pos is its positive/negative flag.
  selrestr(std::string restriction, bool pos);

  // Throws std::domain_error unless getType() == type::singleton.
  std::string getRestriction() const;

  // Throws std::domain_error unless getType() == type::singleton.
  bool getPos() const;

  // Group

  // A group of children; orlogic selects "or" (true) vs "and" (false).
  selrestr(std::list<selrestr> children, bool orlogic);

  // These four throw std::domain_error unless getType() == type::group.
  std::list<selrestr> getChildren() const;

  std::list<selrestr>::const_iterator begin() const;

  std::list<selrestr>::const_iterator end() const;

  bool getOrlogic() const;

  // Helpers

  // Serializes to JSON; see selrestr.cpp for the exact layout.
  nlohmann::json toJson() const;

private:
  // Tagged union: type_ records the active member. The non-trivial
  // members are constructed/destroyed manually with placement new and
  // explicit destructor calls (see selrestr.cpp).
  union {
    struct {
      bool pos;
      std::string restriction;
    } singleton_;
    struct {
      std::list<selrestr> children;
      bool orlogic;
    } group_;
  };
  type type_;
};
| 84 | |||
| 85 | }; | ||
| 86 | }; | ||
| 87 | |||
| 88 | #endif /* end of include guard: SELRESTR_H_50652FB7 */ | ||
| diff --git a/generator/word.cpp b/generator/word.cpp new file mode 100644 index 0000000..8ba3ce2 --- /dev/null +++ b/generator/word.cpp | |||
| @@ -0,0 +1,77 @@ | |||
| 1 | #include "word.h" | ||
| 2 | #include <list> | ||
| 3 | #include <string> | ||
| 4 | #include "database.h" | ||
| 5 | #include "notion.h" | ||
| 6 | #include "lemma.h" | ||
| 7 | #include "field.h" | ||
| 8 | #include "group.h" | ||
| 9 | |||
| 10 | namespace verbly { | ||
| 11 | namespace generator { | ||
| 12 | |||
    // Next unique word id; each constructor consumes and increments it.
    int word::nextId_ = 0;
| 14 | |||
    // Constructs a word with no tag count; tagCount_/hasTagCount_ keep their
    // in-class defaults (0/false). The notion and lemma are held by
    // reference, so both must outlive this word.
    word::word(
      notion& n,
      lemma& l) :
      id_(nextId_++),
      notion_(n),
      lemma_(l)
    {
    }
| 23 | |||
    // Constructs a word that carries a tag count (corpus frequency data);
    // hasTagCount_ is latched true so serialization emits the column. The
    // notion and lemma are held by reference and must outlive this word.
    word::word(
      notion& n,
      lemma& l,
      int tagCount) :
      id_(nextId_++),
      notion_(n),
      lemma_(l),
      tagCount_(tagCount),
      hasTagCount_(true)
    {
    }
| 35 | |||
| 36 | void word::setAdjectivePosition(positioning adjectivePosition) | ||
| 37 | { | ||
| 38 | adjectivePosition_ = adjectivePosition; | ||
| 39 | } | ||
| 40 | |||
| 41 | void word::setVerbGroup(const group& verbGroup) | ||
| 42 | { | ||
| 43 | verbGroup_ = &verbGroup; | ||
| 44 | } | ||
| 45 | |||
| 46 | database& operator<<(database& db, const word& arg) | ||
| 47 | { | ||
| 48 | std::list<field> fields; | ||
| 49 | |||
| 50 | fields.emplace_back("word_id", arg.getId()); | ||
| 51 | fields.emplace_back("notion_id", arg.getNotion().getId()); | ||
| 52 | fields.emplace_back("lemma_id", arg.getLemma().getId()); | ||
| 53 | |||
| 54 | if (arg.hasTagCount()) | ||
| 55 | { | ||
| 56 | fields.emplace_back("tag_count", arg.getTagCount()); | ||
| 57 | } | ||
| 58 | |||
| 59 | if ((arg.getNotion().getPartOfSpeech() == part_of_speech::adjective) | ||
| 60 | && (arg.getAdjectivePosition() != positioning::undefined)) | ||
| 61 | { | ||
| 62 | fields.emplace_back("position", static_cast<int>(arg.getAdjectivePosition())); | ||
| 63 | } | ||
| 64 | |||
| 65 | if ((arg.getNotion().getPartOfSpeech() == part_of_speech::verb) | ||
| 66 | && (arg.hasVerbGroup())) | ||
| 67 | { | ||
| 68 | fields.emplace_back("group_id", arg.getVerbGroup().getId()); | ||
| 69 | } | ||
| 70 | |||
| 71 | db.insertIntoTable("words", std::move(fields)); | ||
| 72 | |||
| 73 | return db; | ||
| 74 | } | ||
| 75 | |||
| 76 | }; | ||
| 77 | }; | ||
| diff --git a/generator/word.h b/generator/word.h new file mode 100644 index 0000000..bfed586 --- /dev/null +++ b/generator/word.h | |||
| @@ -0,0 +1,110 @@ | |||
| 1 | #ifndef WORD_H_91F99D46 | ||
| 2 | #define WORD_H_91F99D46 | ||
| 3 | |||
| 4 | #include <cassert> | ||
| 5 | #include "enums.h" | ||
| 6 | |||
| 7 | namespace verbly { | ||
| 8 | namespace generator { | ||
| 9 | |||
| 10 | class notion; | ||
| 11 | class lemma; | ||
| 12 | class database; | ||
| 13 | class group; | ||
| 14 | |||
    // A word: the pairing of a notion (meaning) with a lemma (base form),
    // destined for one row of the "words" table. Holds its notion and lemma
    // by reference, so both must outlive the word. Ids are assigned from a
    // process-wide counter at construction.
    class word {
    public:

      // Constructors

      // Without corpus tag count data.
      word(notion& n, lemma& l);

      // With corpus tag count data (hasTagCount() becomes true).
      word(notion& n, lemma& l, int tagCount);

      // Mutators

      // Only meaningful for adjectives; serialized as "position".
      void setAdjectivePosition(positioning adjectivePosition);

      // Only meaningful for verbs; stores a non-owning pointer, so the
      // group must outlive this word. Serialized as "group_id".
      void setVerbGroup(const group& verbGroup);

      // Accessors

      int getId() const
      {
        return id_;
      }

      notion& getNotion()
      {
        return notion_;
      }

      const notion& getNotion() const
      {
        return notion_;
      }

      lemma& getLemma()
      {
        return lemma_;
      }

      const lemma& getLemma() const
      {
        return lemma_;
      }

      bool hasTagCount() const
      {
        return hasTagCount_;
      }

      int getTagCount() const
      {
        // Calling code should always call hasTagCount first.
        assert(hasTagCount_);

        return tagCount_;
      }

      // Returns positioning::undefined unless setAdjectivePosition was called.
      positioning getAdjectivePosition() const
      {
        return adjectivePosition_;
      }

      bool hasVerbGroup() const
      {
        return (verbGroup_ != nullptr);
      }

      const group& getVerbGroup() const
      {
        // Calling code should always call hasVerbGroup first.
        assert(verbGroup_ != nullptr);

        return *verbGroup_;
      }

    private:

      // Counter backing id_ assignment; defined in word.cpp.
      static int nextId_;

      const int id_;
      notion& notion_;
      lemma& lemma_;
      // Defaults apply when the two-argument constructor is used.
      const int tagCount_ = 0;
      const bool hasTagCount_ = false;

      positioning adjectivePosition_ = positioning::undefined;
      const group* verbGroup_ = nullptr;  // non-owning; see setVerbGroup

    };
| 102 | |||
| 103 | // Serializer | ||
| 104 | |||
| 105 | database& operator<<(database& db, const word& arg); | ||
| 106 | |||
| 107 | }; | ||
| 108 | }; | ||
| 109 | |||
| 110 | #endif /* end of include guard: WORD_H_91F99D46 */ | ||
