diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2017-01-16 18:02:50 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2017-01-16 18:02:50 -0500 |
commit | 6746da6edd7d9d50efe374eabbb79a3cac882d81 (patch) | |
tree | ff20917e08b08d36b9541c1371106596e7bec442 /generator | |
parent | 4af7e55733098ca42f75a4ffaca1b0f6bab4dd36 (diff) | |
download | verbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.tar.gz verbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.tar.bz2 verbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.zip |
Started structural rewrite
The new object structure was designed to build on the existing WordNet structure, while also adding in all of the data that we get from other sources. More information about this can be found on the project wiki. The generator has already been completely rewritten to generate a datafile that uses the new structure. In addition, a number of indexes are created, which does double the size of the datafile, but also allows for much faster lookups. Finally, the new generator is written modularly and is a lot more readable than the old one. The verbly interface to the new object structure has mostly been completed, but has not been tested fully. There is a completely new search API which utilizes a lot of operator overloading; documentation on how to use it should go up at some point. Token processing and verb frames are currently unimplemented. Source for these has been left in the repository for now.
Diffstat (limited to 'generator')
-rw-r--r-- | generator/CMakeLists.txt | 6 | ||||
-rw-r--r-- | generator/database.cpp | 173 | ||||
-rw-r--r-- | generator/database.h | 73 | ||||
-rw-r--r-- | generator/field.cpp | 193 | ||||
-rw-r--r-- | generator/field.h | 76 | ||||
-rw-r--r-- | generator/form.cpp | 53 | ||||
-rw-r--r-- | generator/form.h | 71 | ||||
-rw-r--r-- | generator/frame.cpp | 83 | ||||
-rw-r--r-- | generator/frame.h | 59 | ||||
-rw-r--r-- | generator/generator.cpp | 3145 | ||||
-rw-r--r-- | generator/generator.h | 151 | ||||
-rw-r--r-- | generator/group.cpp | 119 | ||||
-rw-r--r-- | generator/group.h | 80 | ||||
-rw-r--r-- | generator/lemma.cpp | 65 | ||||
-rw-r--r-- | generator/lemma.h | 58 | ||||
-rw-r--r-- | generator/main.cpp | 40 | ||||
-rw-r--r-- | generator/notion.cpp | 85 | ||||
-rw-r--r-- | generator/notion.h | 91 | ||||
-rw-r--r-- | generator/part.cpp | 336 | ||||
-rw-r--r-- | generator/part.h | 114 | ||||
-rw-r--r-- | generator/progress.h | 78 | ||||
-rw-r--r-- | generator/pronunciation.cpp | 87 | ||||
-rw-r--r-- | generator/pronunciation.h | 82 | ||||
-rw-r--r-- | generator/role.h | 35 | ||||
-rw-r--r-- | generator/schema.sql | 352 | ||||
-rw-r--r-- | generator/selrestr.cpp | 288 | ||||
-rw-r--r-- | generator/selrestr.h | 88 | ||||
-rw-r--r-- | generator/word.cpp | 77 | ||||
-rw-r--r-- | generator/word.h | 110 |
29 files changed, 4018 insertions, 2250 deletions
diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt index 552526d..4f78eb8 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | cmake_minimum_required (VERSION 2.6) | 1 | cmake_minimum_required (VERSION 3.1) |
2 | project (generator) | 2 | project (generator) |
3 | 3 | ||
4 | find_package(PkgConfig) | 4 | find_package(PkgConfig) |
5 | pkg_check_modules(sqlite3 sqlite3 REQUIRED) | 5 | pkg_check_modules(sqlite3 sqlite3 REQUIRED) |
6 | find_package(libxml2 REQUIRED) | 6 | find_package(libxml2 REQUIRED) |
7 | 7 | ||
8 | include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json/src) | 8 | include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json) |
9 | add_executable(generator generator.cpp) | 9 | add_executable(generator notion.cpp word.cpp lemma.cpp form.cpp pronunciation.cpp group.cpp frame.cpp part.cpp selrestr.cpp database.cpp field.cpp generator.cpp main.cpp) |
10 | set_property(TARGET generator PROPERTY CXX_STANDARD 11) | 10 | set_property(TARGET generator PROPERTY CXX_STANDARD 11) |
11 | set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) | 11 | set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) |
12 | target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) | 12 | target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) |
diff --git a/generator/database.cpp b/generator/database.cpp new file mode 100644 index 0000000..c7e4cfa --- /dev/null +++ b/generator/database.cpp | |||
@@ -0,0 +1,173 @@ | |||
1 | #include "database.h" | ||
2 | #include <sqlite3.h> | ||
3 | #include <cassert> | ||
4 | #include <fstream> | ||
5 | #include <stdexcept> | ||
6 | #include <cstdio> | ||
7 | #include <sstream> | ||
8 | #include "field.h" | ||
9 | #include "../lib/util.h" | ||
10 | |||
11 | namespace verbly { | ||
12 | namespace generator { | ||
13 | |||
14 | sqlite3_error::sqlite3_error( | ||
15 | const std::string& what, | ||
16 | const std::string& db_err) : | ||
17 | what_(what + " (" + db_err + ")"), | ||
18 | db_err_(db_err) | ||
19 | { | ||
20 | } | ||
21 | |||
22 | const char* sqlite3_error::what() const noexcept | ||
23 | { | ||
24 | return what_.c_str(); | ||
25 | } | ||
26 | |||
27 | const char* sqlite3_error::db_err() const noexcept | ||
28 | { | ||
29 | return db_err_.c_str(); | ||
30 | } | ||
31 | |||
32 | database::database(std::string path) | ||
33 | { | ||
34 | // If there is already a file at this path, overwrite it. | ||
35 | if (std::ifstream(path)) | ||
36 | { | ||
37 | if (std::remove(path.c_str())) | ||
38 | { | ||
39 | throw std::logic_error("Could not overwrite file at path"); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | if (sqlite3_open_v2(path.c_str(), &ppdb_, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | ||
44 | { | ||
45 | // We still have to free the resources allocated. In the event that | ||
46 | // allocation failed, ppdb will be null and sqlite3_close_v2 will just | ||
47 | // ignore it. | ||
48 | std::string errmsg(sqlite3_errmsg(ppdb_)); | ||
49 | sqlite3_close_v2(ppdb_); | ||
50 | |||
51 | throw sqlite3_error("Could not create output datafile", errmsg); | ||
52 | } | ||
53 | } | ||
54 | |||
55 | database::database(database&& other) : database() | ||
56 | { | ||
57 | swap(*this, other); | ||
58 | } | ||
59 | |||
60 | database& database::operator=(database&& other) | ||
61 | { | ||
62 | swap(*this, other); | ||
63 | |||
64 | return *this; | ||
65 | } | ||
66 | |||
67 | void swap(database& first, database& second) | ||
68 | { | ||
69 | std::swap(first.ppdb_, second.ppdb_); | ||
70 | } | ||
71 | |||
72 | database::~database() | ||
73 | { | ||
74 | sqlite3_close_v2(ppdb_); | ||
75 | } | ||
76 | |||
77 | void database::runQuery(std::string query) | ||
78 | { | ||
79 | // This can only happen when doing bad things with move semantics. | ||
80 | assert(ppdb_ != nullptr); | ||
81 | |||
82 | sqlite3_stmt* ppstmt; | ||
83 | |||
84 | if (sqlite3_prepare_v2(ppdb_, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
85 | { | ||
86 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
87 | } | ||
88 | |||
89 | int result = sqlite3_step(ppstmt); | ||
90 | sqlite3_finalize(ppstmt); | ||
91 | |||
92 | if (result != SQLITE_DONE) | ||
93 | { | ||
94 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
95 | } | ||
96 | } | ||
97 | |||
98 | void database::insertIntoTable(std::string table, std::list<field> fields) | ||
99 | { | ||
100 | // This can only happen when doing bad things with move semantics. | ||
101 | assert(ppdb_ != nullptr); | ||
102 | |||
103 | // This shouldn't happen. | ||
104 | assert(!fields.empty()); | ||
105 | |||
106 | std::list<std::string> fieldNames; | ||
107 | std::list<std::string> qs; | ||
108 | for (field& f : fields) | ||
109 | { | ||
110 | fieldNames.push_back(f.getName()); | ||
111 | qs.push_back("?"); | ||
112 | } | ||
113 | |||
114 | std::ostringstream query; | ||
115 | query << "INSERT INTO "; | ||
116 | query << table; | ||
117 | query << " ("; | ||
118 | query << implode(std::begin(fieldNames), std::end(fieldNames), ", "); | ||
119 | query << ") VALUES ("; | ||
120 | query << implode(std::begin(qs), std::end(qs), ", "); | ||
121 | query << ")"; | ||
122 | |||
123 | std::string query_str = query.str(); | ||
124 | |||
125 | sqlite3_stmt* ppstmt; | ||
126 | |||
127 | if (sqlite3_prepare_v2(ppdb_, query_str.c_str(), query_str.length(), &ppstmt, NULL) != SQLITE_OK) | ||
128 | { | ||
129 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
130 | } | ||
131 | |||
132 | int i = 1; | ||
133 | for (field& f : fields) | ||
134 | { | ||
135 | switch (f.getType()) | ||
136 | { | ||
137 | case field::type::integer: | ||
138 | { | ||
139 | sqlite3_bind_int(ppstmt, i, f.getInteger()); | ||
140 | |||
141 | break; | ||
142 | } | ||
143 | |||
144 | case field::type::string: | ||
145 | { | ||
146 | sqlite3_bind_text(ppstmt, i, f.getString().c_str(), f.getString().length(), SQLITE_TRANSIENT); | ||
147 | |||
148 | break; | ||
149 | } | ||
150 | |||
151 | case field::type::invalid: | ||
152 | { | ||
153 | // Fields can only be invalid when doing bad things with move semantics. | ||
154 | assert(false); | ||
155 | |||
156 | break; | ||
157 | } | ||
158 | } | ||
159 | |||
160 | i++; | ||
161 | } | ||
162 | |||
163 | int result = sqlite3_step(ppstmt); | ||
164 | sqlite3_finalize(ppstmt); | ||
165 | |||
166 | if (result != SQLITE_DONE) | ||
167 | { | ||
168 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
169 | } | ||
170 | } | ||
171 | |||
172 | }; | ||
173 | }; | ||
diff --git a/generator/database.h b/generator/database.h new file mode 100644 index 0000000..15cdff5 --- /dev/null +++ b/generator/database.h | |||
@@ -0,0 +1,73 @@ | |||
1 | #ifndef DATABASE_H_0B0A47D2 | ||
2 | #define DATABASE_H_0B0A47D2 | ||
3 | |||
4 | #include <string> | ||
5 | #include <exception> | ||
6 | #include <list> | ||
7 | |||
8 | struct sqlite3; | ||
9 | |||
10 | namespace verbly { | ||
11 | namespace generator { | ||
12 | |||
13 | class field; | ||
14 | |||
15 | class sqlite3_error : public std::exception { | ||
16 | public: | ||
17 | |||
18 | sqlite3_error(const std::string& what, const std::string& db_err); | ||
19 | |||
20 | const char* what() const noexcept override; | ||
21 | const char* db_err() const noexcept; | ||
22 | |||
23 | private: | ||
24 | std::string what_; | ||
25 | std::string db_err_; | ||
26 | |||
27 | }; | ||
28 | |||
29 | class database { | ||
30 | public: | ||
31 | |||
32 | // Constructor | ||
33 | |||
34 | explicit database(std::string path); | ||
35 | |||
36 | // Disable copying | ||
37 | |||
38 | database(const database& other) = delete; | ||
39 | database& operator=(const database& other) = delete; | ||
40 | |||
41 | // Move constructor and move assignment | ||
42 | |||
43 | database(database&& other); | ||
44 | database& operator=(database&& other); | ||
45 | |||
46 | // Swap | ||
47 | |||
48 | friend void swap(database& first, database& second); | ||
49 | |||
50 | // Destructor | ||
51 | |||
52 | ~database(); | ||
53 | |||
54 | // Actions | ||
55 | |||
56 | void runQuery(std::string query); | ||
57 | |||
58 | void insertIntoTable(std::string table, std::list<field> fields); | ||
59 | |||
60 | private: | ||
61 | |||
62 | database() | ||
63 | { | ||
64 | } | ||
65 | |||
66 | sqlite3* ppdb_ = nullptr; | ||
67 | |||
68 | }; | ||
69 | |||
70 | }; | ||
71 | }; | ||
72 | |||
73 | #endif /* end of include guard: DATABASE_H_0B0A47D2 */ | ||
diff --git a/generator/field.cpp b/generator/field.cpp new file mode 100644 index 0000000..84b2f91 --- /dev/null +++ b/generator/field.cpp | |||
@@ -0,0 +1,193 @@ | |||
1 | #include "field.h" | ||
2 | #include <stdexcept> | ||
3 | #include <utility> | ||
4 | |||
5 | namespace verbly { | ||
6 | namespace generator { | ||
7 | |||
8 | field::field(const field& other) | ||
9 | { | ||
10 | type_ = other.type_; | ||
11 | name_ = other.name_; | ||
12 | |||
13 | switch (type_) | ||
14 | { | ||
15 | case type::integer: | ||
16 | { | ||
17 | integer_ = other.integer_; | ||
18 | |||
19 | break; | ||
20 | } | ||
21 | |||
22 | case type::string: | ||
23 | { | ||
24 | new(&string_) std::string(other.string_); | ||
25 | |||
26 | break; | ||
27 | } | ||
28 | |||
29 | case type::invalid: | ||
30 | { | ||
31 | break; | ||
32 | } | ||
33 | } | ||
34 | } | ||
35 | |||
36 | field::field(field&& other) : field() | ||
37 | { | ||
38 | swap(*this, other); | ||
39 | } | ||
40 | |||
41 | field& field::operator=(field other) | ||
42 | { | ||
43 | swap(*this, other); | ||
44 | |||
45 | return *this; | ||
46 | } | ||
47 | |||
48 | void swap(field& first, field& second) | ||
49 | { | ||
50 | using type = field::type; | ||
51 | |||
52 | type tempType = first.type_; | ||
53 | std::string tempName = std::move(first.name_); | ||
54 | int tempInteger; | ||
55 | std::string tempString; | ||
56 | |||
57 | switch (first.type_) | ||
58 | { | ||
59 | case type::integer: | ||
60 | { | ||
61 | tempInteger = first.integer_; | ||
62 | |||
63 | break; | ||
64 | } | ||
65 | |||
66 | case type::string: | ||
67 | { | ||
68 | tempString = std::move(tempString); | ||
69 | |||
70 | break; | ||
71 | } | ||
72 | |||
73 | case type::invalid: | ||
74 | { | ||
75 | break; | ||
76 | } | ||
77 | } | ||
78 | |||
79 | first.~field(); | ||
80 | |||
81 | first.type_ = second.type_; | ||
82 | first.name_ = std::move(second.name_); | ||
83 | |||
84 | switch (second.type_) | ||
85 | { | ||
86 | case type::integer: | ||
87 | { | ||
88 | first.integer_ = second.integer_; | ||
89 | |||
90 | break; | ||
91 | } | ||
92 | |||
93 | case type::string: | ||
94 | { | ||
95 | new(&first.string_) std::string(std::move(second.string_)); | ||
96 | |||
97 | break; | ||
98 | } | ||
99 | |||
100 | case type::invalid: | ||
101 | { | ||
102 | break; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | second.~field(); | ||
107 | |||
108 | second.type_ = tempType; | ||
109 | second.name_ = std::move(tempName); | ||
110 | |||
111 | switch (tempType) | ||
112 | { | ||
113 | case type::integer: | ||
114 | { | ||
115 | second.integer_ = tempInteger; | ||
116 | |||
117 | break; | ||
118 | } | ||
119 | |||
120 | case type::string: | ||
121 | { | ||
122 | new(&second.string_) std::string(std::move(tempString)); | ||
123 | |||
124 | break; | ||
125 | } | ||
126 | |||
127 | case type::invalid: | ||
128 | { | ||
129 | break; | ||
130 | } | ||
131 | } | ||
132 | } | ||
133 | |||
134 | field::~field() | ||
135 | { | ||
136 | switch (type_) | ||
137 | { | ||
138 | case type::string: | ||
139 | { | ||
140 | using string_type = std::string; | ||
141 | string_.~string_type(); | ||
142 | |||
143 | break; | ||
144 | } | ||
145 | |||
146 | case type::integer: | ||
147 | case type::invalid: | ||
148 | { | ||
149 | break; | ||
150 | } | ||
151 | } | ||
152 | } | ||
153 | |||
154 | field::field( | ||
155 | std::string name, | ||
156 | int arg) : | ||
157 | type_(type::integer), | ||
158 | name_(name), | ||
159 | integer_(arg) | ||
160 | { | ||
161 | } | ||
162 | |||
163 | int field::getInteger() const | ||
164 | { | ||
165 | if (type_ != type::integer) | ||
166 | { | ||
167 | throw std::domain_error("field::getInteger called on non-integer field"); | ||
168 | } | ||
169 | |||
170 | return integer_; | ||
171 | } | ||
172 | |||
173 | field::field( | ||
174 | std::string name, | ||
175 | std::string arg) : | ||
176 | type_(type::string), | ||
177 | name_(name) | ||
178 | { | ||
179 | new(&string_) std::string(arg); | ||
180 | } | ||
181 | |||
182 | std::string field::getString() const | ||
183 | { | ||
184 | if (type_ != type::string) | ||
185 | { | ||
186 | throw std::domain_error("field::getString called on non-string field"); | ||
187 | } | ||
188 | |||
189 | return string_; | ||
190 | } | ||
191 | |||
192 | }; | ||
193 | }; | ||
diff --git a/generator/field.h b/generator/field.h new file mode 100644 index 0000000..1fbabfc --- /dev/null +++ b/generator/field.h | |||
@@ -0,0 +1,76 @@ | |||
1 | #ifndef BINDING_H_CAE0B18E | ||
2 | #define BINDING_H_CAE0B18E | ||
3 | |||
4 | #include <string> | ||
5 | |||
6 | namespace verbly { | ||
7 | namespace generator { | ||
8 | |||
9 | class field { | ||
10 | public: | ||
11 | enum class type { | ||
12 | invalid, | ||
13 | integer, | ||
14 | string | ||
15 | }; | ||
16 | |||
17 | // Copy and move constructors | ||
18 | |||
19 | field(const field& other); | ||
20 | field(field&& other); | ||
21 | |||
22 | // Assignment | ||
23 | |||
24 | field& operator=(field other); | ||
25 | |||
26 | // Swap | ||
27 | |||
28 | friend void swap(field& first, field& second); | ||
29 | |||
30 | // Destructor | ||
31 | |||
32 | ~field(); | ||
33 | |||
34 | // Generic accessors | ||
35 | |||
36 | type getType() const | ||
37 | { | ||
38 | return type_; | ||
39 | } | ||
40 | |||
41 | std::string getName() const | ||
42 | { | ||
43 | return name_; | ||
44 | } | ||
45 | |||
46 | // Integer | ||
47 | |||
48 | field(std::string name, int arg); | ||
49 | |||
50 | int getInteger() const; | ||
51 | |||
52 | // String | ||
53 | |||
54 | field(std::string name, std::string arg); | ||
55 | |||
56 | std::string getString() const; | ||
57 | |||
58 | private: | ||
59 | |||
60 | field() | ||
61 | { | ||
62 | } | ||
63 | |||
64 | union { | ||
65 | int integer_; | ||
66 | std::string string_; | ||
67 | }; | ||
68 | |||
69 | type type_ = type::invalid; | ||
70 | std::string name_; | ||
71 | }; | ||
72 | |||
73 | }; | ||
74 | }; | ||
75 | |||
76 | #endif /* end of include guard: BINDING_H_CAE0B18E */ | ||
diff --git a/generator/form.cpp b/generator/form.cpp new file mode 100644 index 0000000..6be9d47 --- /dev/null +++ b/generator/form.cpp | |||
@@ -0,0 +1,53 @@ | |||
1 | #include "form.h" | ||
2 | #include <algorithm> | ||
3 | #include <list> | ||
4 | #include "database.h" | ||
5 | #include "field.h" | ||
6 | #include "pronunciation.h" | ||
7 | |||
8 | namespace verbly { | ||
9 | namespace generator { | ||
10 | |||
11 | int form::nextId_ = 0; | ||
12 | |||
13 | form::form(std::string text) : | ||
14 | id_(nextId_++), | ||
15 | text_(text), | ||
16 | complexity_(std::count(std::begin(text), std::end(text), ' ') + 1), | ||
17 | proper_(std::any_of(std::begin(text), std::end(text), std::isupper)) | ||
18 | { | ||
19 | } | ||
20 | |||
21 | void form::addPronunciation(const pronunciation& p) | ||
22 | { | ||
23 | pronunciations_.insert(&p); | ||
24 | } | ||
25 | |||
26 | database& operator<<(database& db, const form& arg) | ||
27 | { | ||
28 | // Serialize the form first. | ||
29 | { | ||
30 | std::list<field> fields; | ||
31 | fields.emplace_back("form_id", arg.getId()); | ||
32 | fields.emplace_back("form", arg.getText()); | ||
33 | fields.emplace_back("complexity", arg.getComplexity()); | ||
34 | fields.emplace_back("proper", arg.isProper()); | ||
35 | |||
36 | db.insertIntoTable("forms", std::move(fields)); | ||
37 | } | ||
38 | |||
39 | // Then, serialize the form/pronunciation relationship. | ||
40 | for (const pronunciation* p : arg.getPronunciations()) | ||
41 | { | ||
42 | std::list<field> fields; | ||
43 | fields.emplace_back("form_id", arg.getId()); | ||
44 | fields.emplace_back("pronunciation_id", p->getId()); | ||
45 | |||
46 | db.insertIntoTable("forms_pronunciations", std::move(fields)); | ||
47 | } | ||
48 | |||
49 | return db; | ||
50 | } | ||
51 | |||
52 | }; | ||
53 | }; | ||
diff --git a/generator/form.h b/generator/form.h new file mode 100644 index 0000000..5576035 --- /dev/null +++ b/generator/form.h | |||
@@ -0,0 +1,71 @@ | |||
1 | #ifndef FORM_H_7EFBC970 | ||
2 | #define FORM_H_7EFBC970 | ||
3 | |||
4 | #include <string> | ||
5 | #include <set> | ||
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
10 | class pronunciation; | ||
11 | class database; | ||
12 | |||
13 | class form { | ||
14 | public: | ||
15 | |||
16 | // Constructor | ||
17 | |||
18 | explicit form(std::string text); | ||
19 | |||
20 | // Mutators | ||
21 | |||
22 | void addPronunciation(const pronunciation& p); | ||
23 | |||
24 | // Accessors | ||
25 | |||
26 | int getId() const | ||
27 | { | ||
28 | return id_; | ||
29 | } | ||
30 | |||
31 | std::string getText() const | ||
32 | { | ||
33 | return text_; | ||
34 | } | ||
35 | |||
36 | int getComplexity() const | ||
37 | { | ||
38 | return complexity_; | ||
39 | } | ||
40 | |||
41 | bool isProper() const | ||
42 | { | ||
43 | return proper_; | ||
44 | } | ||
45 | |||
46 | std::set<const pronunciation*> getPronunciations() const | ||
47 | { | ||
48 | return pronunciations_; | ||
49 | } | ||
50 | |||
51 | private: | ||
52 | |||
53 | static int nextId_; | ||
54 | |||
55 | const int id_; | ||
56 | const std::string text_; | ||
57 | const int complexity_; | ||
58 | const bool proper_; | ||
59 | |||
60 | std::set<const pronunciation*> pronunciations_; | ||
61 | |||
62 | }; | ||
63 | |||
64 | // Serializer | ||
65 | |||
66 | database& operator<<(database& db, const form& arg); | ||
67 | |||
68 | }; | ||
69 | }; | ||
70 | |||
71 | #endif /* end of include guard: FORM_H_7EFBC970 */ | ||
diff --git a/generator/frame.cpp b/generator/frame.cpp new file mode 100644 index 0000000..9f0653f --- /dev/null +++ b/generator/frame.cpp | |||
@@ -0,0 +1,83 @@ | |||
1 | #include "frame.h" | ||
2 | #include "database.h" | ||
3 | #include "field.h" | ||
4 | |||
5 | namespace verbly { | ||
6 | namespace generator { | ||
7 | |||
8 | int frame::nextId_ = 0; | ||
9 | |||
10 | frame::frame() : id_(nextId_++) | ||
11 | { | ||
12 | } | ||
13 | |||
14 | void frame::push_back(part fp) | ||
15 | { | ||
16 | parts_.push_back(std::move(fp)); | ||
17 | } | ||
18 | |||
19 | database& operator<<(database& db, const frame& arg) | ||
20 | { | ||
21 | std::list<field> fields; | ||
22 | fields.emplace_back("frame_id", arg.getId()); | ||
23 | |||
24 | nlohmann::json jsonParts; | ||
25 | for (const part& p : arg) | ||
26 | { | ||
27 | nlohmann::json jsonPart; | ||
28 | jsonPart["type"] = static_cast<int>(p.getType()); | ||
29 | |||
30 | switch (p.getType()) | ||
31 | { | ||
32 | case part::type::noun_phrase: | ||
33 | { | ||
34 | jsonPart["role"] = p.getNounRole(); | ||
35 | jsonPart["selrestrs"] = p.getNounSelrestrs().toJson(); | ||
36 | jsonPart["synrestrs"] = p.getNounSynrestrs(); | ||
37 | |||
38 | break; | ||
39 | } | ||
40 | |||
41 | case part::type::preposition: | ||
42 | { | ||
43 | jsonPart["choices"] = p.getPrepositionChoices(); | ||
44 | jsonPart["literal"] = p.isPrepositionLiteral(); | ||
45 | |||
46 | break; | ||
47 | } | ||
48 | |||
49 | case part::type::literal: | ||
50 | { | ||
51 | jsonPart["value"] = p.getLiteralValue(); | ||
52 | |||
53 | break; | ||
54 | } | ||
55 | |||
56 | case part::type::verb: | ||
57 | case part::type::adjective: | ||
58 | case part::type::adverb: | ||
59 | { | ||
60 | break; | ||
61 | } | ||
62 | |||
63 | case part::type::invalid: | ||
64 | { | ||
65 | // Invalid parts should not be serialized. | ||
66 | assert(false); | ||
67 | |||
68 | break; | ||
69 | } | ||
70 | } | ||
71 | |||
72 | jsonParts.emplace_back(std::move(jsonPart)); | ||
73 | } | ||
74 | |||
75 | fields.emplace_back("data", jsonParts.dump()); | ||
76 | |||
77 | db.insertIntoTable("frames", std::move(fields)); | ||
78 | |||
79 | return db; | ||
80 | } | ||
81 | |||
82 | }; | ||
83 | }; | ||
diff --git a/generator/frame.h b/generator/frame.h new file mode 100644 index 0000000..411ce6c --- /dev/null +++ b/generator/frame.h | |||
@@ -0,0 +1,59 @@ | |||
1 | #ifndef FRAME_H_26770FF1 | ||
2 | #define FRAME_H_26770FF1 | ||
3 | |||
4 | #include <list> | ||
5 | #include "part.h" | ||
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
10 | class database; | ||
11 | |||
12 | class frame { | ||
13 | public: | ||
14 | |||
15 | // Aliases | ||
16 | |||
17 | using const_iterator = std::list<part>::const_iterator; | ||
18 | |||
19 | // Constructor | ||
20 | |||
21 | frame(); | ||
22 | |||
23 | // Mutators | ||
24 | |||
25 | void push_back(part fp); | ||
26 | |||
27 | // Accessors | ||
28 | |||
29 | int getId() const | ||
30 | { | ||
31 | return id_; | ||
32 | } | ||
33 | |||
34 | const_iterator begin() const | ||
35 | { | ||
36 | return std::begin(parts_); | ||
37 | } | ||
38 | |||
39 | const_iterator end() const | ||
40 | { | ||
41 | return std::end(parts_); | ||
42 | } | ||
43 | |||
44 | private: | ||
45 | |||
46 | static int nextId_; | ||
47 | |||
48 | const int id_; | ||
49 | |||
50 | std::list<part> parts_; | ||
51 | |||
52 | }; | ||
53 | |||
54 | database& operator<<(database& db, const frame& arg); | ||
55 | |||
56 | }; | ||
57 | }; | ||
58 | |||
59 | #endif /* end of include guard: FRAME_H_26770FF1 */ | ||
diff --git a/generator/generator.cpp b/generator/generator.cpp index 6a16467..d88cb31 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -1,2320 +1,1477 @@ | |||
1 | #include <libxml/parser.h> | 1 | #include "generator.h" |
2 | #include <cassert> | ||
3 | #include <stdexcept> | ||
2 | #include <iostream> | 4 | #include <iostream> |
5 | #include <regex> | ||
3 | #include <dirent.h> | 6 | #include <dirent.h> |
4 | #include <set> | ||
5 | #include <map> | ||
6 | #include <string> | ||
7 | #include <vector> | ||
8 | #include <fstream> | 7 | #include <fstream> |
9 | #include <sqlite3.h> | 8 | #include "enums.h" |
10 | #include <sstream> | ||
11 | #include <regex> | ||
12 | #include <list> | ||
13 | #include <algorithm> | ||
14 | #include <json.hpp> | ||
15 | #include "progress.h" | 9 | #include "progress.h" |
10 | #include "selrestr.h" | ||
11 | #include "role.h" | ||
12 | #include "part.h" | ||
13 | #include "field.h" | ||
16 | #include "../lib/util.h" | 14 | #include "../lib/util.h" |
17 | 15 | ||
18 | using json = nlohmann::json; | 16 | namespace verbly { |
19 | 17 | namespace generator { | |
20 | struct verb_t { | ||
21 | std::string infinitive; | ||
22 | std::string past_tense; | ||
23 | std::string past_participle; | ||
24 | std::string ing_form; | ||
25 | std::string s_form; | ||
26 | int id; | ||
27 | }; | ||
28 | |||
29 | struct adjective_t { | ||
30 | std::string base; | ||
31 | std::string comparative; | ||
32 | std::string superlative; | ||
33 | }; | ||
34 | |||
35 | struct noun_t { | ||
36 | std::string singular; | ||
37 | std::string plural; | ||
38 | }; | ||
39 | |||
40 | struct selrestr_t { | ||
41 | enum class type_t { | ||
42 | singleton, | ||
43 | andlogic, | ||
44 | orlogic, | ||
45 | empty | ||
46 | }; | ||
47 | type_t type; | ||
48 | std::string restriction; | ||
49 | bool pos; | ||
50 | std::list<selrestr_t> subordinates; | ||
51 | }; | ||
52 | |||
53 | struct framepart_t { | ||
54 | enum class type_t { | ||
55 | np, | ||
56 | v, | ||
57 | pp, | ||
58 | adj, | ||
59 | adv, | ||
60 | lex | ||
61 | }; | ||
62 | type_t type; | ||
63 | std::string role; | ||
64 | selrestr_t selrestrs; | ||
65 | std::set<std::string> preprestrs; | ||
66 | std::set<std::string> synrestrs; | ||
67 | std::list<std::string> choices; | ||
68 | std::string lexval; | ||
69 | }; | ||
70 | |||
71 | struct group_t { | ||
72 | std::string id; | ||
73 | std::string parent; | ||
74 | std::set<std::string> members; | ||
75 | std::map<std::string, selrestr_t> roles; | ||
76 | std::list<std::list<framepart_t>> frames; | ||
77 | }; | ||
78 | |||
79 | struct pronunciation_t { | ||
80 | std::string phonemes; | ||
81 | std::string prerhyme; | ||
82 | std::string rhyme; | ||
83 | int syllables = 0; | ||
84 | std::string stress; | ||
85 | |||
86 | bool operator<(const pronunciation_t& other) const | ||
87 | { | ||
88 | return phonemes < other.phonemes; | ||
89 | } | ||
90 | }; | ||
91 | |||
92 | std::map<std::string, group_t> groups; | ||
93 | std::map<std::string, verb_t> verbs; | ||
94 | std::map<std::string, adjective_t> adjectives; | ||
95 | std::map<std::string, noun_t> nouns; | ||
96 | std::map<int, std::map<int, int>> wn; | ||
97 | std::map<int, int> images; | ||
98 | std::map<std::string, std::set<pronunciation_t>> pronunciations; | ||
99 | |||
100 | void print_usage() | ||
101 | { | ||
102 | std::cout << "Verbly Datafile Generator" << std::endl; | ||
103 | std::cout << "-------------------------" << std::endl; | ||
104 | std::cout << "Requires exactly six arguments." << std::endl; | ||
105 | std::cout << "1. The path to a VerbNet data directory." << std::endl; | ||
106 | std::cout << "2. The path to an AGID infl.txt file." << std::endl; | ||
107 | std::cout << "3. The path to a WordNet prolog data directory." << std::endl; | ||
108 | std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl; | ||
109 | std::cout << "5. The path to an ImageNet urls.txt file." << std::endl; | ||
110 | std::cout << "6. Datafile output path." << std::endl; | ||
111 | |||
112 | exit(1); | ||
113 | } | ||
114 | |||
115 | void db_error(sqlite3* ppdb, std::string query) | ||
116 | { | ||
117 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
118 | std::cout << query << std::endl; | ||
119 | sqlite3_close_v2(ppdb); | ||
120 | print_usage(); | ||
121 | } | ||
122 | |||
123 | json export_selrestrs(selrestr_t r) | ||
124 | { | ||
125 | if (r.type == selrestr_t::type_t::empty) | ||
126 | { | ||
127 | return {}; | ||
128 | } else if (r.type == selrestr_t::type_t::singleton) | ||
129 | { | ||
130 | json result; | ||
131 | result["type"] = r.restriction; | ||
132 | result["pos"] = r.pos; | ||
133 | return result; | ||
134 | } else { | ||
135 | json result; | ||
136 | if (r.type == selrestr_t::type_t::andlogic) | ||
137 | { | ||
138 | result["logic"] = "and"; | ||
139 | } else { | ||
140 | result["logic"] = "or"; | ||
141 | } | ||
142 | |||
143 | std::list<json> outlist; | ||
144 | std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs); | ||
145 | result["children"] = outlist; | ||
146 | 18 | ||
147 | return result; | 19 | generator::generator( |
148 | } | 20 | std::string verbNetPath, |
149 | } | 21 | std::string agidPath, |
150 | 22 | std::string wordNetPath, | |
151 | selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) | 23 | std::string cmudictPath, |
152 | { | 24 | std::string imageNetPath, |
153 | selrestr_t r; | 25 | std::string outputPath) : |
154 | xmlChar* key; | 26 | verbNetPath_(verbNetPath), |
155 | 27 | agidPath_(agidPath), | |
156 | if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) | 28 | wordNetPath_(wordNetPath), |
157 | { | 29 | cmudictPath_(cmudictPath), |
158 | if (xmlChildElementCount(top) == 0) | 30 | imageNetPath_(imageNetPath), |
31 | db_(outputPath) | ||
159 | { | 32 | { |
160 | r.type = selrestr_t::type_t::empty; | 33 | // Ensure VerbNet directory exists |
161 | } else if (xmlChildElementCount(top) == 1) | 34 | DIR* dir; |
162 | { | 35 | if ((dir = opendir(verbNetPath_.c_str())) == nullptr) |
163 | r = parse_selrestrs(xmlFirstElementChild(top), filename); | ||
164 | } else { | ||
165 | r.type = selrestr_t::type_t::andlogic; | ||
166 | |||
167 | if (xmlHasProp(top, (const xmlChar*) "logic")) | ||
168 | { | 36 | { |
169 | key = xmlGetProp(top, (const xmlChar*) "logic"); | 37 | throw std::invalid_argument("Invalid VerbNet data directory"); |
170 | if (!xmlStrcmp(key, (const xmlChar*) "or")) | ||
171 | { | ||
172 | r.type = selrestr_t::type_t::orlogic; | ||
173 | } | ||
174 | xmlFree(key); | ||
175 | } | 38 | } |
176 | 39 | ||
177 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | 40 | closedir(dir); |
41 | |||
42 | // Ensure AGID infl.txt exists | ||
43 | if (!std::ifstream(agidPath_)) | ||
178 | { | 44 | { |
179 | if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) | 45 | throw std::invalid_argument("AGID infl.txt file not found"); |
180 | { | ||
181 | r.subordinates.push_back(parse_selrestrs(selrestr, filename)); | ||
182 | } | ||
183 | } | 46 | } |
184 | } | 47 | |
185 | } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) | 48 | // Add directory separator to WordNet path |
186 | { | 49 | if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) |
187 | r.type = selrestr_t::type_t::singleton; | ||
188 | |||
189 | key = xmlGetProp(top, (xmlChar*) "Value"); | ||
190 | r.pos = (std::string((const char*)key) == "+"); | ||
191 | xmlFree(key); | ||
192 | |||
193 | key = xmlGetProp(top, (xmlChar*) "type"); | ||
194 | r.restriction = (const char*) key; | ||
195 | xmlFree(key); | ||
196 | } else { | ||
197 | // Invalid | ||
198 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
199 | print_usage(); | ||
200 | } | ||
201 | |||
202 | return r; | ||
203 | } | ||
204 | |||
205 | group_t& parse_group(xmlNodePtr top, std::string filename) | ||
206 | { | ||
207 | xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); | ||
208 | if (key == 0) | ||
209 | { | ||
210 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
211 | print_usage(); | ||
212 | } | ||
213 | std::string vnid = (const char*)key; | ||
214 | vnid = vnid.substr(vnid.find_first_of("-")+1); | ||
215 | xmlFree(key); | ||
216 | |||
217 | group_t g; | ||
218 | g.id = vnid; | ||
219 | |||
220 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
221 | { | ||
222 | if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES")) | ||
223 | { | ||
224 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) | ||
225 | { | 50 | { |
226 | if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) | 51 | wordNetPath_ += '/'; |
227 | { | ||
228 | auto& sg = parse_group(subclass, filename); | ||
229 | sg.parent = vnid; | ||
230 | |||
231 | for (auto member : sg.members) | ||
232 | { | ||
233 | g.members.insert(member); | ||
234 | } | ||
235 | |||
236 | // The schema requires that subclasses appear after role definitions, so we can do this now | ||
237 | for (auto role : g.roles) | ||
238 | { | ||
239 | if (sg.roles.count(role.first) == 0) | ||
240 | { | ||
241 | sg.roles[role.first] = role.second; | ||
242 | } | ||
243 | } | ||
244 | } | ||
245 | } | 52 | } |
246 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) | 53 | |
247 | { | 54 | // Ensure WordNet tables exist |
248 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) | 55 | for (std::string table : { |
56 | "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" | ||
57 | }) | ||
249 | { | 58 | { |
250 | if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) | 59 | if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) |
251 | { | 60 | { |
252 | key = xmlGetProp(member, (xmlChar*) "name"); | 61 | throw std::invalid_argument("WordNet " + table + " table not found"); |
253 | g.members.insert((const char*)key); | ||
254 | xmlFree(key); | ||
255 | } | 62 | } |
256 | } | 63 | } |
257 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) | 64 | |
258 | { | 65 | // Ensure CMUDICT file exists |
259 | for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) | 66 | if (!std::ifstream(cmudictPath_)) |
260 | { | 67 | { |
261 | if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) | 68 | throw std::invalid_argument("CMUDICT file not found"); |
262 | { | ||
263 | selrestr_t r; | ||
264 | r.type = selrestr_t::type_t::empty; | ||
265 | |||
266 | key = xmlGetProp(role, (const xmlChar*) "type"); | ||
267 | std::string type = (const char*)key; | ||
268 | xmlFree(key); | ||
269 | |||
270 | for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
271 | { | ||
272 | if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS")) | ||
273 | { | ||
274 | r = parse_selrestrs(rolenode, filename); | ||
275 | } | ||
276 | } | ||
277 | |||
278 | g.roles[type] = r; | ||
279 | } | ||
280 | } | 69 | } |
281 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) | 70 | |
282 | { | 71 | // Ensure ImageNet urls.txt exists |
283 | for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) | 72 | if (!std::ifstream(imageNetPath_)) |
284 | { | 73 | { |
285 | if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) | 74 | throw std::invalid_argument("ImageNet urls.txt file not found"); |
286 | { | ||
287 | std::list<framepart_t> f; | ||
288 | |||
289 | for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) | ||
290 | { | ||
291 | if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX")) | ||
292 | { | ||
293 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
294 | { | ||
295 | framepart_t fp; | ||
296 | |||
297 | if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP")) | ||
298 | { | ||
299 | fp.type = framepart_t::type_t::np; | ||
300 | |||
301 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
302 | fp.role = (const char*)key; | ||
303 | xmlFree(key); | ||
304 | |||
305 | fp.selrestrs.type = selrestr_t::type_t::empty; | ||
306 | |||
307 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
308 | { | ||
309 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS")) | ||
310 | { | ||
311 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
312 | { | ||
313 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR")) | ||
314 | { | ||
315 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
316 | fp.synrestrs.insert(std::string((const char*)key)); | ||
317 | xmlFree(key); | ||
318 | } | ||
319 | } | ||
320 | } | ||
321 | |||
322 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
323 | { | ||
324 | fp.selrestrs = parse_selrestrs(npnode, filename); | ||
325 | } | ||
326 | } | ||
327 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB")) | ||
328 | { | ||
329 | fp.type = framepart_t::type_t::v; | ||
330 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP")) | ||
331 | { | ||
332 | fp.type = framepart_t::type_t::pp; | ||
333 | |||
334 | if (xmlHasProp(syntaxnode, (xmlChar*) "value")) | ||
335 | { | ||
336 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
337 | std::string choices = (const char*)key; | ||
338 | xmlFree(key); | ||
339 | |||
340 | fp.choices = verbly::split<std::list<std::string>>(choices, " "); | ||
341 | } | ||
342 | |||
343 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
344 | { | ||
345 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
346 | { | ||
347 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
348 | { | ||
349 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR")) | ||
350 | { | ||
351 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
352 | fp.preprestrs.insert(std::string((const char*)key)); | ||
353 | xmlFree(key); | ||
354 | } | ||
355 | } | ||
356 | } | ||
357 | } | ||
358 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ")) | ||
359 | { | ||
360 | fp.type = framepart_t::type_t::adj; | ||
361 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV")) | ||
362 | { | ||
363 | fp.type = framepart_t::type_t::adv; | ||
364 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX")) | ||
365 | { | ||
366 | fp.type = framepart_t::type_t::lex; | ||
367 | |||
368 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
369 | fp.lexval = (const char*)key; | ||
370 | xmlFree(key); | ||
371 | } else { | ||
372 | continue; | ||
373 | } | ||
374 | |||
375 | f.push_back(fp); | ||
376 | } | ||
377 | |||
378 | g.frames.push_back(f); | ||
379 | } | ||
380 | } | ||
381 | } | ||
382 | } | 75 | } |
383 | } | 76 | } |
384 | } | ||
385 | |||
386 | groups[vnid] = g; | ||
387 | |||
388 | return groups[vnid]; | ||
389 | } | ||
390 | |||
391 | int main(int argc, char** argv) | ||
392 | { | ||
393 | if (argc != 7) | ||
394 | { | ||
395 | print_usage(); | ||
396 | } | ||
397 | |||
398 | // VerbNet data | ||
399 | std::cout << "Reading verb frames..." << std::endl; | ||
400 | |||
401 | DIR* dir; | ||
402 | if ((dir = opendir(argv[1])) == nullptr) | ||
403 | { | ||
404 | std::cout << "Invalid VerbNet data directory." << std::endl; | ||
405 | |||
406 | print_usage(); | ||
407 | } | ||
408 | |||
409 | struct dirent* ent; | ||
410 | while ((ent = readdir(dir)) != nullptr) | ||
411 | { | ||
412 | std::string filename(argv[1]); | ||
413 | if (filename.back() != '/') | ||
414 | { | ||
415 | filename += '/'; | ||
416 | } | ||
417 | 77 | ||
418 | filename += ent->d_name; | 78 | void generator::run() |
419 | //std::cout << ent->d_name << std::endl; | ||
420 | |||
421 | if (filename.rfind(".xml") != filename.size() - 4) | ||
422 | { | ||
423 | continue; | ||
424 | } | ||
425 | |||
426 | xmlDocPtr doc = xmlParseFile(filename.c_str()); | ||
427 | if (doc == nullptr) | ||
428 | { | ||
429 | std::cout << "Error opening " << filename << std::endl; | ||
430 | print_usage(); | ||
431 | } | ||
432 | |||
433 | xmlNodePtr top = xmlDocGetRootElement(doc); | ||
434 | if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) | ||
435 | { | ||
436 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
437 | print_usage(); | ||
438 | } | ||
439 | |||
440 | parse_group(top, filename); | ||
441 | } | ||
442 | |||
443 | closedir(dir); | ||
444 | |||
445 | // Get verbs from AGID | ||
446 | std::cout << "Reading inflections..." << std::endl; | ||
447 | |||
448 | std::ifstream agidfile(argv[2]); | ||
449 | if (!agidfile.is_open()) | ||
450 | { | ||
451 | std::cout << "Could not open AGID file: " << argv[2] << std::endl; | ||
452 | print_usage(); | ||
453 | } | ||
454 | |||
455 | for (;;) | ||
456 | { | ||
457 | std::string line; | ||
458 | if (!getline(agidfile, line)) | ||
459 | { | ||
460 | break; | ||
461 | } | ||
462 | |||
463 | if (line.back() == '\r') | ||
464 | { | 79 | { |
465 | line.pop_back(); | 80 | // Create notions, words, lemmas, and forms from WordNet synsets |
466 | } | 81 | readWordNetSynsets(); |
467 | 82 | ||
468 | int divider = line.find_first_of(" "); | 83 | // Reads adjective positioning WordNet data |
469 | std::string word = line.substr(0, divider); | 84 | readAdjectivePositioning(); |
470 | line = line.substr(divider+1); | 85 | |
471 | char type = line[0]; | 86 | // Counts the number of URLs ImageNet has per notion |
472 | 87 | readImageNetUrls(); | |
473 | if (line[1] == '?') | 88 | |
474 | { | 89 | // Creates a word by WordNet sense key lookup table |
475 | line.erase(0, 4); | 90 | readWordNetSenseKeys(); |
476 | } else { | 91 | |
477 | line.erase(0, 3); | 92 | // Creates groups and frames from VerbNet data |
478 | } | 93 | readVerbNet(); |
479 | 94 | ||
480 | std::vector<std::string> forms; | 95 | // Creates forms and inflections from AGID. To reduce the amount of forms |
481 | while (!line.empty()) | 96 | // created, we do this after most lemmas that need inflecting have been |
482 | { | 97 | // created through other means, and then only generate forms for |
483 | std::string inflection; | 98 | // inflections of already-existing lemmas. The exception to this regards |
484 | if ((divider = line.find(" | ")) != std::string::npos) | 99 | // verb lemmas. If a verb lemma in AGID either does not exist yet, or does |
485 | { | 100 | // exist but is not related to any words that are related to verb notions, |
486 | inflection = line.substr(0, divider); | 101 | // then a notion and a word is generated and the form generation proceeds |
487 | line = line.substr(divider + 3); | 102 | // as usual. |
488 | } else { | 103 | readAgidInflections(); |
489 | inflection = line; | 104 | |
490 | line = ""; | 105 | // Reads in prepositions and the is_a relationship |
491 | } | 106 | readPrepositions(); |
492 | 107 | ||
493 | if ((divider = inflection.find_first_of(",?")) != std::string::npos) | 108 | // Creates pronunciations from CMUDICT. To reduce the amount of |
494 | { | 109 | // pronunciations created, we do this after all forms have been created, |
495 | inflection = inflection.substr(0, divider); | 110 | // and then only generate pronunciations for already-exisiting forms. |
496 | } | 111 | readCmudictPronunciations(); |
497 | 112 | ||
498 | forms.push_back(inflection); | 113 | // Writes the database schema |
114 | writeSchema(); | ||
115 | |||
116 | // Dumps data to the database | ||
117 | dumpObjects(); | ||
118 | |||
119 | // Populates the antonymy relationship from WordNet | ||
120 | readWordNetAntonymy(); | ||
121 | |||
122 | // Populates the variation relationship from WordNet | ||
123 | readWordNetVariation(); | ||
124 | |||
125 | // Populates the usage, topicality, and regionality relationships from | ||
126 | // WordNet | ||
127 | readWordNetClasses(); | ||
128 | |||
129 | // Populates the causality relationship from WordNet | ||
130 | readWordNetCausality(); | ||
131 | |||
132 | // Populates the entailment relationship from WordNet | ||
133 | readWordNetEntailment(); | ||
134 | |||
135 | // Populates the hypernymy relationship from WordNet | ||
136 | readWordNetHypernymy(); | ||
137 | |||
138 | // Populates the instantiation relationship from WordNet | ||
139 | readWordNetInstantiation(); | ||
140 | |||
141 | // Populates the member meronymy relationship from WordNet | ||
142 | readWordNetMemberMeronymy(); | ||
143 | |||
144 | // Populates the part meronymy relationship from WordNet | ||
145 | readWordNetPartMeronymy(); | ||
146 | |||
147 | // Populates the substance meronymy relationship from WordNet | ||
148 | readWordNetSubstanceMeronymy(); | ||
149 | |||
150 | // Populates the pertainymy and mannernymy relationships from WordNet | ||
151 | readWordNetPertainymy(); | ||
152 | |||
153 | // Populates the specification relationship from WordNet | ||
154 | readWordNetSpecification(); | ||
155 | |||
156 | // Populates the adjective similarity relationship from WordNet | ||
157 | readWordNetSimilarity(); | ||
158 | |||
159 | |||
160 | |||
161 | |||
162 | |||
163 | |||
164 | |||
165 | |||
499 | } | 166 | } |
500 | 167 | ||
501 | switch (type) | 168 | void generator::readWordNetSynsets() |
502 | { | 169 | { |
503 | case 'V': | 170 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); |
171 | progress ppgs("Reading synsets from WordNet...", lines.size()); | ||
172 | |||
173 | for (std::string line : lines) | ||
504 | { | 174 | { |
505 | verb_t v; | 175 | ppgs.update(); |
506 | v.infinitive = word; | 176 | |
507 | if (forms.size() == 4) | 177 | std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); |
508 | { | 178 | std::smatch relation_data; |
509 | v.past_tense = forms[0]; | 179 | if (!std::regex_search(line, relation_data, relation)) |
510 | v.past_participle = forms[1]; | 180 | { |
511 | v.ing_form = forms[2]; | 181 | continue; |
512 | v.s_form = forms[3]; | ||
513 | } else if (forms.size() == 3) | ||
514 | { | ||
515 | v.past_tense = forms[0]; | ||
516 | v.past_participle = forms[0]; | ||
517 | v.ing_form = forms[1]; | ||
518 | v.s_form = forms[2]; | ||
519 | } else if (forms.size() == 8) | ||
520 | { | ||
521 | // As of AGID 2014.08.11, this is only "to be" | ||
522 | v.past_tense = forms[0]; | ||
523 | v.past_participle = forms[2]; | ||
524 | v.ing_form = forms[3]; | ||
525 | v.s_form = forms[4]; | ||
526 | } else { | ||
527 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
528 | // - may and shall do not conjugate the way we want them to | ||
529 | // - methinks only has a past tense and is an outlier | ||
530 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
531 | std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
532 | } | 182 | } |
533 | 183 | ||
534 | verbs[word] = v; | 184 | int synset_id = std::stoi(relation_data[1]); |
535 | 185 | int wnum = std::stoi(relation_data[2]); | |
536 | break; | 186 | std::string text = relation_data[3]; |
537 | } | 187 | int tag_count = std::stoi(relation_data[4]); |
538 | 188 | size_t word_it; | |
539 | case 'A': | 189 | while ((word_it = text.find("''")) != std::string::npos) |
540 | { | ||
541 | adjective_t adj; | ||
542 | adj.base = word; | ||
543 | if (forms.size() == 2) | ||
544 | { | 190 | { |
545 | adj.comparative = forms[0]; | 191 | text.erase(word_it, 1); |
546 | adj.superlative = forms[1]; | ||
547 | } else { | ||
548 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | ||
549 | std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
550 | } | 192 | } |
551 | 193 | ||
552 | adjectives[word] = adj; | 194 | // The WordNet data does contain duplicates, so we need to check that we |
553 | 195 | // haven't already created this word. | |
554 | break; | 196 | std::pair<int, int> lookup(synset_id, wnum); |
555 | } | 197 | if (!wordByWnidAndWnum_.count(lookup)) |
556 | |||
557 | case 'N': | ||
558 | { | ||
559 | noun_t n; | ||
560 | n.singular = word; | ||
561 | if (forms.size() == 1) | ||
562 | { | 198 | { |
563 | n.plural = forms[0]; | 199 | notion& synset = lookupOrCreateNotion(synset_id); |
564 | } else { | 200 | lemma& lex = lookupOrCreateLemma(text); |
565 | // As of AGID 2014.08.11, this is non-existent. | 201 | word& entry = createWord(synset, lex, tag_count); |
566 | std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; | 202 | |
203 | wordByWnidAndWnum_[lookup] = &entry; | ||
567 | } | 204 | } |
568 | |||
569 | nouns[word] = n; | ||
570 | |||
571 | break; | ||
572 | } | 205 | } |
573 | } | 206 | } |
574 | } | ||
575 | |||
576 | // Pronounciations | ||
577 | std::cout << "Reading pronunciations..." << std::endl; | ||
578 | |||
579 | std::ifstream pronfile(argv[4]); | ||
580 | if (!pronfile.is_open()) | ||
581 | { | ||
582 | std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl; | ||
583 | print_usage(); | ||
584 | } | ||
585 | |||
586 | for (;;) | ||
587 | { | ||
588 | std::string line; | ||
589 | if (!getline(pronfile, line)) | ||
590 | { | ||
591 | break; | ||
592 | } | ||
593 | |||
594 | if (line.back() == '\r') | ||
595 | { | ||
596 | line.pop_back(); | ||
597 | } | ||
598 | 207 | ||
599 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | 208 | void generator::readAdjectivePositioning() |
600 | std::smatch phoneme_data; | ||
601 | if (std::regex_search(line, phoneme_data, phoneme)) | ||
602 | { | 209 | { |
603 | std::string canonical(phoneme_data[1]); | 210 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl")); |
604 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | 211 | progress ppgs("Reading adjective positionings from WordNet...", lines.size()); |
605 | |||
606 | std::string phonemes = phoneme_data[2]; | ||
607 | auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " "); | ||
608 | auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) { | ||
609 | return phoneme.find("1") != std::string::npos; | ||
610 | }); | ||
611 | 212 | ||
612 | pronunciation_t p; | 213 | for (std::string line : lines) |
613 | p.phonemes = phonemes; | ||
614 | |||
615 | // Rhyme detection | ||
616 | if (phemstrt != std::end(phoneme_set)) | ||
617 | { | 214 | { |
618 | std::stringstream rhymer; | 215 | ppgs.update(); |
619 | for (auto it = phemstrt; it != std::end(phoneme_set); it++) | ||
620 | { | ||
621 | std::string naked; | ||
622 | std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) { | ||
623 | return isdigit(ch); | ||
624 | }); | ||
625 | |||
626 | if (it != phemstrt) | ||
627 | { | ||
628 | rhymer << " "; | ||
629 | } | ||
630 | |||
631 | rhymer << naked; | ||
632 | } | ||
633 | 216 | ||
634 | p.rhyme = rhymer.str(); | 217 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); |
635 | 218 | std::smatch relation_data; | |
636 | if (phemstrt != std::begin(phoneme_set)) | 219 | if (!std::regex_search(line, relation_data, relation)) |
637 | { | 220 | { |
638 | phemstrt--; | 221 | continue; |
639 | p.prerhyme = *phemstrt; | ||
640 | } else { | ||
641 | p.prerhyme = ""; | ||
642 | } | 222 | } |
643 | } else { | ||
644 | p.prerhyme = ""; | ||
645 | p.rhyme = ""; | ||
646 | } | ||
647 | 223 | ||
648 | // Syllable/stress | 224 | int synset_id = stoi(relation_data[1]); |
649 | for (auto phm : phoneme_set) | 225 | int wnum = stoi(relation_data[2]); |
650 | { | 226 | std::string adjpos_str = relation_data[3]; |
651 | if (isdigit(phm.back())) | ||
652 | { | ||
653 | // It's a vowel! | ||
654 | p.syllables++; | ||
655 | 227 | ||
656 | if (phm.back() == '1') | 228 | std::pair<int, int> lookup(synset_id, wnum); |
229 | if (wordByWnidAndWnum_.count(lookup)) | ||
230 | { | ||
231 | word& adj = *wordByWnidAndWnum_.at(lookup); | ||
232 | |||
233 | if (adjpos_str == "p") | ||
234 | { | ||
235 | adj.setAdjectivePosition(positioning::predicate); | ||
236 | } else if (adjpos_str == "a") | ||
237 | { | ||
238 | adj.setAdjectivePosition(positioning::attributive); | ||
239 | } else if (adjpos_str == "i") | ||
657 | { | 240 | { |
658 | p.stress.push_back('1'); | 241 | adj.setAdjectivePosition(positioning::postnominal); |
659 | } else { | 242 | } else { |
660 | p.stress.push_back('0'); | 243 | // Can't happen because of how we specified the regex. |
244 | assert(false); | ||
661 | } | 245 | } |
662 | } | 246 | } |
663 | } | 247 | } |
664 | |||
665 | pronunciations[canonical].insert(p); | ||
666 | } | ||
667 | } | ||
668 | |||
669 | // Images | ||
670 | std::cout << "Reading images..." << std::endl; | ||
671 | |||
672 | std::ifstream imagefile(argv[5]); | ||
673 | if (!imagefile.is_open()) | ||
674 | { | ||
675 | std::cout << "Could not open ImageNet file: " << argv[5] << std::endl; | ||
676 | print_usage(); | ||
677 | } | ||
678 | |||
679 | for (;;) | ||
680 | { | ||
681 | std::string line; | ||
682 | if (!getline(imagefile, line)) | ||
683 | { | ||
684 | break; | ||
685 | } | ||
686 | |||
687 | if (line.back() == '\r') | ||
688 | { | ||
689 | line.pop_back(); | ||
690 | } | ||
691 | |||
692 | std::string wnid_s = line.substr(1, 8); | ||
693 | int wnid = stoi(wnid_s) + 100000000; | ||
694 | images[wnid]++; | ||
695 | } | ||
696 | |||
697 | imagefile.close(); | ||
698 | |||
699 | // Start writing output | ||
700 | std::cout << "Writing schema..." << std::endl; | ||
701 | |||
702 | sqlite3* ppdb; | ||
703 | if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | ||
704 | { | ||
705 | std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; | ||
706 | print_usage(); | ||
707 | } | ||
708 | |||
709 | std::ifstream schemafile("schema.sql"); | ||
710 | if (!schemafile.is_open()) | ||
711 | { | ||
712 | std::cout << "Could not find schema file" << std::endl; | ||
713 | print_usage(); | ||
714 | } | ||
715 | |||
716 | std::stringstream schemabuilder; | ||
717 | for (;;) | ||
718 | { | ||
719 | std::string line; | ||
720 | if (!getline(schemafile, line)) | ||
721 | { | ||
722 | break; | ||
723 | } | ||
724 | |||
725 | if (line.back() == '\r') | ||
726 | { | ||
727 | line.pop_back(); | ||
728 | } | ||
729 | |||
730 | schemabuilder << line << std::endl; | ||
731 | } | ||
732 | |||
733 | std::string schema = schemabuilder.str(); | ||
734 | while (!schema.empty()) | ||
735 | { | ||
736 | std::string query; | ||
737 | int divider = schema.find(";"); | ||
738 | if (divider != std::string::npos) | ||
739 | { | ||
740 | query = schema.substr(0, divider+1); | ||
741 | schema = schema.substr(divider+2); | ||
742 | } else { | ||
743 | break; | ||
744 | } | 248 | } |
745 | 249 | ||
746 | sqlite3_stmt* schmstmt; | 250 | void generator::readImageNetUrls() |
747 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) | ||
748 | { | 251 | { |
749 | db_error(ppdb, query); | 252 | // The ImageNet datafile is so large that it is unreasonable and |
750 | } | 253 | // unnecessary to read it into memory; instead, we will parse each line as |
751 | 254 | // we read it. This has the caveat that we cannot display a progress bar. | |
752 | if (sqlite3_step(schmstmt) != SQLITE_DONE) | 255 | std::cout << "Reading image counts from ImageNet..." << std::endl; |
753 | { | ||
754 | db_error(ppdb, query); | ||
755 | } | ||
756 | |||
757 | sqlite3_finalize(schmstmt); | ||
758 | } | ||
759 | |||
760 | std::cout << "Writing prepositions..." << std::endl; | ||
761 | std::ifstream prepfile("prepositions.txt"); | ||
762 | if (!prepfile.is_open()) | ||
763 | { | ||
764 | std::cout << "Could not find prepositions file" << std::endl; | ||
765 | print_usage(); | ||
766 | } | ||
767 | |||
768 | for (;;) | ||
769 | { | ||
770 | std::string line; | ||
771 | if (!getline(prepfile, line)) | ||
772 | { | ||
773 | break; | ||
774 | } | ||
775 | |||
776 | if (line.back() == '\r') | ||
777 | { | ||
778 | line.pop_back(); | ||
779 | } | ||
780 | |||
781 | std::regex relation("^([^:]+): (.+)"); | ||
782 | std::smatch relation_data; | ||
783 | std::regex_search(line, relation_data, relation); | ||
784 | std::string prep = relation_data[1]; | ||
785 | std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", "); | ||
786 | |||
787 | std::string query("INSERT INTO prepositions (form) VALUES (?)"); | ||
788 | sqlite3_stmt* ppstmt; | ||
789 | |||
790 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
791 | { | ||
792 | db_error(ppdb, query); | ||
793 | } | ||
794 | |||
795 | sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT); | ||
796 | |||
797 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
798 | { | ||
799 | db_error(ppdb, query); | ||
800 | } | ||
801 | |||
802 | sqlite3_finalize(ppstmt); | ||
803 | |||
804 | query = "SELECT last_insert_rowid()"; | ||
805 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
806 | { | ||
807 | db_error(ppdb, query); | ||
808 | } | ||
809 | |||
810 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
811 | { | ||
812 | db_error(ppdb, query); | ||
813 | } | ||
814 | |||
815 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
816 | sqlite3_finalize(ppstmt); | ||
817 | |||
818 | for (auto group : groups) | ||
819 | { | ||
820 | query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)"; | ||
821 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
822 | { | ||
823 | db_error(ppdb, query); | ||
824 | } | ||
825 | 256 | ||
826 | sqlite3_bind_int(ppstmt, 1, rowid); | 257 | std::ifstream file(imageNetPath_); |
827 | sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); | 258 | if (!file) |
828 | |||
829 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
830 | { | 259 | { |
831 | db_error(ppdb, query); | 260 | throw std::invalid_argument("Could not find file " + imageNetPath_); |
832 | } | 261 | } |
833 | |||
834 | sqlite3_finalize(ppstmt); | ||
835 | } | ||
836 | } | ||
837 | |||
838 | 262 | ||
839 | { | 263 | std::string line; |
840 | progress ppgs("Writing verbs...", verbs.size()); | 264 | while (std::getline(file, line)) |
841 | for (auto& mapping : verbs) | ||
842 | { | ||
843 | sqlite3_stmt* ppstmt; | ||
844 | std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); | ||
845 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
846 | { | ||
847 | db_error(ppdb, query); | ||
848 | } | ||
849 | |||
850 | sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT); | ||
851 | sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT); | ||
852 | sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT); | ||
853 | sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT); | ||
854 | sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT); | ||
855 | |||
856 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
857 | { | ||
858 | db_error(ppdb, query); | ||
859 | } | ||
860 | |||
861 | sqlite3_finalize(ppstmt); | ||
862 | |||
863 | std::string canonical(mapping.second.infinitive); | ||
864 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
865 | if (pronunciations.count(canonical) == 1) | ||
866 | { | 265 | { |
867 | query = "SELECT last_insert_rowid()"; | 266 | if (line.back() == '\r') |
868 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
869 | { | 267 | { |
870 | db_error(ppdb, query); | 268 | line.pop_back(); |
871 | } | 269 | } |
872 | 270 | ||
873 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | 271 | std::string wnid_s = line.substr(1, 8); |
272 | int wnid = stoi(wnid_s) + 100000000; | ||
273 | if (notionByWnid_.count(wnid)) | ||
874 | { | 274 | { |
875 | db_error(ppdb, query); | 275 | // We know that this notion has a wnid and is a noun. |
876 | } | 276 | notionByWnid_.at(wnid)->incrementNumOfImages(); |
877 | |||
878 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
879 | |||
880 | sqlite3_finalize(ppstmt); | ||
881 | |||
882 | mapping.second.id = rowid; | ||
883 | |||
884 | for (auto pronunciation : pronunciations[canonical]) | ||
885 | { | ||
886 | if (!pronunciation.rhyme.empty()) | ||
887 | { | ||
888 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | ||
889 | } else { | ||
890 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | ||
891 | } | ||
892 | |||
893 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
894 | { | ||
895 | db_error(ppdb, query); | ||
896 | } | ||
897 | |||
898 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
899 | sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); | ||
900 | sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); | ||
901 | sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); | ||
902 | |||
903 | if (!pronunciation.rhyme.empty()) | ||
904 | { | ||
905 | sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); | ||
906 | sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); | ||
907 | } | ||
908 | |||
909 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
910 | { | ||
911 | db_error(ppdb, query); | ||
912 | } | ||
913 | |||
914 | sqlite3_finalize(ppstmt); | ||
915 | } | 277 | } |
916 | } | 278 | } |
917 | |||
918 | ppgs.update(); | ||
919 | } | 279 | } |
920 | } | 280 | |
921 | 281 | void generator::readWordNetSenseKeys() | |
922 | { | ||
923 | progress ppgs("Writing verb frames...", groups.size()); | ||
924 | for (auto& mapping : groups) | ||
925 | { | 282 | { |
926 | std::list<json> roledatal; | 283 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl")); |
927 | std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { | 284 | progress ppgs("Reading sense keys from WordNet...", lines.size()); |
928 | json role; | ||
929 | role["type"] = r.first; | ||
930 | role["selrestrs"] = export_selrestrs(r.second); | ||
931 | |||
932 | return role; | ||
933 | }); | ||
934 | |||
935 | json roledata(roledatal); | ||
936 | std::string rdm = roledata.dump(); | ||
937 | |||
938 | sqlite3_stmt* ppstmt; | ||
939 | std::string query("INSERT INTO groups (data) VALUES (?)"); | ||
940 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
941 | { | ||
942 | db_error(ppdb, query); | ||
943 | } | ||
944 | |||
945 | sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT); | ||
946 | |||
947 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
948 | { | ||
949 | db_error(ppdb, query); | ||
950 | } | ||
951 | 285 | ||
952 | sqlite3_finalize(ppstmt); | 286 | for (std::string line : lines) |
953 | |||
954 | query = "SELECT last_insert_rowid()"; | ||
955 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
956 | { | ||
957 | db_error(ppdb, query); | ||
958 | } | ||
959 | |||
960 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
961 | { | ||
962 | db_error(ppdb, query); | ||
963 | } | ||
964 | |||
965 | int gid = sqlite3_column_int(ppstmt, 0); | ||
966 | sqlite3_finalize(ppstmt); | ||
967 | |||
968 | for (auto frame : mapping.second.frames) | ||
969 | { | 287 | { |
970 | std::list<json> fdatap; | 288 | ppgs.update(); |
971 | std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) { | ||
972 | json part; | ||
973 | |||
974 | switch (fp.type) | ||
975 | { | ||
976 | case framepart_t::type_t::np: | ||
977 | { | ||
978 | part["type"] = "np"; | ||
979 | part["role"] = fp.role; | ||
980 | part["selrestrs"] = export_selrestrs(fp.selrestrs); | ||
981 | part["synrestrs"] = fp.synrestrs; | ||
982 | |||
983 | break; | ||
984 | } | ||
985 | |||
986 | case framepart_t::type_t::pp: | ||
987 | { | ||
988 | part["type"] = "pp"; | ||
989 | part["values"] = fp.choices; | ||
990 | part["preprestrs"] = fp.preprestrs; | ||
991 | |||
992 | break; | ||
993 | } | ||
994 | |||
995 | case framepart_t::type_t::v: | ||
996 | { | ||
997 | part["type"] = "v"; | ||
998 | |||
999 | break; | ||
1000 | } | ||
1001 | |||
1002 | case framepart_t::type_t::adj: | ||
1003 | { | ||
1004 | part["type"] = "adj"; | ||
1005 | |||
1006 | break; | ||
1007 | } | ||
1008 | |||
1009 | case framepart_t::type_t::adv: | ||
1010 | { | ||
1011 | part["type"] = "adv"; | ||
1012 | |||
1013 | break; | ||
1014 | } | ||
1015 | |||
1016 | case framepart_t::type_t::lex: | ||
1017 | { | ||
1018 | part["type"] = "lex"; | ||
1019 | part["value"] = fp.lexval; | ||
1020 | |||
1021 | break; | ||
1022 | } | ||
1023 | } | ||
1024 | |||
1025 | return part; | ||
1026 | }); | ||
1027 | |||
1028 | json fdata(fdatap); | ||
1029 | std::string marshall = fdata.dump(); | ||
1030 | |||
1031 | query = "INSERT INTO frames (group_id, data) VALUES (?, ?)"; | ||
1032 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1033 | { | ||
1034 | db_error(ppdb, query); | ||
1035 | } | ||
1036 | |||
1037 | sqlite3_bind_int(ppstmt, 1, gid); | ||
1038 | sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT); | ||
1039 | 289 | ||
1040 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 290 | // We only actually need to lookup verbs by sense key so we'll just |
291 | // ignore everything that isn't a verb. | ||
292 | std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$"); | ||
293 | std::smatch relation_data; | ||
294 | if (!std::regex_search(line, relation_data, relation)) | ||
1041 | { | 295 | { |
1042 | db_error(ppdb, query); | 296 | continue; |
1043 | } | 297 | } |
298 | |||
299 | int synset_id = stoi(relation_data[1]); | ||
300 | int wnum = stoi(relation_data[2]); | ||
301 | std::string sense_key = relation_data[3]; | ||
1044 | 302 | ||
1045 | sqlite3_finalize(ppstmt); | 303 | // We are treating this mapping as injective, which is not entirely |
1046 | } | 304 | // accurate. First, the WordNet table contains duplicate rows, so those |
1047 | 305 | // need to be ignored. More importantly, a small number of sense keys | |
1048 | for (auto member : mapping.second.members) | 306 | // (one for each letter of the Latin alphabet, plus 9 other words) each |
1049 | { | 307 | // map to two different words in the same synset which differ only by |
1050 | if (verbs.count(member) == 1) | 308 | // capitalization. Luckily, none of these exceptions are verbs, so we |
309 | // can pretend that the mapping is injective. | ||
310 | if (!wnSenseKeys_.count(sense_key)) | ||
1051 | { | 311 | { |
1052 | auto& v = verbs[member]; | 312 | std::pair<int, int> lookup(synset_id, wnum); |
1053 | 313 | if (wordByWnidAndWnum_.count(lookup)) | |
1054 | query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)"; | ||
1055 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1056 | { | ||
1057 | db_error(ppdb, query); | ||
1058 | } | ||
1059 | |||
1060 | sqlite3_bind_int(ppstmt, 1, v.id); | ||
1061 | sqlite3_bind_int(ppstmt, 2, gid); | ||
1062 | |||
1063 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1064 | { | 314 | { |
1065 | db_error(ppdb, query); | 315 | wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup); |
1066 | } | 316 | } |
1067 | |||
1068 | sqlite3_finalize(ppstmt); | ||
1069 | } | 317 | } |
1070 | } | 318 | } |
1071 | |||
1072 | ppgs.update(); | ||
1073 | } | 319 | } |
1074 | } | 320 | |
1075 | 321 | void generator::readVerbNet() | |
1076 | // Get nouns/adjectives/adverbs from WordNet | ||
1077 | // Useful relations: | ||
1078 | // - s: master list | ||
1079 | // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) | ||
1080 | // - at: variation (e.g. a measurement can be standard or nonstandard) | ||
1081 | // - der: derivation (e.g. happy/happily, happily/happy) | ||
1082 | // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) | ||
1083 | // - ins: instantiation (do we need this? let's see) | ||
1084 | // - mm: member meronymy/holonymy (e.g. family/mother, family/child) | ||
1085 | // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) | ||
1086 | // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) | ||
1087 | // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) | ||
1088 | // mannernymy (e.g. something done quickly is done in a manner that is quick) | ||
1089 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) | ||
1090 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) | ||
1091 | // - syntax: positioning flags for some adjectives | ||
1092 | std::string wnpref {argv[3]}; | ||
1093 | if (wnpref.back() != '/') | ||
1094 | { | ||
1095 | wnpref += '/'; | ||
1096 | } | ||
1097 | |||
1098 | // s table | ||
1099 | { | ||
1100 | std::ifstream wnsfile(wnpref + "wn_s.pl"); | ||
1101 | if (!wnsfile.is_open()) | ||
1102 | { | 322 | { |
1103 | std::cout << "Invalid WordNet data directory." << std::endl; | 323 | std::cout << "Reading frames from VerbNet..." << std::endl; |
1104 | print_usage(); | ||
1105 | } | ||
1106 | 324 | ||
1107 | std::list<std::string> lines; | 325 | DIR* dir; |
1108 | for (;;) | 326 | if ((dir = opendir(verbNetPath_.c_str())) == nullptr) |
1109 | { | ||
1110 | std::string line; | ||
1111 | if (!getline(wnsfile, line)) | ||
1112 | { | 327 | { |
1113 | break; | 328 | throw std::invalid_argument("Invalid VerbNet data directory"); |
1114 | } | 329 | } |
1115 | 330 | ||
1116 | if (line.back() == '\r') | 331 | struct dirent* ent; |
1117 | { | 332 | while ((ent = readdir(dir)) != nullptr) |
1118 | line.pop_back(); | ||
1119 | } | ||
1120 | |||
1121 | lines.push_back(line); | ||
1122 | } | ||
1123 | |||
1124 | progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); | ||
1125 | for (auto line : lines) | ||
1126 | { | ||
1127 | ppgs.update(); | ||
1128 | |||
1129 | std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$"); | ||
1130 | std::smatch relation_data; | ||
1131 | if (!std::regex_search(line, relation_data, relation)) | ||
1132 | { | 333 | { |
1133 | continue; | 334 | std::string filename(verbNetPath_); |
1134 | } | 335 | |
336 | if (filename.back() != '/') | ||
337 | { | ||
338 | filename += '/'; | ||
339 | } | ||
1135 | 340 | ||
1136 | int synset_id = stoi(relation_data[1]); | 341 | filename += ent->d_name; |
1137 | int wnum = stoi(relation_data[2]); | ||
1138 | std::string word = relation_data[3]; | ||
1139 | size_t word_it; | ||
1140 | while ((word_it = word.find("''")) != std::string::npos) | ||
1141 | { | ||
1142 | word.erase(word_it, 1); | ||
1143 | } | ||
1144 | 342 | ||
1145 | std::string query; | 343 | if (filename.rfind(".xml") != filename.size() - 4) |
1146 | switch (synset_id / 100000000) | ||
1147 | { | ||
1148 | case 1: // Noun | ||
1149 | { | 344 | { |
1150 | if (nouns.count(word) == 1) | 345 | continue; |
1151 | { | ||
1152 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)"; | ||
1153 | } else { | ||
1154 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)"; | ||
1155 | } | ||
1156 | |||
1157 | break; | ||
1158 | } | 346 | } |
1159 | 347 | ||
1160 | case 2: // Verb | 348 | xmlDocPtr doc = xmlParseFile(filename.c_str()); |
349 | if (doc == nullptr) | ||
1161 | { | 350 | { |
1162 | // Ignore | 351 | throw std::logic_error("Error opening " + filename); |
1163 | |||
1164 | break; | ||
1165 | } | 352 | } |
1166 | 353 | ||
1167 | case 3: // Adjective | 354 | xmlNodePtr top = xmlDocGetRootElement(doc); |
355 | if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS")))) | ||
1168 | { | 356 | { |
1169 | if (adjectives.count(word) == 1) | 357 | throw std::logic_error("Bad VerbNet file format: " + filename); |
1170 | { | ||
1171 | query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; | ||
1172 | } else { | ||
1173 | query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)"; | ||
1174 | } | ||
1175 | |||
1176 | break; | ||
1177 | } | 358 | } |
1178 | 359 | ||
1179 | case 4: // Adverb | 360 | try |
1180 | { | 361 | { |
1181 | if (adjectives.count(word) == 1) | 362 | createGroup(top); |
1182 | { | 363 | } catch (const std::exception& e) |
1183 | query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; | 364 | { |
1184 | } else { | 365 | std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); |
1185 | query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)"; | ||
1186 | } | ||
1187 | |||
1188 | break; | ||
1189 | } | 366 | } |
1190 | } | 367 | } |
368 | |||
369 | closedir(dir); | ||
370 | } | ||
1191 | 371 | ||
1192 | sqlite3_stmt* ppstmt; | 372 | void generator::readAgidInflections() |
1193 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | 373 | { |
374 | std::list<std::string> lines(readFile(agidPath_)); | ||
375 | progress ppgs("Reading inflections from AGID...", lines.size()); | ||
376 | |||
377 | for (std::string line : lines) | ||
1194 | { | 378 | { |
1195 | db_error(ppdb, query); | 379 | ppgs.update(); |
1196 | } | 380 | |
381 | int divider = line.find_first_of(" "); | ||
382 | std::string infinitive = line.substr(0, divider); | ||
383 | line = line.substr(divider+1); | ||
384 | char type = line[0]; | ||
1197 | 385 | ||
1198 | sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); | 386 | if (line[1] == '?') |
1199 | switch (synset_id / 100000000) | ||
1200 | { | ||
1201 | case 1: // Noun | ||
1202 | { | 387 | { |
1203 | sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { | 388 | line.erase(0, 4); |
1204 | return isupper(ch); | 389 | } else { |
1205 | }) ? 1 : 0)); | 390 | line.erase(0, 3); |
1206 | |||
1207 | sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); | ||
1208 | sqlite3_bind_int(ppstmt, 4, images[synset_id]); | ||
1209 | sqlite3_bind_int(ppstmt, 5, synset_id); | ||
1210 | |||
1211 | if (nouns.count(word) == 1) | ||
1212 | { | ||
1213 | sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT); | ||
1214 | } | ||
1215 | |||
1216 | break; | ||
1217 | } | 391 | } |
1218 | 392 | ||
1219 | case 3: // Adjective | 393 | if (!lemmaByBaseForm_.count(infinitive) && (type != 'V')) |
1220 | case 4: // Adverb | ||
1221 | { | 394 | { |
1222 | sqlite3_bind_int(ppstmt, 2, verbly::split<std::list<std::string>>(word, " ").size()); | 395 | continue; |
1223 | 396 | } | |
1224 | if (adjectives.count(word) == 1) | 397 | |
398 | lemma& curLemma = lookupOrCreateLemma(infinitive); | ||
399 | |||
400 | auto forms = split<std::vector<std::string>>(line, " | "); | ||
401 | for (std::string& inflForm : forms) | ||
402 | { | ||
403 | int sympos = inflForm.find_first_of(",?"); | ||
404 | if (sympos != std::string::npos) | ||
1225 | { | 405 | { |
1226 | sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); | 406 | inflForm = inflForm.substr(0, sympos); |
1227 | sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT); | ||
1228 | } | 407 | } |
1229 | |||
1230 | break; | ||
1231 | } | 408 | } |
1232 | } | ||
1233 | 409 | ||
1234 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 410 | switch (type) |
1235 | { | ||
1236 | db_error(ppdb, query); | ||
1237 | } | ||
1238 | |||
1239 | sqlite3_finalize(ppstmt); | ||
1240 | |||
1241 | query = "SELECT last_insert_rowid()"; | ||
1242 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1243 | { | ||
1244 | db_error(ppdb, query); | ||
1245 | } | ||
1246 | |||
1247 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
1248 | { | ||
1249 | db_error(ppdb, query); | ||
1250 | } | ||
1251 | |||
1252 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
1253 | wn[synset_id][wnum] = rowid; | ||
1254 | |||
1255 | sqlite3_finalize(ppstmt); | ||
1256 | |||
1257 | std::string canonical(word); | ||
1258 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
1259 | if (pronunciations.count(canonical) == 1) | ||
1260 | { | ||
1261 | for (auto pronunciation : pronunciations[canonical]) | ||
1262 | { | 411 | { |
1263 | switch (synset_id / 100000000) | 412 | case 'V': |
1264 | { | 413 | { |
1265 | case 1: // Noun | 414 | if (forms.size() == 4) |
1266 | { | 415 | { |
1267 | if (!pronunciation.rhyme.empty()) | 416 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
1268 | { | 417 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1])); |
1269 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | 418 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2])); |
1270 | } else { | 419 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3])); |
1271 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | 420 | } else if (forms.size() == 3) |
1272 | } | ||
1273 | |||
1274 | break; | ||
1275 | } | ||
1276 | |||
1277 | case 3: // Adjective | ||
1278 | { | 421 | { |
1279 | if (!pronunciation.rhyme.empty()) | 422 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
1280 | { | 423 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0])); |
1281 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | 424 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1])); |
1282 | } else { | 425 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2])); |
1283 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | 426 | } else if (forms.size() == 8) |
1284 | } | 427 | { |
1285 | 428 | // As of AGID 2014.08.11, this is only "to be" | |
1286 | break; | 429 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
430 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2])); | ||
431 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3])); | ||
432 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4])); | ||
433 | } else { | ||
434 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
435 | // - may and shall do not conjugate the way we want them to | ||
436 | // - methinks only has a past tense and is an outlier | ||
437 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
438 | std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | ||
1287 | } | 439 | } |
1288 | 440 | ||
1289 | case 4: // Adverb | 441 | // For verbs in particular, we sometimes create a notion and a word |
442 | // from inflection data. Specifically, if there are not yet any | ||
443 | // verbs existing that have the same infinitive form. "Yet" means | ||
444 | // that this verb appears in the AGID data but not in either WordNet | ||
445 | // or VerbNet. | ||
446 | if (!wordsByBaseForm_.count(infinitive) | ||
447 | || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) { | ||
448 | return w->getNotion().getPartOfSpeech() == part_of_speech::verb; | ||
449 | })) | ||
1290 | { | 450 | { |
1291 | if (!pronunciation.rhyme.empty()) | 451 | notion& n = createNotion(part_of_speech::verb); |
1292 | { | 452 | createWord(n, curLemma); |
1293 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | ||
1294 | } else { | ||
1295 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | ||
1296 | } | ||
1297 | |||
1298 | break; | ||
1299 | } | 453 | } |
1300 | } | ||
1301 | |||
1302 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1303 | { | ||
1304 | db_error(ppdb, query); | ||
1305 | } | ||
1306 | |||
1307 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
1308 | sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); | ||
1309 | sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); | ||
1310 | sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); | ||
1311 | |||
1312 | if (!pronunciation.rhyme.empty()) | ||
1313 | { | ||
1314 | sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); | ||
1315 | sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); | ||
1316 | } | ||
1317 | 454 | ||
1318 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 455 | break; |
1319 | { | ||
1320 | db_error(ppdb, query); | ||
1321 | } | 456 | } |
1322 | |||
1323 | sqlite3_finalize(ppstmt); | ||
1324 | } | ||
1325 | } | ||
1326 | } | ||
1327 | } | ||
1328 | |||
1329 | // While we're working on s | ||
1330 | { | ||
1331 | progress ppgs("Writing word synonyms...", wn.size()); | ||
1332 | for (auto sense : wn) | ||
1333 | { | ||
1334 | ppgs.update(); | ||
1335 | 457 | ||
1336 | for (auto word1 : sense.second) | 458 | case 'A': |
1337 | { | ||
1338 | for (auto word2 : sense.second) | ||
1339 | { | ||
1340 | if (word1 != word2) | ||
1341 | { | 459 | { |
1342 | std::string query; | 460 | if (forms.size() == 2) |
1343 | switch (sense.first / 100000000) | ||
1344 | { | 461 | { |
1345 | case 1: // Noun | 462 | curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0])); |
1346 | { | 463 | curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1])); |
1347 | query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | 464 | } else { |
1348 | 465 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | |
1349 | break; | 466 | std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; |
1350 | } | 467 | } |
1351 | |||
1352 | case 2: // Verb | ||
1353 | { | ||
1354 | // Ignore | ||
1355 | |||
1356 | break; | ||
1357 | } | ||
1358 | |||
1359 | case 3: // Adjective | ||
1360 | { | ||
1361 | query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
1362 | 468 | ||
1363 | break; | 469 | break; |
1364 | } | 470 | } |
1365 | 471 | ||
1366 | case 4: // Adverb | 472 | case 'N': |
1367 | { | 473 | { |
1368 | query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | 474 | if (forms.size() == 1) |
1369 | |||
1370 | break; | ||
1371 | } | ||
1372 | } | ||
1373 | |||
1374 | sqlite3_stmt* ppstmt; | ||
1375 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1376 | { | ||
1377 | db_error(ppdb, query); | ||
1378 | } | ||
1379 | |||
1380 | sqlite3_bind_int(ppstmt, 1, word1.second); | ||
1381 | sqlite3_bind_int(ppstmt, 2, word2.second); | ||
1382 | |||
1383 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1384 | { | 475 | { |
1385 | db_error(ppdb, query); | 476 | curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0])); |
477 | } else { | ||
478 | // As of AGID 2014.08.11, this is non-existent. | ||
479 | std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | ||
1386 | } | 480 | } |
1387 | 481 | ||
1388 | sqlite3_finalize(ppstmt); | 482 | break; |
1389 | } | 483 | } |
1390 | } | 484 | } |
1391 | } | 485 | } |
1392 | } | 486 | } |
1393 | } | ||
1394 | |||
1395 | // ant table | ||
1396 | { | ||
1397 | std::ifstream wnantfile(wnpref + "wn_ant.pl"); | ||
1398 | if (!wnantfile.is_open()) | ||
1399 | { | ||
1400 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1401 | print_usage(); | ||
1402 | } | ||
1403 | |||
1404 | std::list<std::string> lines; | ||
1405 | for (;;) | ||
1406 | { | ||
1407 | std::string line; | ||
1408 | if (!getline(wnantfile, line)) | ||
1409 | { | ||
1410 | break; | ||
1411 | } | ||
1412 | 487 | ||
1413 | if (line.back() == '\r') | 488 | void generator::readPrepositions() |
1414 | { | ||
1415 | line.pop_back(); | ||
1416 | } | ||
1417 | |||
1418 | lines.push_back(line); | ||
1419 | } | ||
1420 | |||
1421 | progress ppgs("Writing antonyms...", lines.size()); | ||
1422 | for (auto line : lines) | ||
1423 | { | 489 | { |
1424 | ppgs.update(); | 490 | std::list<std::string> lines(readFile("prepositions.txt")); |
491 | progress ppgs("Reading prepositions...", lines.size()); | ||
1425 | 492 | ||
1426 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | 493 | for (std::string line : lines) |
1427 | std::smatch relation_data; | ||
1428 | if (!std::regex_search(line, relation_data, relation)) | ||
1429 | { | ||
1430 | continue; | ||
1431 | } | ||
1432 | |||
1433 | int synset_id_1 = stoi(relation_data[1]); | ||
1434 | int wnum_1 = stoi(relation_data[2]); | ||
1435 | int synset_id_2 = stoi(relation_data[3]); | ||
1436 | int wnum_2 = stoi(relation_data[4]); | ||
1437 | |||
1438 | std::string query; | ||
1439 | switch (synset_id_1 / 100000000) | ||
1440 | { | 494 | { |
1441 | case 1: // Noun | 495 | ppgs.update(); |
1442 | { | ||
1443 | query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
1444 | 496 | ||
1445 | break; | 497 | std::regex relation("^([^:]+): (.+)"); |
1446 | } | 498 | std::smatch relation_data; |
1447 | 499 | std::regex_search(line, relation_data, relation); | |
1448 | case 2: // Verb | 500 | std::string prep = relation_data[1]; |
1449 | { | 501 | auto groups = split<std::list<std::string>>(relation_data[2], ", "); |
1450 | // Ignore | ||
1451 | 502 | ||
1452 | break; | 503 | notion& n = createNotion(part_of_speech::preposition); |
1453 | } | 504 | lemma& l = lookupOrCreateLemma(prep); |
1454 | 505 | word& w = createWord(n, l); | |
1455 | case 3: // Adjective | ||
1456 | { | ||
1457 | query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
1458 | 506 | ||
1459 | break; | 507 | n.setPrepositionGroups(groups); |
1460 | } | ||
1461 | |||
1462 | case 4: // Adverb | ||
1463 | { | ||
1464 | query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
1465 | |||
1466 | break; | ||
1467 | } | ||
1468 | } | ||
1469 | |||
1470 | sqlite3_stmt* ppstmt; | ||
1471 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1472 | { | ||
1473 | db_error(ppdb, query); | ||
1474 | } | ||
1475 | |||
1476 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
1477 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
1478 | |||
1479 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1480 | { | ||
1481 | db_error(ppdb, query); | ||
1482 | } | ||
1483 | |||
1484 | sqlite3_finalize(ppstmt); | ||
1485 | } | ||
1486 | } | ||
1487 | |||
1488 | // at table | ||
1489 | { | ||
1490 | std::ifstream wnatfile(wnpref + "wn_at.pl"); | ||
1491 | if (!wnatfile.is_open()) | ||
1492 | { | ||
1493 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1494 | print_usage(); | ||
1495 | } | ||
1496 | |||
1497 | std::list<std::string> lines; | ||
1498 | for (;;) | ||
1499 | { | ||
1500 | std::string line; | ||
1501 | if (!getline(wnatfile, line)) | ||
1502 | { | ||
1503 | break; | ||
1504 | } | 508 | } |
1505 | |||
1506 | if (line.back() == '\r') | ||
1507 | { | ||
1508 | line.pop_back(); | ||
1509 | } | ||
1510 | |||
1511 | lines.push_back(line); | ||
1512 | } | 509 | } |
1513 | 510 | ||
1514 | progress ppgs("Writing variations...", lines.size()); | 511 | void generator::readCmudictPronunciations() |
1515 | for (auto line : lines) | ||
1516 | { | 512 | { |
1517 | ppgs.update(); | 513 | std::list<std::string> lines(readFile(cmudictPath_)); |
514 | progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); | ||
1518 | 515 | ||
1519 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); | 516 | for (std::string line : lines) |
1520 | std::smatch relation_data; | ||
1521 | if (!std::regex_search(line, relation_data, relation)) | ||
1522 | { | 517 | { |
1523 | continue; | 518 | ppgs.update(); |
1524 | } | 519 | |
1525 | 520 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | |
1526 | int synset_id_1 = stoi(relation_data[1]); | 521 | std::smatch phoneme_data; |
1527 | int synset_id_2 = stoi(relation_data[2]); | 522 | if (std::regex_search(line, phoneme_data, phoneme)) |
1528 | std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); | ||
1529 | |||
1530 | for (auto mapping1 : wn[synset_id_1]) | ||
1531 | { | ||
1532 | for (auto mapping2 : wn[synset_id_2]) | ||
1533 | { | 523 | { |
1534 | sqlite3_stmt* ppstmt; | 524 | std::string canonical(phoneme_data[1]); |
1535 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 525 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); |
1536 | { | ||
1537 | db_error(ppdb, query); | ||
1538 | } | ||
1539 | |||
1540 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
1541 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1542 | 526 | ||
1543 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 527 | if (!formByText_.count(canonical)) |
1544 | { | 528 | { |
1545 | db_error(ppdb, query); | 529 | continue; |
1546 | } | 530 | } |
1547 | 531 | ||
1548 | sqlite3_finalize(ppstmt); | 532 | std::string phonemes = phoneme_data[2]; |
533 | pronunciations_.emplace_back(phonemes); | ||
534 | pronunciation& p = pronunciations_.back(); | ||
535 | formByText_.at(canonical)->addPronunciation(p); | ||
1549 | } | 536 | } |
1550 | } | 537 | } |
1551 | } | 538 | } |
1552 | } | ||
1553 | |||
1554 | // der table | ||
1555 | { | ||
1556 | std::ifstream wnderfile(wnpref + "wn_der.pl"); | ||
1557 | if (!wnderfile.is_open()) | ||
1558 | { | ||
1559 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1560 | print_usage(); | ||
1561 | } | ||
1562 | 539 | ||
1563 | std::list<std::string> lines; | 540 | void generator::writeSchema() |
1564 | for (;;) | ||
1565 | { | 541 | { |
1566 | std::string line; | 542 | std::ifstream file("schema.sql"); |
1567 | if (!getline(wnderfile, line)) | 543 | if (!file) |
1568 | { | 544 | { |
1569 | break; | 545 | throw std::invalid_argument("Could not find database schema"); |
1570 | } | 546 | } |
1571 | 547 | ||
1572 | if (line.back() == '\r') | 548 | std::ostringstream schemaBuilder; |
549 | std::string line; | ||
550 | while (std::getline(file, line)) | ||
1573 | { | 551 | { |
1574 | line.pop_back(); | 552 | if (line.back() == '\r') |
553 | { | ||
554 | line.pop_back(); | ||
555 | } | ||
556 | |||
557 | schemaBuilder << line; | ||
1575 | } | 558 | } |
1576 | 559 | ||
1577 | lines.push_back(line); | 560 | std::string schema = schemaBuilder.str(); |
561 | auto queries = split<std::list<std::string>>(schema, ";"); | ||
562 | progress ppgs("Writing database schema...", queries.size()); | ||
563 | for (std::string query : queries) | ||
564 | { | ||
565 | if (!queries.empty()) | ||
566 | { | ||
567 | db_.runQuery(query); | ||
568 | } | ||
569 | |||
570 | ppgs.update(); | ||
571 | } | ||
1578 | } | 572 | } |
1579 | 573 | ||
1580 | progress ppgs("Writing morphological derivation...", lines.size()); | 574 | void generator::dumpObjects() |
1581 | for (auto line : lines) | ||
1582 | { | 575 | { |
1583 | ppgs.update(); | ||
1584 | |||
1585 | std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | ||
1586 | std::smatch relation_data; | ||
1587 | if (!std::regex_search(line, relation_data, relation)) | ||
1588 | { | 576 | { |
1589 | continue; | 577 | progress ppgs("Writing notions...", notions_.size()); |
578 | |||
579 | for (notion& n : notions_) | ||
580 | { | ||
581 | db_ << n; | ||
582 | |||
583 | ppgs.update(); | ||
584 | } | ||
1590 | } | 585 | } |
1591 | 586 | ||
1592 | int synset_id_1 = stoi(relation_data[1]); | ||
1593 | int wnum_1 = stoi(relation_data[2]); | ||
1594 | int synset_id_2 = stoi(relation_data[3]); | ||
1595 | int wnum_2 = stoi(relation_data[4]); | ||
1596 | std::string query; | ||
1597 | switch (synset_id_1 / 100000000) | ||
1598 | { | 587 | { |
1599 | case 1: // Noun | 588 | progress ppgs("Writing words...", words_.size()); |
589 | |||
590 | for (word& w : words_) | ||
1600 | { | 591 | { |
1601 | switch (synset_id_2 / 100000000) | 592 | db_ << w; |
1602 | { | ||
1603 | case 1: // Noun | ||
1604 | { | ||
1605 | query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
1606 | break; | ||
1607 | } | ||
1608 | |||
1609 | case 3: // Adjective | ||
1610 | { | ||
1611 | query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)"; | ||
1612 | break; | ||
1613 | } | ||
1614 | |||
1615 | case 4: // Adverb | ||
1616 | { | ||
1617 | query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)"; | ||
1618 | break; | ||
1619 | } | ||
1620 | } | ||
1621 | 593 | ||
1622 | break; | 594 | ppgs.update(); |
1623 | } | 595 | } |
596 | } | ||
597 | |||
598 | { | ||
599 | progress ppgs("Writing lemmas...", lemmas_.size()); | ||
1624 | 600 | ||
1625 | case 3: // Adjective | 601 | for (lemma& l : lemmas_) |
1626 | { | 602 | { |
1627 | switch (synset_id_2 / 100000000) | 603 | db_ << l; |
1628 | { | ||
1629 | case 1: // Noun | ||
1630 | { | ||
1631 | query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)"; | ||
1632 | break; | ||
1633 | } | ||
1634 | |||
1635 | case 3: // Adjective | ||
1636 | { | ||
1637 | query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)"; | ||
1638 | break; | ||
1639 | } | ||
1640 | |||
1641 | case 4: // Adverb | ||
1642 | { | ||
1643 | query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)"; | ||
1644 | break; | ||
1645 | } | ||
1646 | } | ||
1647 | 604 | ||
1648 | break; | 605 | ppgs.update(); |
1649 | } | 606 | } |
607 | } | ||
608 | |||
609 | { | ||
610 | progress ppgs("Writing forms...", forms_.size()); | ||
1650 | 611 | ||
1651 | case 4: // Adverb | 612 | for (form& f : forms_) |
1652 | { | 613 | { |
1653 | switch (synset_id_2 / 100000000) | 614 | db_ << f; |
1654 | { | ||
1655 | case 1: // Noun | ||
1656 | { | ||
1657 | query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)"; | ||
1658 | break; | ||
1659 | } | ||
1660 | |||
1661 | case 3: // Adjective | ||
1662 | { | ||
1663 | query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)"; | ||
1664 | break; | ||
1665 | } | ||
1666 | |||
1667 | case 4: // Adverb | ||
1668 | { | ||
1669 | query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
1670 | break; | ||
1671 | } | ||
1672 | } | ||
1673 | 615 | ||
1674 | break; | 616 | ppgs.update(); |
1675 | } | 617 | } |
1676 | } | 618 | } |
1677 | 619 | ||
1678 | sqlite3_stmt* ppstmt; | ||
1679 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1680 | { | 620 | { |
1681 | db_error(ppdb, query); | 621 | progress ppgs("Writing pronunciations...", pronunciations_.size()); |
622 | |||
623 | for (pronunciation& p : pronunciations_) | ||
624 | { | ||
625 | db_ << p; | ||
626 | |||
627 | ppgs.update(); | ||
628 | } | ||
1682 | } | 629 | } |
1683 | 630 | ||
1684 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
1685 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
1686 | |||
1687 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1688 | { | 631 | { |
1689 | db_error(ppdb, query); | 632 | progress ppgs("Writing verb groups...", groups_.size()); |
633 | |||
634 | for (group& g : groups_) | ||
635 | { | ||
636 | db_ << g; | ||
637 | |||
638 | ppgs.update(); | ||
639 | } | ||
1690 | } | 640 | } |
1691 | 641 | ||
1692 | sqlite3_finalize(ppstmt); | ||
1693 | } | ||
1694 | } | ||
1695 | |||
1696 | // hyp table | ||
1697 | { | ||
1698 | std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); | ||
1699 | if (!wnhypfile.is_open()) | ||
1700 | { | ||
1701 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1702 | print_usage(); | ||
1703 | } | ||
1704 | |||
1705 | std::list<std::string> lines; | ||
1706 | for (;;) | ||
1707 | { | ||
1708 | std::string line; | ||
1709 | if (!getline(wnhypfile, line)) | ||
1710 | { | ||
1711 | break; | ||
1712 | } | ||
1713 | |||
1714 | if (line.back() == '\r') | ||
1715 | { | 642 | { |
1716 | line.pop_back(); | 643 | progress ppgs("Writing verb frames...", frames_.size()); |
644 | |||
645 | for (frame& f : frames_) | ||
646 | { | ||
647 | db_ << f; | ||
648 | |||
649 | ppgs.update(); | ||
650 | } | ||
1717 | } | 651 | } |
1718 | |||
1719 | lines.push_back(line); | ||
1720 | } | 652 | } |
1721 | 653 | ||
1722 | progress ppgs("Writing hypernyms...", lines.size()); | 654 | void generator::readWordNetAntonymy() |
1723 | for (auto line : lines) | ||
1724 | { | 655 | { |
1725 | ppgs.update(); | 656 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); |
1726 | 657 | progress ppgs("Writing antonyms...", lines.size()); | |
1727 | std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); | 658 | for (auto line : lines) |
1728 | std::smatch relation_data; | ||
1729 | if (!std::regex_search(line, relation_data, relation)) | ||
1730 | { | 659 | { |
1731 | continue; | 660 | ppgs.update(); |
1732 | } | ||
1733 | |||
1734 | int synset_id_1 = stoi(relation_data[1]); | ||
1735 | int synset_id_2 = stoi(relation_data[2]); | ||
1736 | std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); | ||
1737 | 661 | ||
1738 | for (auto mapping1 : wn[synset_id_1]) | 662 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); |
1739 | { | 663 | std::smatch relation_data; |
1740 | for (auto mapping2 : wn[synset_id_2]) | 664 | if (!std::regex_search(line, relation_data, relation)) |
1741 | { | 665 | { |
1742 | sqlite3_stmt* ppstmt; | 666 | continue; |
1743 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 667 | } |
1744 | { | 668 | |
1745 | db_error(ppdb, query); | 669 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); |
1746 | } | 670 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); |
1747 | 671 | ||
1748 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 672 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) |
1749 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 673 | { |
674 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
675 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
1750 | 676 | ||
1751 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 677 | std::list<field> fields; |
1752 | { | 678 | fields.emplace_back("antonym_1_id", word1.getId()); |
1753 | db_error(ppdb, query); | 679 | fields.emplace_back("antonym_2_id", word2.getId()); |
1754 | } | ||
1755 | 680 | ||
1756 | sqlite3_finalize(ppstmt); | 681 | db_.insertIntoTable("antonymy", std::move(fields)); |
1757 | } | 682 | } |
1758 | } | 683 | } |
1759 | } | 684 | } |
1760 | } | ||
1761 | |||
1762 | // ins table | ||
1763 | { | ||
1764 | std::ifstream wninsfile(wnpref + "wn_ins.pl"); | ||
1765 | if (!wninsfile.is_open()) | ||
1766 | { | ||
1767 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1768 | print_usage(); | ||
1769 | } | ||
1770 | |||
1771 | std::list<std::string> lines; | ||
1772 | for (;;) | ||
1773 | { | ||
1774 | std::string line; | ||
1775 | if (!getline(wninsfile, line)) | ||
1776 | { | ||
1777 | break; | ||
1778 | } | ||
1779 | 685 | ||
1780 | if (line.back() == '\r') | 686 | void generator::readWordNetVariation() |
687 | { | ||
688 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl")); | ||
689 | progress ppgs("Writing variation...", lines.size()); | ||
690 | for (auto line : lines) | ||
1781 | { | 691 | { |
1782 | line.pop_back(); | 692 | ppgs.update(); |
1783 | } | ||
1784 | 693 | ||
1785 | lines.push_back(line); | 694 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); |
695 | std::smatch relation_data; | ||
696 | if (!std::regex_search(line, relation_data, relation)) | ||
697 | { | ||
698 | continue; | ||
699 | } | ||
700 | |||
701 | int lookup1 = std::stoi(relation_data[1]); | ||
702 | int lookup2 = std::stoi(relation_data[2]); | ||
703 | |||
704 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
705 | { | ||
706 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
707 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
708 | |||
709 | std::list<field> fields; | ||
710 | fields.emplace_back("noun_id", notion1.getId()); | ||
711 | fields.emplace_back("adjective_id", notion2.getId()); | ||
712 | |||
713 | db_.insertIntoTable("variation", std::move(fields)); | ||
714 | } | ||
715 | } | ||
1786 | } | 716 | } |
1787 | 717 | ||
1788 | progress ppgs("Writing instantiations...", lines.size()); | 718 | void generator::readWordNetClasses() |
1789 | for (auto line : lines) | ||
1790 | { | 719 | { |
1791 | ppgs.update(); | 720 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); |
1792 | 721 | progress ppgs("Writing usage, topicality, and regionality...", lines.size()); | |
1793 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); | 722 | for (auto line : lines) |
1794 | std::smatch relation_data; | ||
1795 | if (!std::regex_search(line, relation_data, relation)) | ||
1796 | { | 723 | { |
1797 | continue; | 724 | ppgs.update(); |
1798 | } | ||
1799 | |||
1800 | int synset_id_1 = stoi(relation_data[1]); | ||
1801 | int synset_id_2 = stoi(relation_data[2]); | ||
1802 | std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); | ||
1803 | 725 | ||
1804 | for (auto mapping1 : wn[synset_id_1]) | 726 | std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); |
1805 | { | 727 | std::smatch relation_data; |
1806 | for (auto mapping2 : wn[synset_id_2]) | 728 | if (!std::regex_search(line, relation_data, relation)) |
729 | { | ||
730 | continue; | ||
731 | } | ||
732 | |||
733 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | ||
734 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | ||
735 | std::string class_type = relation_data[5]; | ||
736 | |||
737 | std::string table_name; | ||
738 | if (class_type == "t") | ||
739 | { | ||
740 | table_name += "topicality"; | ||
741 | } else if (class_type == "u") | ||
742 | { | ||
743 | table_name += "usage"; | ||
744 | } else if (class_type == "r") | ||
745 | { | ||
746 | table_name += "regionality"; | ||
747 | } | ||
748 | |||
749 | std::list<int> leftJoin; | ||
750 | std::list<int> rightJoin; | ||
751 | |||
752 | if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) | ||
1807 | { | 753 | { |
1808 | sqlite3_stmt* ppstmt; | 754 | std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { |
1809 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 755 | return w->getId(); |
756 | }); | ||
757 | } else if (wordByWnidAndWnum_.count(lookup1)) { | ||
758 | leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); | ||
759 | } | ||
760 | |||
761 | if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) | ||
762 | { | ||
763 | std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { | ||
764 | return w->getId(); | ||
765 | }); | ||
766 | } else if (wordByWnidAndWnum_.count(lookup2)) { | ||
767 | rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); | ||
768 | } | ||
769 | |||
770 | for (int word1 : leftJoin) | ||
771 | { | ||
772 | for (int word2 : rightJoin) | ||
1810 | { | 773 | { |
1811 | db_error(ppdb, query); | 774 | std::list<field> fields; |
1812 | } | 775 | fields.emplace_back("term_id", word1); |
776 | fields.emplace_back("domain_id", word2); | ||
1813 | 777 | ||
1814 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 778 | db_.insertIntoTable(table_name, std::move(fields)); |
1815 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1816 | |||
1817 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1818 | { | ||
1819 | db_error(ppdb, query); | ||
1820 | } | 779 | } |
1821 | |||
1822 | sqlite3_finalize(ppstmt); | ||
1823 | } | 780 | } |
1824 | } | 781 | } |
1825 | } | 782 | } |
1826 | } | ||
1827 | |||
1828 | // mm table | ||
1829 | { | ||
1830 | std::ifstream wnmmfile(wnpref + "wn_mm.pl"); | ||
1831 | if (!wnmmfile.is_open()) | ||
1832 | { | ||
1833 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1834 | print_usage(); | ||
1835 | } | ||
1836 | |||
1837 | std::list<std::string> lines; | ||
1838 | for (;;) | ||
1839 | { | ||
1840 | std::string line; | ||
1841 | if (!getline(wnmmfile, line)) | ||
1842 | { | ||
1843 | break; | ||
1844 | } | ||
1845 | 783 | ||
1846 | if (line.back() == '\r') | 784 | void generator::readWordNetCausality() |
785 | { | ||
786 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl")); | ||
787 | progress ppgs("Writing causality...", lines.size()); | ||
788 | for (auto line : lines) | ||
1847 | { | 789 | { |
1848 | line.pop_back(); | 790 | ppgs.update(); |
1849 | } | ||
1850 | 791 | ||
1851 | lines.push_back(line); | 792 | std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\."); |
793 | std::smatch relation_data; | ||
794 | if (!std::regex_search(line, relation_data, relation)) | ||
795 | { | ||
796 | continue; | ||
797 | } | ||
798 | |||
799 | int lookup1 = std::stoi(relation_data[1]); | ||
800 | int lookup2 = std::stoi(relation_data[2]); | ||
801 | |||
802 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
803 | { | ||
804 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
805 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
806 | |||
807 | std::list<field> fields; | ||
808 | fields.emplace_back("effect_id", notion1.getId()); | ||
809 | fields.emplace_back("cause_id", notion2.getId()); | ||
810 | |||
811 | db_.insertIntoTable("causality", std::move(fields)); | ||
812 | } | ||
813 | } | ||
1852 | } | 814 | } |
1853 | 815 | ||
1854 | progress ppgs("Writing member meronyms...", lines.size()); | 816 | void generator::readWordNetEntailment() |
1855 | for (auto line : lines) | ||
1856 | { | 817 | { |
1857 | ppgs.update(); | 818 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl")); |
1858 | 819 | progress ppgs("Writing entailment...", lines.size()); | |
1859 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); | 820 | for (auto line : lines) |
1860 | std::smatch relation_data; | ||
1861 | if (!std::regex_search(line, relation_data, relation)) | ||
1862 | { | 821 | { |
1863 | continue; | 822 | ppgs.update(); |
1864 | } | ||
1865 | 823 | ||
1866 | int synset_id_1 = stoi(relation_data[1]); | 824 | std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\."); |
1867 | int synset_id_2 = stoi(relation_data[2]); | 825 | std::smatch relation_data; |
1868 | std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | 826 | if (!std::regex_search(line, relation_data, relation)) |
1869 | |||
1870 | for (auto mapping1 : wn[synset_id_1]) | ||
1871 | { | ||
1872 | for (auto mapping2 : wn[synset_id_2]) | ||
1873 | { | 827 | { |
1874 | sqlite3_stmt* ppstmt; | 828 | continue; |
1875 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 829 | } |
1876 | { | 830 | |
1877 | db_error(ppdb, query); | 831 | int lookup1 = std::stoi(relation_data[1]); |
1878 | } | 832 | int lookup2 = std::stoi(relation_data[2]); |
1879 | 833 | ||
1880 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 834 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) |
1881 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 835 | { |
836 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
837 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
1882 | 838 | ||
1883 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 839 | std::list<field> fields; |
1884 | { | 840 | fields.emplace_back("given_id", notion1.getId()); |
1885 | db_error(ppdb, query); | 841 | fields.emplace_back("entailment_id", notion2.getId()); |
1886 | } | ||
1887 | 842 | ||
1888 | sqlite3_finalize(ppstmt); | 843 | db_.insertIntoTable("entailment", std::move(fields)); |
1889 | } | 844 | } |
1890 | } | 845 | } |
1891 | } | 846 | } |
1892 | } | 847 | |
1893 | 848 | void generator::readWordNetHypernymy() | |
1894 | // ms table | ||
1895 | { | ||
1896 | std::ifstream wnmsfile(wnpref + "wn_ms.pl"); | ||
1897 | if (!wnmsfile.is_open()) | ||
1898 | { | ||
1899 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1900 | print_usage(); | ||
1901 | } | ||
1902 | |||
1903 | std::list<std::string> lines; | ||
1904 | for (;;) | ||
1905 | { | 849 | { |
1906 | std::string line; | 850 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl")); |
1907 | if (!getline(wnmsfile, line)) | 851 | progress ppgs("Writing hypernymy...", lines.size()); |
852 | for (auto line : lines) | ||
1908 | { | 853 | { |
1909 | break; | 854 | ppgs.update(); |
855 | |||
856 | std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\."); | ||
857 | std::smatch relation_data; | ||
858 | if (!std::regex_search(line, relation_data, relation)) | ||
859 | { | ||
860 | continue; | ||
861 | } | ||
862 | |||
863 | int lookup1 = std::stoi(relation_data[1]); | ||
864 | int lookup2 = std::stoi(relation_data[2]); | ||
865 | |||
866 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
867 | { | ||
868 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
869 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
870 | |||
871 | std::list<field> fields; | ||
872 | fields.emplace_back("hyponym_id", notion1.getId()); | ||
873 | fields.emplace_back("hypernym_id", notion2.getId()); | ||
874 | |||
875 | db_.insertIntoTable("hypernymy", std::move(fields)); | ||
876 | } | ||
1910 | } | 877 | } |
878 | } | ||
1911 | 879 | ||
1912 | if (line.back() == '\r') | 880 | void generator::readWordNetInstantiation() |
881 | { | ||
882 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl")); | ||
883 | progress ppgs("Writing instantiation...", lines.size()); | ||
884 | for (auto line : lines) | ||
1913 | { | 885 | { |
1914 | line.pop_back(); | 886 | ppgs.update(); |
1915 | } | ||
1916 | 887 | ||
1917 | lines.push_back(line); | 888 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); |
889 | std::smatch relation_data; | ||
890 | if (!std::regex_search(line, relation_data, relation)) | ||
891 | { | ||
892 | continue; | ||
893 | } | ||
894 | |||
895 | int lookup1 = std::stoi(relation_data[1]); | ||
896 | int lookup2 = std::stoi(relation_data[2]); | ||
897 | |||
898 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
899 | { | ||
900 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
901 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
902 | |||
903 | std::list<field> fields; | ||
904 | fields.emplace_back("instance_id", notion1.getId()); | ||
905 | fields.emplace_back("class_id", notion2.getId()); | ||
906 | |||
907 | db_.insertIntoTable("instantiation", std::move(fields)); | ||
908 | } | ||
909 | } | ||
1918 | } | 910 | } |
1919 | 911 | ||
1920 | progress ppgs("Writing substance meronyms...", lines.size()); | 912 | void generator::readWordNetMemberMeronymy() |
1921 | for (auto line : lines) | ||
1922 | { | 913 | { |
1923 | ppgs.update(); | 914 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl")); |
1924 | 915 | progress ppgs("Writing member meronymy...", lines.size()); | |
1925 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); | 916 | for (auto line : lines) |
1926 | std::smatch relation_data; | ||
1927 | if (!std::regex_search(line, relation_data, relation)) | ||
1928 | { | 917 | { |
1929 | continue; | 918 | ppgs.update(); |
1930 | } | ||
1931 | |||
1932 | int synset_id_1 = stoi(relation_data[1]); | ||
1933 | int synset_id_2 = stoi(relation_data[2]); | ||
1934 | std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
1935 | 919 | ||
1936 | for (auto mapping1 : wn[synset_id_1]) | 920 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); |
1937 | { | 921 | std::smatch relation_data; |
1938 | for (auto mapping2 : wn[synset_id_2]) | 922 | if (!std::regex_search(line, relation_data, relation)) |
1939 | { | 923 | { |
1940 | sqlite3_stmt* ppstmt; | 924 | continue; |
1941 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 925 | } |
1942 | { | 926 | |
1943 | db_error(ppdb, query); | 927 | int lookup1 = std::stoi(relation_data[1]); |
1944 | } | 928 | int lookup2 = std::stoi(relation_data[2]); |
929 | |||
930 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
931 | { | ||
932 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
933 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
1945 | 934 | ||
1946 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 935 | std::list<field> fields; |
1947 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 936 | fields.emplace_back("holonym_id", notion1.getId()); |
937 | fields.emplace_back("meronym_id", notion2.getId()); | ||
1948 | 938 | ||
1949 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 939 | db_.insertIntoTable("member_meronymy", std::move(fields)); |
1950 | { | ||
1951 | db_error(ppdb, query); | ||
1952 | } | ||
1953 | |||
1954 | sqlite3_finalize(ppstmt); | ||
1955 | } | 940 | } |
1956 | } | 941 | } |
1957 | } | 942 | } |
1958 | } | 943 | |
1959 | 944 | void generator::readWordNetPartMeronymy() | |
1960 | // mm table | ||
1961 | { | ||
1962 | std::ifstream wnmpfile(wnpref + "wn_mp.pl"); | ||
1963 | if (!wnmpfile.is_open()) | ||
1964 | { | ||
1965 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1966 | print_usage(); | ||
1967 | } | ||
1968 | |||
1969 | std::list<std::string> lines; | ||
1970 | for (;;) | ||
1971 | { | 945 | { |
1972 | std::string line; | 946 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl")); |
1973 | if (!getline(wnmpfile, line)) | 947 | progress ppgs("Writing part meronymy...", lines.size()); |
948 | for (auto line : lines) | ||
1974 | { | 949 | { |
1975 | break; | 950 | ppgs.update(); |
951 | |||
952 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | ||
953 | std::smatch relation_data; | ||
954 | if (!std::regex_search(line, relation_data, relation)) | ||
955 | { | ||
956 | continue; | ||
957 | } | ||
958 | |||
959 | int lookup1 = std::stoi(relation_data[1]); | ||
960 | int lookup2 = std::stoi(relation_data[2]); | ||
961 | |||
962 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
963 | { | ||
964 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
965 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
966 | |||
967 | std::list<field> fields; | ||
968 | fields.emplace_back("holonym_id", notion1.getId()); | ||
969 | fields.emplace_back("meronym_id", notion2.getId()); | ||
970 | |||
971 | db_.insertIntoTable("part_meronymy", std::move(fields)); | ||
972 | } | ||
1976 | } | 973 | } |
974 | } | ||
1977 | 975 | ||
1978 | if (line.back() == '\r') | 976 | void generator::readWordNetSubstanceMeronymy() |
977 | { | ||
978 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl")); | ||
979 | progress ppgs("Writing substance meronymy...", lines.size()); | ||
980 | for (auto line : lines) | ||
1979 | { | 981 | { |
1980 | line.pop_back(); | 982 | ppgs.update(); |
1981 | } | ||
1982 | 983 | ||
1983 | lines.push_back(line); | 984 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); |
985 | std::smatch relation_data; | ||
986 | if (!std::regex_search(line, relation_data, relation)) | ||
987 | { | ||
988 | continue; | ||
989 | } | ||
990 | |||
991 | int lookup1 = std::stoi(relation_data[1]); | ||
992 | int lookup2 = std::stoi(relation_data[2]); | ||
993 | |||
994 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
995 | { | ||
996 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
997 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
998 | |||
999 | std::list<field> fields; | ||
1000 | fields.emplace_back("holonym_id", notion1.getId()); | ||
1001 | fields.emplace_back("meronym_id", notion2.getId()); | ||
1002 | |||
1003 | db_.insertIntoTable("substance_meronymy", std::move(fields)); | ||
1004 | } | ||
1005 | } | ||
1984 | } | 1006 | } |
1985 | 1007 | ||
1986 | progress ppgs("Writing part meronyms...", lines.size()); | 1008 | void generator::readWordNetPertainymy() |
1987 | for (auto line : lines) | ||
1988 | { | 1009 | { |
1989 | ppgs.update(); | 1010 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); |
1990 | 1011 | progress ppgs("Writing pertainymy and mannernymy...", lines.size()); | |
1991 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | 1012 | for (auto line : lines) |
1992 | std::smatch relation_data; | ||
1993 | if (!std::regex_search(line, relation_data, relation)) | ||
1994 | { | 1013 | { |
1995 | continue; | 1014 | ppgs.update(); |
1996 | } | ||
1997 | |||
1998 | int synset_id_1 = stoi(relation_data[1]); | ||
1999 | int synset_id_2 = stoi(relation_data[2]); | ||
2000 | std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
2001 | 1015 | ||
2002 | for (auto mapping1 : wn[synset_id_1]) | 1016 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); |
2003 | { | 1017 | std::smatch relation_data; |
2004 | for (auto mapping2 : wn[synset_id_2]) | 1018 | if (!std::regex_search(line, relation_data, relation)) |
2005 | { | 1019 | { |
2006 | sqlite3_stmt* ppstmt; | 1020 | continue; |
2007 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 1021 | } |
2008 | { | 1022 | |
2009 | db_error(ppdb, query); | 1023 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); |
2010 | } | 1024 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); |
1025 | |||
1026 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | ||
1027 | { | ||
1028 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
1029 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
2011 | 1030 | ||
2012 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 1031 | if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) |
2013 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 1032 | { |
1033 | std::list<field> fields; | ||
1034 | fields.emplace_back("pertainym_id", word1.getId()); | ||
1035 | fields.emplace_back("noun_id", word2.getId()); | ||
2014 | 1036 | ||
2015 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1037 | db_.insertIntoTable("pertainymy", std::move(fields)); |
1038 | } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) | ||
2016 | { | 1039 | { |
2017 | db_error(ppdb, query); | 1040 | std::list<field> fields; |
2018 | } | 1041 | fields.emplace_back("mannernym_id", word1.getId()); |
1042 | fields.emplace_back("adjective_id", word2.getId()); | ||
2019 | 1043 | ||
2020 | sqlite3_finalize(ppstmt); | 1044 | db_.insertIntoTable("mannernymy", std::move(fields)); |
1045 | } | ||
2021 | } | 1046 | } |
2022 | } | 1047 | } |
2023 | } | 1048 | } |
2024 | } | ||
2025 | |||
2026 | // per table | ||
2027 | { | ||
2028 | std::ifstream wnperfile(wnpref + "wn_per.pl"); | ||
2029 | if (!wnperfile.is_open()) | ||
2030 | { | ||
2031 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2032 | print_usage(); | ||
2033 | } | ||
2034 | |||
2035 | std::list<std::string> lines; | ||
2036 | for (;;) | ||
2037 | { | ||
2038 | std::string line; | ||
2039 | if (!getline(wnperfile, line)) | ||
2040 | { | ||
2041 | break; | ||
2042 | } | ||
2043 | 1049 | ||
2044 | if (line.back() == '\r') | 1050 | void generator::readWordNetSpecification() |
1051 | { | ||
1052 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl")); | ||
1053 | progress ppgs("Writing specifications...", lines.size()); | ||
1054 | for (auto line : lines) | ||
2045 | { | 1055 | { |
2046 | line.pop_back(); | 1056 | ppgs.update(); |
1057 | |||
1058 | std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\."); | ||
1059 | std::smatch relation_data; | ||
1060 | if (!std::regex_search(line, relation_data, relation)) | ||
1061 | { | ||
1062 | continue; | ||
1063 | } | ||
1064 | |||
1065 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | ||
1066 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | ||
1067 | |||
1068 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | ||
1069 | { | ||
1070 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
1071 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
1072 | |||
1073 | std::list<field> fields; | ||
1074 | fields.emplace_back("general_id", word1.getId()); | ||
1075 | fields.emplace_back("specific_id", word2.getId()); | ||
1076 | |||
1077 | db_.insertIntoTable("specification", std::move(fields)); | ||
1078 | } | ||
2047 | } | 1079 | } |
2048 | |||
2049 | lines.push_back(line); | ||
2050 | } | 1080 | } |
2051 | 1081 | ||
2052 | progress ppgs("Writing pertainyms and mannernyms...", lines.size()); | 1082 | void generator::readWordNetSimilarity() |
2053 | for (auto line : lines) | ||
2054 | { | 1083 | { |
2055 | ppgs.update(); | 1084 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl")); |
2056 | 1085 | progress ppgs("Writing adjective similarity...", lines.size()); | |
2057 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); | 1086 | for (auto line : lines) |
2058 | std::smatch relation_data; | ||
2059 | if (!std::regex_search(line, relation_data, relation)) | ||
2060 | { | 1087 | { |
2061 | continue; | 1088 | ppgs.update(); |
2062 | } | ||
2063 | 1089 | ||
2064 | int synset_id_1 = stoi(relation_data[1]); | 1090 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); |
2065 | int wnum_1 = stoi(relation_data[2]); | 1091 | std::smatch relation_data; |
2066 | int synset_id_2 = stoi(relation_data[3]); | 1092 | if (!std::regex_search(line, relation_data, relation)) |
2067 | int wnum_2 = stoi(relation_data[4]); | ||
2068 | std::string query; | ||
2069 | switch (synset_id_1 / 100000000) | ||
2070 | { | ||
2071 | case 3: // Adjective | ||
2072 | { | 1093 | { |
2073 | // This is a pertainym, the second word should be a noun | 1094 | continue; |
2074 | // Technically it can be an adjective but we're ignoring that | ||
2075 | if (synset_id_2 / 100000000 != 1) | ||
2076 | { | ||
2077 | continue; | ||
2078 | } | ||
2079 | |||
2080 | query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; | ||
2081 | |||
2082 | break; | ||
2083 | } | 1095 | } |
1096 | |||
1097 | int lookup1 = std::stoi(relation_data[1]); | ||
1098 | int lookup2 = std::stoi(relation_data[2]); | ||
2084 | 1099 | ||
2085 | case 4: // Adverb | 1100 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) |
2086 | { | 1101 | { |
2087 | // This is a mannernym, the second word should be an adjective | 1102 | notion& notion1 = *notionByWnid_.at(lookup1); |
2088 | if (synset_id_2 / 100000000 != 3) | 1103 | notion& notion2 = *notionByWnid_.at(lookup2); |
2089 | { | ||
2090 | continue; | ||
2091 | } | ||
2092 | 1104 | ||
2093 | query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; | 1105 | std::list<field> fields; |
1106 | fields.emplace_back("adjective_1_id", notion1.getId()); | ||
1107 | fields.emplace_back("adjective_2_id", notion2.getId()); | ||
2094 | 1108 | ||
2095 | break; | 1109 | db_.insertIntoTable("similarity", std::move(fields)); |
2096 | } | 1110 | } |
2097 | } | 1111 | } |
2098 | 1112 | } | |
2099 | sqlite3_stmt* ppstmt; | ||
2100 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
2101 | { | ||
2102 | db_error(ppdb, query); | ||
2103 | } | ||
2104 | |||
2105 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
2106 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
2107 | 1113 | ||
2108 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1114 | std::list<std::string> generator::readFile(std::string path) |
1115 | { | ||
1116 | std::ifstream file(path); | ||
1117 | if (!file) | ||
2109 | { | 1118 | { |
2110 | db_error(ppdb, query); | 1119 | throw std::invalid_argument("Could not find file " + path); |
2111 | } | 1120 | } |
2112 | |||
2113 | sqlite3_finalize(ppstmt); | ||
2114 | } | ||
2115 | } | ||
2116 | 1121 | ||
2117 | // sa table | 1122 | std::list<std::string> lines; |
2118 | { | ||
2119 | std::ifstream wnsafile(wnpref + "wn_sa.pl"); | ||
2120 | if (!wnsafile.is_open()) | ||
2121 | { | ||
2122 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2123 | print_usage(); | ||
2124 | } | ||
2125 | |||
2126 | std::list<std::string> lines; | ||
2127 | for (;;) | ||
2128 | { | ||
2129 | std::string line; | 1123 | std::string line; |
2130 | if (!getline(wnsafile, line)) | 1124 | while (std::getline(file, line)) |
2131 | { | ||
2132 | break; | ||
2133 | } | ||
2134 | |||
2135 | if (line.back() == '\r') | ||
2136 | { | 1125 | { |
2137 | line.pop_back(); | 1126 | if (line.back() == '\r') |
1127 | { | ||
1128 | line.pop_back(); | ||
1129 | } | ||
1130 | |||
1131 | lines.push_back(line); | ||
2138 | } | 1132 | } |
2139 | 1133 | ||
2140 | lines.push_back(line); | 1134 | return lines; |
2141 | } | 1135 | } |
2142 | 1136 | ||
2143 | progress ppgs("Writing specifications...", lines.size()); | 1137 | part_of_speech generator::partOfSpeechByWnid(int wnid) |
2144 | for (auto line : lines) | ||
2145 | { | 1138 | { |
2146 | ppgs.update(); | 1139 | switch (wnid / 100000000) |
2147 | |||
2148 | std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); | ||
2149 | std::smatch relation_data; | ||
2150 | if (!std::regex_search(line, relation_data, relation)) | ||
2151 | { | ||
2152 | continue; | ||
2153 | } | ||
2154 | |||
2155 | int synset_id_1 = stoi(relation_data[1]); | ||
2156 | int wnum_1 = stoi(relation_data[2]); | ||
2157 | int synset_id_2 = stoi(relation_data[3]); | ||
2158 | int wnum_2 = stoi(relation_data[4]); | ||
2159 | std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); | ||
2160 | |||
2161 | sqlite3_stmt* ppstmt; | ||
2162 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
2163 | { | 1140 | { |
2164 | db_error(ppdb, query); | 1141 | case 1: return part_of_speech::noun; |
1142 | case 2: return part_of_speech::verb; | ||
1143 | case 3: return part_of_speech::adjective; | ||
1144 | case 4: return part_of_speech::adverb; | ||
1145 | default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); | ||
2165 | } | 1146 | } |
1147 | } | ||
2166 | 1148 | ||
2167 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | 1149 | notion& generator::createNotion(part_of_speech partOfSpeech) |
2168 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | 1150 | { |
1151 | notions_.emplace_back(partOfSpeech); | ||
1152 | |||
1153 | return notions_.back(); | ||
1154 | } | ||
2169 | 1155 | ||
2170 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1156 | notion& generator::lookupOrCreateNotion(int wnid) |
1157 | { | ||
1158 | if (!notionByWnid_.count(wnid)) | ||
2171 | { | 1159 | { |
2172 | db_error(ppdb, query); | 1160 | notions_.emplace_back(partOfSpeechByWnid(wnid), wnid); |
1161 | notionByWnid_[wnid] = ¬ions_.back(); | ||
2173 | } | 1162 | } |
2174 | 1163 | ||
2175 | sqlite3_finalize(ppstmt); | 1164 | return *notionByWnid_.at(wnid); |
2176 | } | ||
2177 | } | ||
2178 | |||
2179 | // sim table | ||
2180 | { | ||
2181 | std::ifstream wnsimfile(wnpref + "wn_sim.pl"); | ||
2182 | if (!wnsimfile.is_open()) | ||
2183 | { | ||
2184 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2185 | print_usage(); | ||
2186 | } | 1165 | } |
2187 | 1166 | ||
2188 | std::list<std::string> lines; | 1167 | lemma& generator::lookupOrCreateLemma(std::string base_form) |
2189 | for (;;) | ||
2190 | { | 1168 | { |
2191 | std::string line; | 1169 | if (!lemmaByBaseForm_.count(base_form)) |
2192 | if (!getline(wnsimfile, line)) | ||
2193 | { | 1170 | { |
2194 | break; | 1171 | lemmas_.emplace_back(lookupOrCreateForm(base_form)); |
1172 | lemmaByBaseForm_[base_form] = &lemmas_.back(); | ||
2195 | } | 1173 | } |
1174 | |||
1175 | return *lemmaByBaseForm_.at(base_form); | ||
1176 | } | ||
2196 | 1177 | ||
2197 | if (line.back() == '\r') | 1178 | form& generator::lookupOrCreateForm(std::string text) |
1179 | { | ||
1180 | if (!formByText_.count(text)) | ||
2198 | { | 1181 | { |
2199 | line.pop_back(); | 1182 | forms_.emplace_back(text); |
1183 | formByText_[text] = &forms_.back(); | ||
2200 | } | 1184 | } |
2201 | 1185 | ||
2202 | lines.push_back(line); | 1186 | return *formByText_[text]; |
2203 | } | 1187 | } |
2204 | 1188 | ||
2205 | progress ppgs("Writing sense synonyms...", lines.size()); | 1189 | template <typename... Args> word& generator::createWord(Args&&... args) |
2206 | for (auto line : lines) | ||
2207 | { | 1190 | { |
2208 | ppgs.update(); | 1191 | words_.emplace_back(std::forward<Args>(args)...); |
1192 | word& w = words_.back(); | ||
2209 | 1193 | ||
2210 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); | 1194 | wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w); |
2211 | std::smatch relation_data; | 1195 | |
2212 | if (!std::regex_search(line, relation_data, relation)) | 1196 | if (w.getNotion().hasWnid()) |
2213 | { | 1197 | { |
2214 | continue; | 1198 | wordsByWnid_[w.getNotion().getWnid()].insert(&w); |
2215 | } | 1199 | } |
2216 | 1200 | ||
2217 | int synset_id_1 = stoi(relation_data[1]); | 1201 | return w; |
2218 | int synset_id_2 = stoi(relation_data[2]); | 1202 | } |
2219 | std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); | 1203 | |
1204 | group& generator::createGroup(xmlNodePtr top) | ||
1205 | { | ||
1206 | groups_.emplace_back(); | ||
1207 | group& grp = groups_.back(); | ||
2220 | 1208 | ||
2221 | for (auto mapping1 : wn[synset_id_1]) | 1209 | xmlChar* key; |
1210 | |||
1211 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
2222 | { | 1212 | { |
2223 | for (auto mapping2 : wn[synset_id_2]) | 1213 | if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES"))) |
2224 | { | 1214 | { |
2225 | sqlite3_stmt* ppstmt; | 1215 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) |
2226 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
2227 | { | 1216 | { |
2228 | db_error(ppdb, query); | 1217 | if (!xmlStrcmp(subclass->name, reinterpret_cast<const xmlChar*>("VNSUBCLASS"))) |
1218 | { | ||
1219 | try | ||
1220 | { | ||
1221 | group& subgrp = createGroup(subclass); | ||
1222 | subgrp.setParent(grp); | ||
1223 | } catch (const std::exception& e) | ||
1224 | { | ||
1225 | key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID")); | ||
1226 | |||
1227 | if (key == nullptr) | ||
1228 | { | ||
1229 | std::throw_with_nested(std::logic_error("Error parsing IDless subgroup")); | ||
1230 | } else { | ||
1231 | std::string subgroupId(reinterpret_cast<const char*>(key)); | ||
1232 | xmlFree(key); | ||
1233 | |||
1234 | std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId)); | ||
1235 | } | ||
1236 | } | ||
1237 | } | ||
2229 | } | 1238 | } |
2230 | 1239 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("MEMBERS"))) | |
2231 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 1240 | { |
2232 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 1241 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) |
2233 | |||
2234 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
2235 | { | 1242 | { |
2236 | db_error(ppdb, query); | 1243 | if (!xmlStrcmp(member->name, reinterpret_cast<const xmlChar*>("MEMBER"))) |
1244 | { | ||
1245 | key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn")); | ||
1246 | std::string wnSenses(reinterpret_cast<const char*>(key)); | ||
1247 | xmlFree(key); | ||
1248 | |||
1249 | auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " "); | ||
1250 | if (!wnSenseKeys.empty()) | ||
1251 | { | ||
1252 | std::list<std::string> tempKeys; | ||
1253 | |||
1254 | std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { | ||
1255 | return sense + "::"; | ||
1256 | }); | ||
1257 | |||
1258 | std::list<std::string> filteredKeys; | ||
1259 | |||
1260 | std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { | ||
1261 | return !wnSenseKeys_.count(sense); | ||
1262 | }); | ||
1263 | |||
1264 | wnSenseKeys = std::move(filteredKeys); | ||
1265 | } | ||
1266 | |||
1267 | if (!wnSenseKeys.empty()) | ||
1268 | { | ||
1269 | for (std::string sense : wnSenseKeys) | ||
1270 | { | ||
1271 | word& wordSense = *wnSenseKeys_[sense]; | ||
1272 | wordSense.setVerbGroup(grp); | ||
1273 | } | ||
1274 | } else { | ||
1275 | key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name")); | ||
1276 | std::string memberName(reinterpret_cast<const char*>(key)); | ||
1277 | xmlFree(key); | ||
1278 | |||
1279 | notion& n = createNotion(part_of_speech::verb); | ||
1280 | lemma& l = lookupOrCreateLemma(memberName); | ||
1281 | word& w = createWord(n, l); | ||
1282 | |||
1283 | w.setVerbGroup(grp); | ||
1284 | } | ||
1285 | } | ||
2237 | } | 1286 | } |
2238 | 1287 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("THEMROLES"))) | |
2239 | sqlite3_reset(ppstmt); | 1288 | { |
2240 | sqlite3_clear_bindings(ppstmt); | 1289 | for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next) |
2241 | |||
2242 | sqlite3_bind_int(ppstmt, 1, mapping2.second); | ||
2243 | sqlite3_bind_int(ppstmt, 2, mapping1.second); | ||
2244 | |||
2245 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
2246 | { | 1290 | { |
2247 | db_error(ppdb, query); | 1291 | if (!xmlStrcmp(roletopnode->name, reinterpret_cast<const xmlChar*>("THEMROLE"))) |
1292 | { | ||
1293 | role r; | ||
1294 | |||
1295 | key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type")); | ||
1296 | std::string roleName = reinterpret_cast<const char*>(key); | ||
1297 | xmlFree(key); | ||
1298 | |||
1299 | for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
1300 | { | ||
1301 | if (!xmlStrcmp(rolenode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1302 | { | ||
1303 | r.setSelrestrs(parseSelrestr(rolenode)); | ||
1304 | } | ||
1305 | } | ||
1306 | |||
1307 | grp.addRole(roleName, std::move(r)); | ||
1308 | } | ||
2248 | } | 1309 | } |
1310 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("FRAMES"))) | ||
1311 | { | ||
1312 | for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next) | ||
1313 | { | ||
1314 | if (!xmlStrcmp(frametopnode->name, reinterpret_cast<const xmlChar*>("FRAME"))) | ||
1315 | { | ||
1316 | frames_.emplace_back(); | ||
1317 | frame& fr = frames_.back(); | ||
2249 | 1318 | ||
2250 | sqlite3_finalize(ppstmt); | 1319 | for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) |
1320 | { | ||
1321 | if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX"))) | ||
1322 | { | ||
1323 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
1324 | { | ||
1325 | if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP"))) | ||
1326 | { | ||
1327 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1328 | std::string partRole = reinterpret_cast<const char*>(key); | ||
1329 | xmlFree(key); | ||
1330 | |||
1331 | selrestr partSelrestrs; | ||
1332 | std::set<std::string> partSynrestrs; | ||
1333 | |||
1334 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
1335 | { | ||
1336 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SYNRESTRS"))) | ||
1337 | { | ||
1338 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
1339 | { | ||
1340 | if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SYNRESTR"))) | ||
1341 | { | ||
1342 | key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type")); | ||
1343 | partSynrestrs.insert(reinterpret_cast<const char*>(key)); | ||
1344 | xmlFree(key); | ||
1345 | } | ||
1346 | } | ||
1347 | } | ||
1348 | |||
1349 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1350 | { | ||
1351 | partSelrestrs = parseSelrestr(npnode); | ||
1352 | } | ||
1353 | } | ||
1354 | |||
1355 | fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs))); | ||
1356 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB"))) | ||
1357 | { | ||
1358 | fr.push_back(part::createVerb()); | ||
1359 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("PREP"))) | ||
1360 | { | ||
1361 | std::set<std::string> partChoices; | ||
1362 | bool partLiteral; | ||
1363 | |||
1364 | if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"))) | ||
1365 | { | ||
1366 | partLiteral = true; | ||
1367 | |||
1368 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1369 | std::string choicesStr = reinterpret_cast<const char*>(key); | ||
1370 | xmlFree(key); | ||
1371 | |||
1372 | split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices))); | ||
1373 | } else { | ||
1374 | partLiteral = false; | ||
1375 | |||
1376 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
1377 | { | ||
1378 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1379 | { | ||
1380 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
1381 | { | ||
1382 | if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
1383 | { | ||
1384 | key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type")); | ||
1385 | partChoices.insert(reinterpret_cast<const char*>(key)); | ||
1386 | xmlFree(key); | ||
1387 | } | ||
1388 | } | ||
1389 | } | ||
1390 | } | ||
1391 | } | ||
1392 | |||
1393 | fr.push_back(part::createPreposition(std::move(partChoices), partLiteral)); | ||
1394 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ"))) | ||
1395 | { | ||
1396 | fr.push_back(part::createAdjective()); | ||
1397 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADV"))) | ||
1398 | { | ||
1399 | fr.push_back(part::createAdverb()); | ||
1400 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("LEX"))) | ||
1401 | { | ||
1402 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1403 | std::string literalValue = reinterpret_cast<const char*>(key); | ||
1404 | xmlFree(key); | ||
1405 | |||
1406 | fr.push_back(part::createLiteral(literalValue)); | ||
1407 | } else { | ||
1408 | continue; | ||
1409 | } | ||
1410 | } | ||
1411 | |||
1412 | grp.addFrame(fr); | ||
1413 | } | ||
1414 | } | ||
1415 | } | ||
1416 | } | ||
2251 | } | 1417 | } |
2252 | } | 1418 | } |
2253 | } | ||
2254 | } | ||
2255 | |||
2256 | // syntax table | ||
2257 | { | ||
2258 | std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); | ||
2259 | if (!wnsyntaxfile.is_open()) | ||
2260 | { | ||
2261 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2262 | print_usage(); | ||
2263 | } | ||
2264 | 1419 | ||
2265 | std::list<std::string> lines; | 1420 | return grp; |
2266 | for (;;) | ||
2267 | { | ||
2268 | std::string line; | ||
2269 | if (!getline(wnsyntaxfile, line)) | ||
2270 | { | ||
2271 | break; | ||
2272 | } | ||
2273 | |||
2274 | if (line.back() == '\r') | ||
2275 | { | ||
2276 | line.pop_back(); | ||
2277 | } | ||
2278 | |||
2279 | lines.push_back(line); | ||
2280 | } | 1421 | } |
2281 | 1422 | ||
2282 | progress ppgs("Writing adjective syntax markers...", lines.size()); | 1423 | selrestr generator::parseSelrestr(xmlNodePtr top) |
2283 | for (auto line : lines) | ||
2284 | { | 1424 | { |
2285 | ppgs.update(); | 1425 | xmlChar* key; |
2286 | 1426 | ||
2287 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); | 1427 | if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) |
2288 | std::smatch relation_data; | ||
2289 | if (!std::regex_search(line, relation_data, relation)) | ||
2290 | { | ||
2291 | continue; | ||
2292 | } | ||
2293 | |||
2294 | int synset_id = stoi(relation_data[1]); | ||
2295 | int wnum = stoi(relation_data[2]); | ||
2296 | std::string syn = relation_data[3]; | ||
2297 | std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); | ||
2298 | |||
2299 | sqlite3_stmt* ppstmt; | ||
2300 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
2301 | { | 1428 | { |
2302 | db_error(ppdb, query); | 1429 | if (xmlChildElementCount(top) == 0) |
2303 | } | 1430 | { |
2304 | 1431 | return {}; | |
2305 | sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); | 1432 | } else if (xmlChildElementCount(top) == 1) |
2306 | sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); | 1433 | { |
2307 | 1434 | return parseSelrestr(xmlFirstElementChild(top)); | |
2308 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1435 | } else { |
1436 | bool orlogic = false; | ||
1437 | if (xmlHasProp(top, reinterpret_cast<const xmlChar*>("logic"))) | ||
1438 | { | ||
1439 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("logic")); | ||
1440 | if (!xmlStrcmp(key, reinterpret_cast<const xmlChar*>("or"))) | ||
1441 | { | ||
1442 | orlogic = true; | ||
1443 | } | ||
1444 | |||
1445 | xmlFree(key); | ||
1446 | } | ||
1447 | |||
1448 | std::list<selrestr> children; | ||
1449 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | ||
1450 | { | ||
1451 | if (!xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTRS")) | ||
1452 | || !xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
1453 | { | ||
1454 | children.push_back(parseSelrestr(selrestr)); | ||
1455 | } | ||
1456 | } | ||
1457 | |||
1458 | return selrestr(children, orlogic); | ||
1459 | } | ||
1460 | } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
2309 | { | 1461 | { |
2310 | db_error(ppdb, query); | 1462 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("Value")); |
1463 | bool selPos = (std::string(reinterpret_cast<const char*>(key)) == "+"); | ||
1464 | xmlFree(key); | ||
1465 | |||
1466 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type")); | ||
1467 | std::string selRestriction = reinterpret_cast<const char*>(key); | ||
1468 | xmlFree(key); | ||
1469 | |||
1470 | return selrestr(selRestriction, selPos); | ||
1471 | } else { | ||
1472 | throw std::logic_error("Badly formatted selrestr"); | ||
2311 | } | 1473 | } |
2312 | |||
2313 | sqlite3_finalize(ppstmt); | ||
2314 | } | 1474 | } |
2315 | } | 1475 | |
2316 | 1476 | }; | |
2317 | sqlite3_close_v2(ppdb); | 1477 | }; |
2318 | |||
2319 | std::cout << "Done." << std::endl; | ||
2320 | } | ||
diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..e2a7404 --- /dev/null +++ b/generator/generator.h | |||
@@ -0,0 +1,151 @@ | |||
1 | #ifndef GENERATOR_H_5B61CBC5 | ||
2 | #define GENERATOR_H_5B61CBC5 | ||
3 | |||
4 | #include <string> | ||
5 | #include <map> | ||
6 | #include <list> | ||
7 | #include <set> | ||
8 | #include <libxml/parser.h> | ||
9 | #include "database.h" | ||
10 | #include "notion.h" | ||
11 | #include "word.h" | ||
12 | #include "lemma.h" | ||
13 | #include "form.h" | ||
14 | #include "pronunciation.h" | ||
15 | #include "group.h" | ||
16 | #include "frame.h" | ||
17 | |||
18 | namespace verbly { | ||
19 | namespace generator { | ||
20 | |||
21 | enum class part_of_speech; | ||
22 | class selrestr; | ||
23 | |||
24 | class generator { | ||
25 | public: | ||
26 | |||
27 | // Constructor | ||
28 | |||
29 | generator( | ||
30 | std::string verbNetPath, | ||
31 | std::string agidPath, | ||
32 | std::string wordNetPath, | ||
33 | std::string cmudictPath, | ||
34 | std::string imageNetPath, | ||
35 | std::string outputPath); | ||
36 | |||
37 | // Action | ||
38 | |||
39 | void run(); | ||
40 | |||
41 | private: | ||
42 | |||
43 | // Subroutines | ||
44 | |||
45 | void readWordNetSynsets(); | ||
46 | |||
47 | void readAdjectivePositioning(); | ||
48 | |||
49 | void readImageNetUrls(); | ||
50 | |||
51 | void readWordNetSenseKeys(); | ||
52 | |||
53 | void readVerbNet(); | ||
54 | |||
55 | void readAgidInflections(); | ||
56 | |||
57 | void readPrepositions(); | ||
58 | |||
59 | void readCmudictPronunciations(); | ||
60 | |||
61 | void writeSchema(); | ||
62 | |||
63 | void dumpObjects(); | ||
64 | |||
65 | void readWordNetAntonymy(); | ||
66 | |||
67 | void readWordNetVariation(); | ||
68 | |||
69 | void readWordNetClasses(); | ||
70 | |||
71 | void readWordNetCausality(); | ||
72 | |||
73 | void readWordNetEntailment(); | ||
74 | |||
75 | void readWordNetHypernymy(); | ||
76 | |||
77 | void readWordNetInstantiation(); | ||
78 | |||
79 | void readWordNetMemberMeronymy(); | ||
80 | |||
81 | void readWordNetPartMeronymy(); | ||
82 | |||
83 | void readWordNetSubstanceMeronymy(); | ||
84 | |||
85 | void readWordNetPertainymy(); | ||
86 | |||
87 | void readWordNetSpecification(); | ||
88 | |||
89 | void readWordNetSimilarity(); | ||
90 | |||
91 | // Helpers | ||
92 | |||
93 | std::list<std::string> readFile(std::string path); | ||
94 | |||
95 | inline part_of_speech partOfSpeechByWnid(int wnid); | ||
96 | |||
97 | notion& createNotion(part_of_speech partOfSpeech); | ||
98 | |||
99 | notion& lookupOrCreateNotion(int wnid); | ||
100 | |||
101 | lemma& lookupOrCreateLemma(std::string base_form); | ||
102 | |||
103 | form& lookupOrCreateForm(std::string text); | ||
104 | |||
105 | template <typename... Args> word& createWord(Args&&... args); | ||
106 | |||
107 | group& createGroup(xmlNodePtr top); | ||
108 | |||
109 | selrestr parseSelrestr(xmlNodePtr top); | ||
110 | |||
111 | // Input | ||
112 | |||
113 | std::string verbNetPath_; | ||
114 | std::string agidPath_; | ||
115 | std::string wordNetPath_; | ||
116 | std::string cmudictPath_; | ||
117 | std::string imageNetPath_; | ||
118 | |||
119 | // Output | ||
120 | |||
121 | database db_; | ||
122 | |||
123 | // Data | ||
124 | |||
125 | std::list<notion> notions_; | ||
126 | std::list<word> words_; | ||
127 | std::list<lemma> lemmas_; | ||
128 | std::list<form> forms_; | ||
129 | std::list<pronunciation> pronunciations_; | ||
130 | std::list<frame> frames_; | ||
131 | std::list<group> groups_; | ||
132 | |||
133 | // Indexes | ||
134 | |||
135 | std::map<int, notion*> notionByWnid_; | ||
136 | std::map<int, std::set<word*>> wordsByWnid_; | ||
137 | std::map<std::pair<int, int>, word*> wordByWnidAndWnum_; | ||
138 | std::map<std::string, std::set<word*>> wordsByBaseForm_; | ||
139 | std::map<std::string, lemma*> lemmaByBaseForm_; | ||
140 | std::map<std::string, form*> formByText_; | ||
141 | |||
142 | // Caches | ||
143 | |||
144 | std::map<std::string, word*> wnSenseKeys_; | ||
145 | |||
146 | }; | ||
147 | |||
148 | }; | ||
149 | }; | ||
150 | |||
151 | #endif /* end of include guard: GENERATOR_H_5B61CBC5 */ | ||
diff --git a/generator/group.cpp b/generator/group.cpp new file mode 100644 index 0000000..7cbd4c8 --- /dev/null +++ b/generator/group.cpp | |||
@@ -0,0 +1,119 @@ | |||
1 | #include "group.h" | ||
2 | #include <stdexcept> | ||
3 | #include <list> | ||
4 | #include <json.hpp> | ||
5 | #include "database.h" | ||
6 | #include "field.h" | ||
7 | #include "frame.h" | ||
8 | |||
9 | namespace verbly { | ||
10 | namespace generator { | ||
11 | |||
12 | int group::nextId_ = 0; | ||
13 | |||
14 | group::group() : id_(nextId_++) | ||
15 | { | ||
16 | } | ||
17 | |||
18 | void group::setParent(const group& parent) | ||
19 | { | ||
20 | // Adding a group to itself is nonsensical. | ||
21 | assert(&parent != this); | ||
22 | |||
23 | parent_ = &parent; | ||
24 | } | ||
25 | |||
26 | void group::addRole(std::string name, role r) | ||
27 | { | ||
28 | roleNames_.insert(name); | ||
29 | roles_[name] = std::move(r); | ||
30 | } | ||
31 | |||
32 | void group::addFrame(const frame& f) | ||
33 | { | ||
34 | frames_.insert(&f); | ||
35 | } | ||
36 | |||
37 | std::set<std::string> group::getRoles() const | ||
38 | { | ||
39 | std::set<std::string> fullRoles = roleNames_; | ||
40 | |||
41 | if (hasParent()) | ||
42 | { | ||
43 | for (std::string name : getParent().getRoles()) | ||
44 | { | ||
45 | fullRoles.insert(name); | ||
46 | } | ||
47 | } | ||
48 | |||
49 | return fullRoles; | ||
50 | } | ||
51 | |||
52 | const role& group::getRole(std::string name) const | ||
53 | { | ||
54 | if (roles_.count(name)) | ||
55 | { | ||
56 | return roles_.at(name); | ||
57 | } else if (hasParent()) | ||
58 | { | ||
59 | return getParent().getRole(name); | ||
60 | } else { | ||
61 | throw std::invalid_argument("Specified role not found in verb group"); | ||
62 | } | ||
63 | } | ||
64 | |||
65 | std::set<const frame*> group::getFrames() const | ||
66 | { | ||
67 | std::set<const frame*> fullFrames = frames_; | ||
68 | |||
69 | if (hasParent()) | ||
70 | { | ||
71 | for (const frame* f : getParent().getFrames()) | ||
72 | { | ||
73 | fullFrames.insert(f); | ||
74 | } | ||
75 | } | ||
76 | |||
77 | return fullFrames; | ||
78 | } | ||
79 | |||
80 | database& operator<<(database& db, const group& arg) | ||
81 | { | ||
82 | // Serialize the group first | ||
83 | { | ||
84 | std::list<field> fields; | ||
85 | fields.emplace_back("group_id", arg.getId()); | ||
86 | |||
87 | nlohmann::json jsonRoles; | ||
88 | for (std::string name : arg.getRoles()) | ||
89 | { | ||
90 | const role& r = arg.getRole(name); | ||
91 | |||
92 | nlohmann::json jsonRole; | ||
93 | jsonRole["type"] = name; | ||
94 | jsonRole["selrestrs"] = r.getSelrestrs().toJson(); | ||
95 | |||
96 | jsonRoles.emplace_back(std::move(jsonRole)); | ||
97 | } | ||
98 | |||
99 | fields.emplace_back("data", jsonRoles.dump()); | ||
100 | |||
101 | db.insertIntoTable("groups", std::move(fields)); | ||
102 | } | ||
103 | |||
104 | // Then, serialize the group/frame relationship | ||
105 | for (const frame* f : arg.getFrames()) | ||
106 | { | ||
107 | std::list<field> fields; | ||
108 | |||
109 | fields.emplace_back("group_id", arg.getId()); | ||
110 | fields.emplace_back("frame_id", f->getId()); | ||
111 | |||
112 | db.insertIntoTable("groups_frames", std::move(fields)); | ||
113 | } | ||
114 | |||
115 | return db; | ||
116 | } | ||
117 | |||
118 | }; | ||
119 | }; | ||
diff --git a/generator/group.h b/generator/group.h new file mode 100644 index 0000000..efb8c5d --- /dev/null +++ b/generator/group.h | |||
@@ -0,0 +1,80 @@ | |||
1 | #ifndef GROUP_H_EDAFB5DC | ||
2 | #define GROUP_H_EDAFB5DC | ||
3 | |||
4 | #include <map> | ||
5 | #include <set> | ||
6 | #include <string> | ||
7 | #include <cassert> | ||
8 | #include "role.h" | ||
9 | |||
10 | namespace verbly { | ||
11 | namespace generator { | ||
12 | |||
13 | class frame; | ||
14 | class database; | ||
15 | |||
16 | class group { | ||
17 | public: | ||
18 | |||
19 | // Constructor | ||
20 | |||
21 | group(); | ||
22 | |||
23 | // Mutators | ||
24 | |||
25 | void setParent(const group& parent); | ||
26 | |||
27 | void addRole(std::string name, role r); | ||
28 | |||
29 | void addFrame(const frame& f); | ||
30 | |||
31 | // Accessors | ||
32 | |||
33 | int getId() const | ||
34 | { | ||
35 | return id_; | ||
36 | } | ||
37 | |||
38 | bool hasParent() const | ||
39 | { | ||
40 | return (parent_ != nullptr); | ||
41 | } | ||
42 | |||
43 | const group& getParent() const | ||
44 | { | ||
45 | // Calling code should always call hasParent first | ||
46 | assert(parent_ != nullptr); | ||
47 | |||
48 | return *parent_; | ||
49 | } | ||
50 | |||
51 | std::set<std::string> getRoles() const; | ||
52 | |||
53 | const role& getRole(std::string name) const; | ||
54 | |||
55 | std::set<const frame*> getFrames() const; | ||
56 | |||
57 | private: | ||
58 | |||
59 | static int nextId_; | ||
60 | |||
61 | const int id_; | ||
62 | |||
63 | const group* parent_ = nullptr; | ||
64 | std::map<std::string, role> roles_; | ||
65 | std::set<const frame*> frames_; | ||
66 | |||
67 | // Caches | ||
68 | |||
69 | std::set<std::string> roleNames_; | ||
70 | |||
71 | }; | ||
72 | |||
73 | // Serializer | ||
74 | |||
75 | database& operator<<(database& db, const group& arg); | ||
76 | |||
77 | }; | ||
78 | }; | ||
79 | |||
80 | #endif /* end of include guard: GROUP_H_EDAFB5DC */ | ||
diff --git a/generator/lemma.cpp b/generator/lemma.cpp new file mode 100644 index 0000000..e66b153 --- /dev/null +++ b/generator/lemma.cpp | |||
@@ -0,0 +1,65 @@ | |||
1 | #include "lemma.h" | ||
2 | #include <list> | ||
3 | #include <cassert> | ||
4 | #include "field.h" | ||
5 | #include "database.h" | ||
6 | #include "form.h" | ||
7 | |||
8 | namespace verbly { | ||
9 | namespace generator { | ||
10 | |||
11 | int lemma::nextId_ = 0; | ||
12 | |||
13 | lemma::lemma(const form& baseForm) : | ||
14 | id_(nextId_++), | ||
15 | baseForm_(baseForm) | ||
16 | { | ||
17 | inflections_[inflection::base] = {&baseForm}; | ||
18 | } | ||
19 | |||
20 | void lemma::addInflection(inflection type, const form& f) | ||
21 | { | ||
22 | // There can only be one base form. | ||
23 | assert(type != inflection::base); | ||
24 | |||
25 | inflections_[type].insert(&f); | ||
26 | } | ||
27 | |||
28 | std::set<const form*> lemma::getInflections(inflection type) const | ||
29 | { | ||
30 | if (inflections_.count(type)) | ||
31 | { | ||
32 | return inflections_.at(type); | ||
33 | } else { | ||
34 | return {}; | ||
35 | } | ||
36 | } | ||
37 | |||
38 | database& operator<<(database& db, const lemma& arg) | ||
39 | { | ||
40 | for (inflection type : { | ||
41 | inflection::base, | ||
42 | inflection::plural, | ||
43 | inflection::comparative, | ||
44 | inflection::superlative, | ||
45 | inflection::past_tense, | ||
46 | inflection::past_participle, | ||
47 | inflection::ing_form, | ||
48 | inflection::s_form}) | ||
49 | { | ||
50 | for (const form* f : arg.getInflections(type)) | ||
51 | { | ||
52 | std::list<field> fields; | ||
53 | fields.emplace_back("lemma_id", arg.getId()); | ||
54 | fields.emplace_back("form_id", f->getId()); | ||
55 | fields.emplace_back("category", static_cast<int>(type)); | ||
56 | |||
57 | db.insertIntoTable("lemmas_forms", std::move(fields)); | ||
58 | } | ||
59 | } | ||
60 | |||
61 | return db; | ||
62 | } | ||
63 | |||
64 | }; | ||
65 | }; | ||
diff --git a/generator/lemma.h b/generator/lemma.h new file mode 100644 index 0000000..6452e08 --- /dev/null +++ b/generator/lemma.h | |||
@@ -0,0 +1,58 @@ | |||
1 | #ifndef LEMMA_H_D73105A7 | ||
2 | #define LEMMA_H_D73105A7 | ||
3 | |||
4 | #include <string> | ||
5 | #include <map> | ||
6 | #include <set> | ||
7 | #include "enums.h" | ||
8 | |||
9 | namespace verbly { | ||
10 | namespace generator { | ||
11 | |||
12 | class database; | ||
13 | class form; | ||
14 | |||
15 | class lemma { | ||
16 | public: | ||
17 | |||
18 | // Constructors | ||
19 | |||
20 | explicit lemma(const form& baseForm); | ||
21 | |||
22 | // Mutators | ||
23 | |||
24 | void addInflection(inflection type, const form& f); | ||
25 | |||
26 | // Accessors | ||
27 | |||
28 | int getId() const | ||
29 | { | ||
30 | return id_; | ||
31 | } | ||
32 | |||
33 | const form& getBaseForm() const | ||
34 | { | ||
35 | return baseForm_; | ||
36 | } | ||
37 | |||
38 | std::set<const form*> getInflections(inflection type) const; | ||
39 | |||
40 | private: | ||
41 | |||
42 | static int nextId_; | ||
43 | |||
44 | const int id_; | ||
45 | const form& baseForm_; | ||
46 | |||
47 | std::map<inflection, std::set<const form*>> inflections_; | ||
48 | |||
49 | }; | ||
50 | |||
51 | // Serializer | ||
52 | |||
53 | database& operator<<(database& db, const lemma& arg); | ||
54 | |||
55 | }; | ||
56 | }; | ||
57 | |||
58 | #endif /* end of include guard: LEMMA_H_D73105A7 */ | ||
diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..827c963 --- /dev/null +++ b/generator/main.cpp | |||
@@ -0,0 +1,40 @@ | |||
1 | #include <iostream> | ||
2 | #include <exception> | ||
3 | #include "generator.h" | ||
4 | |||
// Prints the command-line usage banner: the invocation synopsis followed by
// one line describing each positional argument, in order.
void printUsage()
{
  static const char* const usageLines[] = {
    "usage: generator verbnet agid wordnet cmudict imagenet output",
    "verbnet :: path to a VerbNet data directory",
    "agid :: path to an AGID infl.txt file",
    "wordnet :: path to a WordNet prolog data directory",
    "cmudict :: path to a CMUDICT pronunciation file",
    "imagenet :: path to an ImageNet urls.txt file",
    "output :: datafile output path"
  };

  for (const char* line : usageLines)
  {
    std::cout << line << std::endl;
  }
}
15 | |||
16 | int main(int argc, char** argv) | ||
17 | { | ||
18 | if (argc == 7) | ||
19 | { | ||
20 | try | ||
21 | { | ||
22 | verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); | ||
23 | |||
24 | try | ||
25 | { | ||
26 | app.run(); | ||
27 | } catch (const std::exception& e) | ||
28 | { | ||
29 | std::cout << e.what() << std::endl; | ||
30 | } | ||
31 | } catch (const std::exception& e) | ||
32 | { | ||
33 | std::cout << e.what() << std::endl; | ||
34 | printUsage(); | ||
35 | } | ||
36 | } else { | ||
37 | std::cout << "verbly datafile generator" << std::endl; | ||
38 | printUsage(); | ||
39 | } | ||
40 | } | ||
diff --git a/generator/notion.cpp b/generator/notion.cpp new file mode 100644 index 0000000..290d982 --- /dev/null +++ b/generator/notion.cpp | |||
@@ -0,0 +1,85 @@ | |||
#include "notion.h"

#include <list>
#include <string>
#include <utility>

#include "database.h"
#include "field.h"
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
    // Source of unique notion ids; incremented once per constructed notion.
    int notion::nextId_ = 0;

    // Creates a notion with no WordNet mapping, identified only by its part
    // of speech (hasWnid_ stays false).
    notion::notion(
      part_of_speech partOfSpeech) :
        id_(nextId_++),
        partOfSpeech_(partOfSpeech)
    {
    }

    // Creates a notion linked to the WordNet synset identified by wnid;
    // hasWnid_ is set so the wnid-guarded accessors become valid.
    notion::notion(
      part_of_speech partOfSpeech,
      int wnid) :
        id_(nextId_++),
        partOfSpeech_(partOfSpeech),
        wnid_(wnid),
        hasWnid_(true)
    {
    }
28 | |||
29 | void notion::incrementNumOfImages() | ||
30 | { | ||
31 | // Calling code should always call hasWnid and check that the notion is a noun first. | ||
32 | assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun)); | ||
33 | |||
34 | numOfImages_++; | ||
35 | } | ||
36 | |||
37 | void notion::setPrepositionGroups(std::list<std::string> groups) | ||
38 | { | ||
39 | // Calling code should always check that the notion is a preposition first. | ||
40 | assert(partOfSpeech_ == part_of_speech::preposition); | ||
41 | |||
42 | prepositionGroups_ = groups; | ||
43 | } | ||
44 | |||
45 | database& operator<<(database& db, const notion& arg) | ||
46 | { | ||
47 | // First, serialize the notion | ||
48 | { | ||
49 | std::list<field> fields; | ||
50 | |||
51 | fields.emplace_back("notion_id", arg.getId()); | ||
52 | fields.emplace_back("part_of_speech", static_cast<int>(arg.getPartOfSpeech())); | ||
53 | |||
54 | if (arg.hasWnid()) | ||
55 | { | ||
56 | fields.emplace_back("wnid", arg.getWnid()); | ||
57 | |||
58 | if (arg.getPartOfSpeech() == part_of_speech::noun) | ||
59 | { | ||
60 | fields.emplace_back("images", arg.getNumOfImages()); | ||
61 | } | ||
62 | } | ||
63 | |||
64 | db.insertIntoTable("notions", std::move(fields)); | ||
65 | } | ||
66 | |||
67 | // Next, serialize the is_a relationship if this is a preposition | ||
68 | if (arg.getPartOfSpeech() == part_of_speech::preposition) | ||
69 | { | ||
70 | for (std::string group : arg.getPrepositionGroups()) | ||
71 | { | ||
72 | std::list<field> fields; | ||
73 | |||
74 | fields.emplace_back("notion_id", arg.getId()); | ||
75 | fields.emplace_back("groupname", group); | ||
76 | |||
77 | db.insertIntoTable("is_a", std::move(fields)); | ||
78 | } | ||
79 | } | ||
80 | |||
81 | return db; | ||
82 | } | ||
83 | |||
84 | }; | ||
85 | }; | ||
diff --git a/generator/notion.h b/generator/notion.h new file mode 100644 index 0000000..76210de --- /dev/null +++ b/generator/notion.h | |||
@@ -0,0 +1,91 @@ | |||
#ifndef NOTION_H_221DE2BC
#define NOTION_H_221DE2BC

#include <cassert>
#include <list>
#include <string>
#include "enums.h"

namespace verbly {
  namespace generator {

    class database;

    // A notion is a unit of meaning, independent of the words that express
    // it. It may be linked to a WordNet synset via a wnid; noun notions with
    // a wnid additionally track an ImageNet image count, and preposition
    // notions track the groups they belong to.
    class notion {
    public:

      // Constructors

      // Creates a notion with no WordNet mapping.
      explicit notion(part_of_speech partOfSpeech);

      // Creates a notion linked to the WordNet synset identified by wnid.
      notion(part_of_speech partOfSpeech, int wnid);

      // Mutators

      // Increments the image counter; only valid for noun notions with a
      // wnid (asserted in the implementation).
      void incrementNumOfImages();

      // Replaces the preposition group list; only valid for prepositions
      // (asserted in the implementation).
      void setPrepositionGroups(std::list<std::string> groups);

      // Accessors

      // Unique sequential identifier (assigned from nextId_ at construction).
      int getId() const
      {
        return id_;
      }

      part_of_speech getPartOfSpeech() const
      {
        return partOfSpeech_;
      }

      // Whether this notion is linked to a WordNet synset; guards getWnid()
      // and getNumOfImages().
      bool hasWnid() const
      {
        return hasWnid_;
      }

      int getWnid() const
      {
        // Calling code should always call hasWnid first.
        assert(hasWnid_);

        return wnid_;
      }

      int getNumOfImages() const
      {
        // Calling code should always call hasWnid and check that the notion is a noun first.
        assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun));

        return numOfImages_;
      }

      std::list<std::string> getPrepositionGroups() const
      {
        // Calling code should always check that the notion is a preposition first.
        assert(partOfSpeech_ == part_of_speech::preposition);

        return prepositionGroups_;
      }

    private:

      // Source of unique ids; incremented once per constructed notion.
      static int nextId_;

      const int id_;
      const part_of_speech partOfSpeech_;
      const int wnid_ = 0;
      const bool hasWnid_ = false;

      int numOfImages_ = 0;
      std::list<std::string> prepositionGroups_;

    };

    // Serializer
    // Writes the notion (and, for prepositions, its group links) to the
    // database; see notion.cpp.

    database& operator<<(database& db, const notion& arg);

  };
};

#endif /* end of include guard: NOTION_H_221DE2BC */
diff --git a/generator/part.cpp b/generator/part.cpp new file mode 100644 index 0000000..dbd4e11 --- /dev/null +++ b/generator/part.cpp | |||
@@ -0,0 +1,336 @@ | |||
1 | #include "part.h" | ||
2 | #include <stdexcept> | ||
3 | #include "selrestr.h" | ||
4 | |||
5 | namespace verbly { | ||
6 | namespace generator { | ||
7 | |||
    // Builds a noun phrase slot. part is a tagged union: the private
    // part(type) constructor only sets the discriminator, so the non-trivial
    // members of the noun_phrase_ arm must be constructed in place here with
    // placement new.
    part part::createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs)
    {
      part p(type::noun_phrase);

      new(&p.noun_phrase_.role) std::string(std::move(role));
      new(&p.noun_phrase_.selrestrs) selrestr(std::move(selrestrs));
      new(&p.noun_phrase_.synrestrs) std::set<std::string>(std::move(synrestrs));

      return p;
    }

    // Builds a verb slot; this arm carries no data, so setting the
    // discriminator is sufficient.
    part part::createVerb()
    {
      return part(type::verb);
    }

    // Builds a preposition slot from a set of choices; the literal flag is
    // stored alongside them (see isPrepositionLiteral).
    part part::createPreposition(std::set<std::string> choices, bool literal)
    {
      part p(type::preposition);

      new(&p.preposition_.choices) std::set<std::string>(std::move(choices));
      p.preposition_.literal = literal;

      return p;
    }

    // Builds an adjective slot (no associated data).
    part part::createAdjective()
    {
      return part(type::adjective);
    }

    // Builds an adverb slot (no associated data).
    part part::createAdverb()
    {
      return part(type::adverb);
    }

    // Builds a literal-text slot holding the given string.
    part part::createLiteral(std::string value)
    {
      part p(type::literal);

      new(&p.literal_) std::string(std::move(value));

      return p;
    }
52 | |||
    // Copy constructor. Only the union arm selected by other.type_ is live,
    // so only that arm's members are copy-constructed into the raw union
    // storage via placement new; the data-free arms need nothing beyond the
    // discriminator.
    part::part(const part& other)
    {
      type_ = other.type_;

      switch (type_)
      {
        case type::noun_phrase:
        {
          new(&noun_phrase_.role) std::string(other.noun_phrase_.role);
          new(&noun_phrase_.selrestrs) selrestr(other.noun_phrase_.selrestrs);
          new(&noun_phrase_.synrestrs) std::set<std::string>(other.noun_phrase_.synrestrs);

          break;
        }

        case type::preposition:
        {
          new(&preposition_.choices) std::set<std::string>(other.preposition_.choices);
          preposition_.literal = other.preposition_.literal;

          break;
        }

        case type::literal:
        {
          new(&literal_) std::string(other.literal_);

          break;
        }

        // These arms carry no data; copying the discriminator is enough.
        case type::verb:
        case type::adjective:
        case type::adverb:
        case type::invalid:
        {
          break;
        }
      }
    }
92 | |||
    // Move constructor: delegate to the private default constructor (an
    // invalid, data-free part) and swap with the source, leaving other in
    // the invalid state.
    part::part(part&& other) : part()
    {
      swap(*this, other);
    }

    // Unified assignment (copy-and-swap): other is already a copy or a
    // moved-from value, so a single swap implements both copy and move
    // assignment and is safe under self-assignment.
    part& part::operator=(part other)
    {
      swap(*this, other);

      return *this;
    }
104 | |||
105 | void swap(part& first, part& second) | ||
106 | { | ||
107 | using type = part::type; | ||
108 | |||
109 | type tempType = first.type_; | ||
110 | std::string tempRole; | ||
111 | selrestr tempSelrestrs; | ||
112 | std::set<std::string> tempSynrestrs; | ||
113 | std::set<std::string> tempChoices; | ||
114 | bool tempPrepLiteral; | ||
115 | std::string tempLiteralValue; | ||
116 | |||
117 | switch (tempType) | ||
118 | { | ||
119 | case type::noun_phrase: | ||
120 | { | ||
121 | tempRole = std::move(first.noun_phrase_.role); | ||
122 | tempSelrestrs = std::move(first.noun_phrase_.selrestrs); | ||
123 | tempSynrestrs = std::move(first.noun_phrase_.synrestrs); | ||
124 | |||
125 | break; | ||
126 | } | ||
127 | |||
128 | case type::preposition: | ||
129 | { | ||
130 | tempChoices = std::move(first.preposition_.choices); | ||
131 | tempPrepLiteral = first.preposition_.literal; | ||
132 | |||
133 | break; | ||
134 | } | ||
135 | |||
136 | case type::literal: | ||
137 | { | ||
138 | tempLiteralValue = std::move(first.literal_); | ||
139 | |||
140 | break; | ||
141 | } | ||
142 | |||
143 | case type::verb: | ||
144 | case type::adjective: | ||
145 | case type::adverb: | ||
146 | case type::invalid: | ||
147 | { | ||
148 | break; | ||
149 | } | ||
150 | } | ||
151 | |||
152 | first.~part(); | ||
153 | |||
154 | first.type_ = second.type_; | ||
155 | |||
156 | switch (first.type_) | ||
157 | { | ||
158 | case type::noun_phrase: | ||
159 | { | ||
160 | new(&first.noun_phrase_.role) std::string(std::move(second.noun_phrase_.role)); | ||
161 | new(&first.noun_phrase_.selrestrs) selrestr(std::move(second.noun_phrase_.selrestrs)); | ||
162 | new(&first.noun_phrase_.synrestrs) std::set<std::string>(std::move(second.noun_phrase_.synrestrs)); | ||
163 | |||
164 | break; | ||
165 | } | ||
166 | |||
167 | case type::preposition: | ||
168 | { | ||
169 | new(&first.preposition_.choices) std::set<std::string>(std::move(second.preposition_.choices)); | ||
170 | first.preposition_.literal = second.preposition_.literal; | ||
171 | |||
172 | break; | ||
173 | } | ||
174 | |||
175 | case type::literal: | ||
176 | { | ||
177 | new(&first.literal_) std::string(std::move(second.literal_)); | ||
178 | |||
179 | break; | ||
180 | } | ||
181 | |||
182 | case type::verb: | ||
183 | case type::adjective: | ||
184 | case type::adverb: | ||
185 | case type::invalid: | ||
186 | { | ||
187 | break; | ||
188 | } | ||
189 | } | ||
190 | |||
191 | second.~part(); | ||
192 | |||
193 | second.type_ = tempType; | ||
194 | |||
195 | switch (second.type_) | ||
196 | { | ||
197 | case type::noun_phrase: | ||
198 | { | ||
199 | new(&second.noun_phrase_.role) std::string(std::move(tempRole)); | ||
200 | new(&second.noun_phrase_.selrestrs) selrestr(std::move(tempSelrestrs)); | ||
201 | new(&second.noun_phrase_.synrestrs) std::set<std::string>(std::move(tempSynrestrs)); | ||
202 | |||
203 | break; | ||
204 | } | ||
205 | |||
206 | case type::preposition: | ||
207 | { | ||
208 | new(&second.preposition_.choices) std::set<std::string>(std::move(tempChoices)); | ||
209 | second.preposition_.literal = tempPrepLiteral; | ||
210 | |||
211 | break; | ||
212 | } | ||
213 | |||
214 | case type::literal: | ||
215 | { | ||
216 | new(&second.literal_) std::string(std::move(tempLiteralValue)); | ||
217 | |||
218 | break; | ||
219 | } | ||
220 | |||
221 | case type::verb: | ||
222 | case type::adjective: | ||
223 | case type::adverb: | ||
224 | case type::invalid: | ||
225 | { | ||
226 | break; | ||
227 | } | ||
228 | } | ||
229 | } | ||
230 | |||
    // Destructor: explicitly destroys whichever union arm is live. The local
    // type aliases exist because the pseudo-destructor syntax (obj.~T())
    // requires a simple type name, which std::set<std::string> is not.
    part::~part()
    {
      switch (type_)
      {
        case type::noun_phrase:
        {
          using string_type = std::string;
          using set_type = std::set<std::string>;

          noun_phrase_.role.~string_type();
          noun_phrase_.selrestrs.~selrestr();
          noun_phrase_.synrestrs.~set_type();

          break;
        }

        case type::preposition:
        {
          using set_type = std::set<std::string>;

          preposition_.choices.~set_type();

          break;
        }

        case type::literal:
        {
          using string_type = std::string;

          literal_.~string_type();

          break;
        }

        // These arms carry no data; nothing to destroy.
        case type::verb:
        case type::adjective:
        case type::adverb:
        case type::invalid:
        {
          break;
        }
      }
    }
274 | |||
275 | std::string part::getNounRole() const | ||
276 | { | ||
277 | if (type_ == type::noun_phrase) | ||
278 | { | ||
279 | return noun_phrase_.role; | ||
280 | } else { | ||
281 | throw std::domain_error("part::getNounRole is only valid for noun phrase parts"); | ||
282 | } | ||
283 | } | ||
284 | |||
285 | selrestr part::getNounSelrestrs() const | ||
286 | { | ||
287 | if (type_ == type::noun_phrase) | ||
288 | { | ||
289 | return noun_phrase_.selrestrs; | ||
290 | } else { | ||
291 | throw std::domain_error("part::getNounSelrestrs is only valid for noun phrase parts"); | ||
292 | } | ||
293 | } | ||
294 | |||
295 | std::set<std::string> part::getNounSynrestrs() const | ||
296 | { | ||
297 | if (type_ == type::noun_phrase) | ||
298 | { | ||
299 | return noun_phrase_.synrestrs; | ||
300 | } else { | ||
301 | throw std::domain_error("part::getNounSynrestrs is only valid for noun phrase parts"); | ||
302 | } | ||
303 | } | ||
304 | |||
305 | std::set<std::string> part::getPrepositionChoices() const | ||
306 | { | ||
307 | if (type_ == type::preposition) | ||
308 | { | ||
309 | return preposition_.choices; | ||
310 | } else { | ||
311 | throw std::domain_error("part::getPrepositionChoices is only valid for preposition parts"); | ||
312 | } | ||
313 | } | ||
314 | |||
315 | bool part::isPrepositionLiteral() const | ||
316 | { | ||
317 | if (type_ == type::preposition) | ||
318 | { | ||
319 | return preposition_.literal; | ||
320 | } else { | ||
321 | throw std::domain_error("part::isPrepositionLiteral is only valid for preposition parts"); | ||
322 | } | ||
323 | } | ||
324 | |||
325 | std::string part::getLiteralValue() const | ||
326 | { | ||
327 | if (type_ == type::literal) | ||
328 | { | ||
329 | return literal_; | ||
330 | } else { | ||
331 | throw std::domain_error("part::getLiteralValue is only valid for literal parts"); | ||
332 | } | ||
333 | } | ||
334 | |||
335 | }; | ||
336 | }; | ||
diff --git a/generator/part.h b/generator/part.h new file mode 100644 index 0000000..d044630 --- /dev/null +++ b/generator/part.h | |||
@@ -0,0 +1,114 @@ | |||
#ifndef PART_H_FB54F361
#define PART_H_FB54F361

#include <string>
#include <set>
#include "selrestr.h"

namespace verbly {
  namespace generator {

    // One slot of a verb frame: a noun phrase, verb, preposition, adjective,
    // adverb, or literal string. Implemented as a tagged union — type_
    // selects which member of the anonymous union is live, and all
    // construction/destruction of the non-trivial arms is done manually with
    // placement new / pseudo-destructor calls (see part.cpp).
    class part {
    public:
      enum class type {
        invalid = -1,
        noun_phrase = 0,
        verb = 1,
        preposition = 2,
        adjective = 3,
        adverb = 4,
        literal = 5
      };

      // Static factories
      // The only public way to create a part; each constructs the union arm
      // matching the requested type.

      static part createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs);

      static part createVerb();

      static part createPreposition(std::set<std::string> choices, bool literal);

      static part createAdjective();

      static part createAdverb();

      static part createLiteral(std::string value);

      // Copy and move constructors

      part(const part& other);

      part(part&& other);

      // Assignment
      // By-value parameter: copy-and-swap covers both copy and move
      // assignment.

      part& operator=(part other);

      // Swap

      friend void swap(part& first, part& second);

      // Destructor
      // Non-trivial: must destroy whichever union arm is live.

      ~part();

      // General accessors

      type getType() const
      {
        return type_;
      }

      // Noun phrase accessors
      // Throw std::domain_error when the part is not a noun phrase.

      std::string getNounRole() const;

      selrestr getNounSelrestrs() const;

      std::set<std::string> getNounSynrestrs() const;

      // Preposition accessors
      // Throw std::domain_error when the part is not a preposition.

      std::set<std::string> getPrepositionChoices() const;

      bool isPrepositionLiteral() const;

      // Literal accessors
      // Throws std::domain_error when the part is not a literal.

      std::string getLiteralValue() const;

    private:

      // Private constructors

      // Creates an invalid part; used as the swap target in the move
      // constructor.
      part()
      {
      }

      // Sets only the discriminator; the factories placement-construct the
      // corresponding union arm afterwards.
      part(type t) : type_(t)
      {
      }

      // Data

      // Anonymous tagged union; type_ (below) is the discriminator.
      union {
        struct {
          std::string role;
          selrestr selrestrs;
          std::set<std::string> synrestrs;
        } noun_phrase_;
        struct {
          std::set<std::string> choices;
          bool literal;
        } preposition_;
        std::string literal_;
      };

      type type_ = type::invalid;

    };

  };
};

#endif /* end of include guard: PART_H_FB54F361 */
diff --git a/generator/progress.h b/generator/progress.h index 81f07a3..fcb680d 100644 --- a/generator/progress.h +++ b/generator/progress.h | |||
@@ -3,48 +3,54 @@ | |||
3 | 3 | ||
4 | #include <string> | 4 | #include <string> |
5 | 5 | ||
6 | class progress { | 6 | namespace verbly { |
7 | private: | 7 | namespace generator { |
8 | std::string message; | ||
9 | int total; | ||
10 | int cur = 0; | ||
11 | int lprint = 0; | ||
12 | 8 | ||
13 | public: | 9 | class progress { |
14 | progress(std::string message, int total) : message(message), total(total) | 10 | private: |
15 | { | 11 | std::string message; |
16 | std::cout << message << " 0%" << std::flush; | 12 | int total; |
17 | } | 13 | int cur = 0; |
14 | int lprint = 0; | ||
18 | 15 | ||
19 | void update(int val) | 16 | public: |
20 | { | 17 | progress(std::string message, int total) : message(message), total(total) |
21 | if (val <= total) | 18 | { |
22 | { | 19 | std::cout << message << " 0%" << std::flush; |
23 | cur = val; | 20 | } |
24 | } else { | 21 | |
25 | cur = total; | 22 | void update(int val) |
26 | } | 23 | { |
24 | if (val <= total) | ||
25 | { | ||
26 | cur = val; | ||
27 | } else { | ||
28 | cur = total; | ||
29 | } | ||
27 | 30 | ||
28 | int pp = cur * 100 / total; | 31 | int pp = cur * 100 / total; |
29 | if (pp != lprint) | 32 | if (pp != lprint) |
30 | { | 33 | { |
31 | lprint = pp; | 34 | lprint = pp; |
32 | 35 | ||
33 | std::cout << "\b\b\b\b" << std::right; | 36 | std::cout << "\b\b\b\b" << std::right; |
34 | std::cout.width(3); | 37 | std::cout.width(3); |
35 | std::cout << pp << "%" << std::flush; | 38 | std::cout << pp << "%" << std::flush; |
36 | } | 39 | } |
37 | } | 40 | } |
41 | |||
42 | void update() | ||
43 | { | ||
44 | update(cur+1); | ||
45 | } | ||
38 | 46 | ||
39 | void update() | 47 | ~progress() |
40 | { | 48 | { |
41 | update(cur+1); | 49 | std::cout << "\b\b\b\b100%" << std::endl; |
42 | } | 50 | } |
51 | }; | ||
43 | 52 | ||
44 | ~progress() | 53 | }; |
45 | { | ||
46 | std::cout << "\b\b\b\b100%" << std::endl; | ||
47 | } | ||
48 | }; | 54 | }; |
49 | 55 | ||
50 | #endif /* end of include guard: PROGRESS_H_A34EF856 */ | 56 | #endif /* end of include guard: PROGRESS_H_A34EF856 */ |
diff --git a/generator/pronunciation.cpp b/generator/pronunciation.cpp new file mode 100644 index 0000000..eb07607 --- /dev/null +++ b/generator/pronunciation.cpp | |||
@@ -0,0 +1,87 @@ | |||
1 | #include "pronunciation.h" | ||
2 | #include <list> | ||
3 | #include <algorithm> | ||
4 | #include <cctype> | ||
5 | #include <iterator> | ||
6 | #include "database.h" | ||
7 | #include "field.h" | ||
8 | #include "../lib/util.h" | ||
9 | |||
10 | namespace verbly { | ||
11 | namespace generator { | ||
12 | |||
13 | int pronunciation::nextId_ = 0; | ||
14 | |||
15 | pronunciation::pronunciation(std::string phonemes) : | ||
16 | id_(nextId_++), | ||
17 | phonemes_(phonemes) | ||
18 | { | ||
19 | auto phonemeList = split<std::list<std::string>>(phonemes, " "); | ||
20 | |||
21 | auto rhymeStart = std::find_if(std::begin(phonemeList), std::end(phonemeList), [] (std::string phoneme) { | ||
22 | return phoneme.find("1") != std::string::npos; | ||
23 | }); | ||
24 | |||
25 | // Rhyme detection | ||
26 | if (rhymeStart != std::end(phonemeList)) | ||
27 | { | ||
28 | std::list<std::string> rhymePhonemes; | ||
29 | |||
30 | std::transform(rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), [] (std::string phoneme) { | ||
31 | std::string naked; | ||
32 | |||
33 | std::remove_copy_if(std::begin(phoneme), std::end(phoneme), std::back_inserter(naked), [] (char ch) { | ||
34 | return std::isdigit(ch); | ||
35 | }); | ||
36 | |||
37 | return naked; | ||
38 | }); | ||
39 | |||
40 | rhyme_ = implode(std::begin(rhymePhonemes), std::end(rhymePhonemes), " "); | ||
41 | |||
42 | if (rhymeStart != std::begin(phonemeList)) | ||
43 | { | ||
44 | prerhyme_ = *std::prev(rhymeStart); | ||
45 | } | ||
46 | } | ||
47 | |||
48 | // Syllable/stress | ||
49 | for (std::string phoneme : phonemeList) | ||
50 | { | ||
51 | if (std::isdigit(phoneme.back())) | ||
52 | { | ||
53 | // It's a vowel! | ||
54 | syllables_++; | ||
55 | |||
56 | if (phoneme.back() == '1') | ||
57 | { | ||
58 | stress_.push_back('1'); | ||
59 | } else { | ||
60 | stress_.push_back('0'); | ||
61 | } | ||
62 | } | ||
63 | } | ||
64 | } | ||
65 | |||
    // Serializes a pronunciation into the "pronunciations" table. The rhyme
    // columns are only written when a stressed vowel was found (hasRhyme).
    database& operator<<(database& db, const pronunciation& arg)
    {
      std::list<field> fields;

      fields.emplace_back("pronunciation_id", arg.getId());
      fields.emplace_back("phonemes", arg.getPhonemes());
      fields.emplace_back("syllables", arg.getSyllables());
      fields.emplace_back("stress", arg.getStress());

      if (arg.hasRhyme())
      {
        fields.emplace_back("rhyme", arg.getRhymePhonemes());
        fields.emplace_back("prerhyme", arg.getPrerhyme());
      }

      db.insertIntoTable("pronunciations", std::move(fields));

      return db;
    }
85 | |||
86 | }; | ||
87 | }; | ||
diff --git a/generator/pronunciation.h b/generator/pronunciation.h new file mode 100644 index 0000000..81be6c4 --- /dev/null +++ b/generator/pronunciation.h | |||
@@ -0,0 +1,82 @@ | |||
#ifndef PRONUNCIATION_H_584A08DD
#define PRONUNCIATION_H_584A08DD

#include <string>
#include <cassert>

namespace verbly {
  namespace generator {

    class database;

    // A pronunciation parsed from a space-separated CMUDICT phoneme string,
    // with derived rhyme, syllable-count, and stress-pattern data (see
    // pronunciation.cpp for the parsing rules).
    class pronunciation {
    public:

      // Constructor

      explicit pronunciation(std::string phonemes);

      // Accessors

      // Unique sequential identifier (assigned from nextId_ at construction).
      int getId() const
      {
        return id_;
      }

      // The original phoneme string, unmodified.
      std::string getPhonemes() const
      {
        return phonemes_;
      }

      // True when a primary-stressed vowel was found; guards the two rhyme
      // accessors below.
      bool hasRhyme() const
      {
        return !rhyme_.empty();
      }

      // The phonemes from the primary-stressed vowel onward, stress digits
      // stripped.
      std::string getRhymePhonemes() const
      {
        // Calling code should always call hasRhyme first.
        assert(!rhyme_.empty());

        return rhyme_;
      }

      // The single phoneme immediately before the rhyme (empty when the
      // rhyme starts at the first phoneme).
      std::string getPrerhyme() const
      {
        // Calling code should always call hasRhyme first.
        assert(!rhyme_.empty());

        return prerhyme_;
      }

      // Number of vowel phonemes.
      int getSyllables() const
      {
        return syllables_;
      }

      // One character per syllable: '1' for primary stress, '0' otherwise.
      std::string getStress() const
      {
        return stress_;
      }

    private:

      // Source of unique ids; incremented once per constructed pronunciation.
      static int nextId_;

      const int id_;
      const std::string phonemes_;
      std::string rhyme_;
      std::string prerhyme_;
      int syllables_ = 0;
      std::string stress_;

    };

    // Serializer
    // Writes the pronunciation to the "pronunciations" table; see
    // pronunciation.cpp.

    database& operator<<(database& db, const pronunciation& arg);

  };
};

#endif /* end of include guard: PRONUNCIATION_H_584A08DD */
diff --git a/generator/role.h b/generator/role.h new file mode 100644 index 0000000..5fa68b8 --- /dev/null +++ b/generator/role.h | |||
@@ -0,0 +1,35 @@ | |||
1 | #ifndef ROLE_H_249F9A9C | ||
2 | #define ROLE_H_249F9A9C | ||
3 | |||
4 | #include "selrestr.h" | ||
5 | |||
6 | namespace verbly { | ||
7 | namespace generator { | ||
8 | |||
9 | class role { | ||
10 | public: | ||
11 | |||
12 | // Mutators | ||
13 | |||
14 | void setSelrestrs(selrestr selrestrs) | ||
15 | { | ||
16 | selrestrs_ = selrestrs; | ||
17 | } | ||
18 | |||
19 | // Accessors | ||
20 | |||
21 | const selrestr& getSelrestrs() const | ||
22 | { | ||
23 | return selrestrs_; | ||
24 | } | ||
25 | |||
26 | private: | ||
27 | |||
28 | selrestr selrestrs_; | ||
29 | |||
30 | }; | ||
31 | |||
32 | }; | ||
33 | }; | ||
34 | |||
35 | #endif /* end of include guard: ROLE_H_249F9A9C */ | ||
diff --git a/generator/schema.sql b/generator/schema.sql index 410b536..c3e54d8 100644 --- a/generator/schema.sql +++ b/generator/schema.sql | |||
@@ -1,286 +1,204 @@ | |||
1 | DROP TABLE IF EXISTS `verbs`; | 1 | CREATE TABLE `notions` ( |
2 | CREATE TABLE `verbs` ( | 2 | `notion_id` INTEGER PRIMARY KEY, |
3 | `verb_id` INTEGER PRIMARY KEY, | 3 | `part_of_speech` SMALLINT NOT NULL, |
4 | `infinitive` VARCHAR(32) NOT NULL, | 4 | `wnid` INTEGER, |
5 | `past_tense` VARCHAR(32) NOT NULL, | 5 | `images` INTEGER |
6 | `past_participle` VARCHAR(32) NOT NULL, | ||
7 | `ing_form` VARCHAR(32) NOT NULL, | ||
8 | `s_form` VARCHAR(32) NOT NULL | ||
9 | ); | 6 | ); |
10 | 7 | ||
11 | DROP TABLE IF EXISTS `groups`; | 8 | CREATE UNIQUE INDEX `notion_by_wnid` ON `notions`(`wnid`); |
12 | CREATE TABLE `groups` ( | ||
13 | `group_id` INTEGER PRIMARY KEY, | ||
14 | `data` BLOB NOT NULL | ||
15 | ); | ||
16 | |||
17 | DROP TABLE IF EXISTS `frames`; | ||
18 | CREATE TABLE `frames` ( | ||
19 | `frame_id` INTEGER PRIMARY KEY, | ||
20 | `group_id` INTEGER NOT NULL, | ||
21 | `data` BLOB NOT NULL, | ||
22 | FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`) | ||
23 | ); | ||
24 | 9 | ||
25 | DROP TABLE IF EXISTS `verb_groups`; | ||
26 | CREATE TABLE `verb_groups` ( | ||
27 | `verb_id` INTEGER NOT NULL, | ||
28 | `group_id` INTEGER NOT NULL, | ||
29 | FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`), | ||
30 | FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`) | ||
31 | ); | ||
32 | |||
33 | DROP TABLE IF EXISTS `adjectives`; | ||
34 | CREATE TABLE `adjectives` ( | ||
35 | `adjective_id` INTEGER PRIMARY KEY, | ||
36 | `base_form` VARCHAR(32) NOT NULL, | ||
37 | `comparative` VARCHAR(32), | ||
38 | `superlative` VARCHAR(32), | ||
39 | `position` CHAR(1), | ||
40 | `complexity` INTEGER NOT NULL | ||
41 | ); | ||
42 | |||
43 | DROP TABLE IF EXISTS `adverbs`; | ||
44 | CREATE TABLE `adverbs` ( | ||
45 | `adverb_id` INTEGER PRIMARY KEY, | ||
46 | `base_form` VARCHAR(32) NOT NULL, | ||
47 | `comparative` VARCHAR(32), | ||
48 | `superlative` VARCHAR(32), | ||
49 | `complexity` INTEGER NOT NULL | ||
50 | ); | ||
51 | |||
52 | DROP TABLE IF EXISTS `nouns`; | ||
53 | CREATE TABLE `nouns` ( | ||
54 | `noun_id` INTEGER PRIMARY KEY, | ||
55 | `singular` VARCHAR(32) NOT NULL, | ||
56 | `plural` VARCHAR(32), | ||
57 | `proper` INTEGER(1) NOT NULL, | ||
58 | `complexity` INTEGER NOT NULL, | ||
59 | `images` INTEGER NOT NULL, | ||
60 | `wnid` INTEGER NOT NULL | ||
61 | ); | ||
62 | |||
63 | DROP TABLE IF EXISTS `hypernymy`; | ||
64 | CREATE TABLE `hypernymy` ( | 10 | CREATE TABLE `hypernymy` ( |
65 | `hypernym_id` INTEGER NOT NULL, | 11 | `hypernym_id` INTEGER NOT NULL, |
66 | `hyponym_id` INTEGER NOT NULL, | 12 | `hyponym_id` INTEGER NOT NULL |
67 | FOREIGN KEY (`hypernym_id`) REFERENCES `nouns`(`noun_id`), | ||
68 | FOREIGN KEY (`hyponym_id`) REFERENCES `nouns`(`noun_id`) | ||
69 | ); | 13 | ); |
70 | 14 | ||
71 | DROP TABLE IF EXISTS `instantiation`; | 15 | CREATE INDEX `hyponym_of` ON `hypernymy`(`hypernym_id`); |
16 | CREATE INDEX `hypernym_of` ON `hypernymy`(`hyponym_id`); | ||
17 | |||
72 | CREATE TABLE `instantiation` ( | 18 | CREATE TABLE `instantiation` ( |
73 | `class_id` INTEGER NOT NULL, | 19 | `class_id` INTEGER NOT NULL, |
74 | `instance_id` INTEGER NOT NULL, | 20 | `instance_id` INTEGER NOT NULL |
75 | FOREIGN KEY (`class_id`) REFERENCES `nouns`(`noun_id`), | ||
76 | FOREIGN KEY (`instance_id`) REFERENCES `nouns`(`noun_id`) | ||
77 | ); | 21 | ); |
78 | 22 | ||
79 | DROP TABLE IF EXISTS `member_meronymy`; | 23 | CREATE INDEX `instance_of` ON `instantiation`(`class_id`); |
24 | CREATE INDEX `class_of` ON `instantiation`(`instance_id`); | ||
25 | |||
80 | CREATE TABLE `member_meronymy` ( | 26 | CREATE TABLE `member_meronymy` ( |
81 | `meronym_id` INTEGER NOT NULL, | 27 | `meronym_id` INTEGER NOT NULL, |
82 | `holonym_id` INTEGER NOT NULL, | 28 | `holonym_id` INTEGER NOT NULL |
83 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
84 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
85 | ); | 29 | ); |
86 | 30 | ||
87 | DROP TABLE IF EXISTS `part_meronymy`; | 31 | CREATE INDEX `member_holonym_of` ON `member_meronymy`(`meronym_id`); |
32 | CREATE INDEX `member_meronym_of` ON `member_meronymy`(`holonym_id`); | ||
33 | |||
88 | CREATE TABLE `part_meronymy` ( | 34 | CREATE TABLE `part_meronymy` ( |
89 | `meronym_id` INTEGER NOT NULL, | 35 | `meronym_id` INTEGER NOT NULL, |
90 | `holonym_id` INTEGER NOT NULL, | 36 | `holonym_id` INTEGER NOT NULL |
91 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
92 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
93 | ); | 37 | ); |
94 | 38 | ||
95 | DROP TABLE IF EXISTS `substance_meronymy`; | 39 | CREATE INDEX `part_holonym_of` ON `part_meronymy`(`meronym_id`); |
40 | CREATE INDEX `part_meronym_of` ON `part_meronymy`(`holonym_id`); | ||
41 | |||
96 | CREATE TABLE `substance_meronymy` ( | 42 | CREATE TABLE `substance_meronymy` ( |
97 | `meronym_id` INTEGER NOT NULL, | 43 | `meronym_id` INTEGER NOT NULL, |
98 | `holonym_id` INTEGER NOT NULL, | 44 | `holonym_id` INTEGER NOT NULL |
99 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
100 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
101 | ); | 45 | ); |
102 | 46 | ||
103 | DROP TABLE IF EXISTS `variation`; | 47 | CREATE INDEX `substance_holonym_of` ON `substance_meronymy`(`meronym_id`); |
48 | CREATE INDEX `substance_meronym_of` ON `substance_meronymy`(`holonym_id`); | ||
49 | |||
104 | CREATE TABLE `variation` ( | 50 | CREATE TABLE `variation` ( |
105 | `noun_id` INTEGER NOT NULL, | 51 | `noun_id` INTEGER NOT NULL, |
106 | `adjective_id` INTEGER NOT NULL, | 52 | `adjective_id` INTEGER NOT NULL |
107 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
108 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
109 | ); | 53 | ); |
110 | 54 | ||
111 | DROP TABLE IF EXISTS `noun_antonymy`; | 55 | CREATE INDEX `variant_of` ON `variation`(`noun_id`); |
112 | CREATE TABLE `noun_antonymy` ( | 56 | CREATE INDEX `attribute_of` ON `variation`(`adjective_id`); |
113 | `noun_1_id` INTEGER NOT NULL, | ||
114 | `noun_2_id` INTEGER NOT NULL, | ||
115 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), | ||
116 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`) | ||
117 | ); | ||
118 | 57 | ||
119 | DROP TABLE IF EXISTS `adjective_antonymy`; | 58 | CREATE TABLE `similarity` ( |
120 | CREATE TABLE `adjective_antonymy` ( | ||
121 | `adjective_1_id` INTEGER NOT NULL, | 59 | `adjective_1_id` INTEGER NOT NULL, |
122 | `adjective_2_id` INTEGER NOT NULL, | 60 | `adjective_2_id` INTEGER NOT NULL |
123 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | 61 | ); |
124 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | 62 | |
63 | CREATE INDEX `similar_to` ON `similarity`(`adjective_1_id`); | ||
64 | |||
65 | CREATE TABLE `is_a` ( | ||
66 | `notion_id` INTEGER NOT NULL, | ||
67 | `groupname` VARCHAR(32) NOT NULL | ||
125 | ); | 68 | ); |
126 | 69 | ||
127 | DROP TABLE IF EXISTS `adverb_antonymy`; | 70 | CREATE TABLE `entailment` ( |
128 | CREATE TABLE `adverb_antonymy` ( | 71 | `given_id` INTEGER NOT NULL, |
129 | `adverb_1_id` INTEGER NOT NULL, | 72 | `entailment_id` INTEGER NOT NULL |
130 | `adverb_2_id` INTEGER NOT NULL, | 73 | ); |
131 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | 74 | |
132 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | 75 | CREATE INDEX `entailment_of` ON `entailment`(`given_id`); |
76 | CREATE INDEX `entailed_by` ON `entailment`(`entailment_id`); | ||
77 | |||
78 | CREATE TABLE `causality` ( | ||
79 | `cause_id` INTEGER NOT NULL, | ||
80 | `effect_id` INTEGER NOT NULL | ||
81 | ); | ||
82 | |||
83 | CREATE INDEX `effect_of` ON `causality`(`cause_id`); | ||
84 | CREATE INDEX `cause_of` ON `causality`(`effect_id`); | ||
85 | |||
86 | CREATE TABLE `words` ( | ||
87 | `word_id` INTEGER PRIMARY KEY, | ||
88 | `notion_id` INTEGER NOT NULL, | ||
89 | `lemma_id` INTEGER NOT NULL, | ||
90 | `tag_count` INTEGER, | ||
91 | `position` SMALLINT, | ||
92 | `group_id` INTEGER | ||
93 | ); | ||
94 | |||
95 | CREATE INDEX `notion_words` ON `words`(`notion_id`); | ||
96 | CREATE INDEX `lemma_words` ON `words`(`lemma_id`); | ||
97 | CREATE INDEX `group_words` ON `words`(`group_id`); | ||
98 | |||
99 | CREATE TABLE `antonymy` ( | ||
100 | `antonym_1_id` INTEGER NOT NULL, | ||
101 | `antonym_2_id` INTEGER NOT NULL | ||
133 | ); | 102 | ); |
134 | 103 | ||
135 | DROP TABLE IF EXISTS `specification`; | 104 | CREATE INDEX `antonym_of` ON `antonymy`(`antonym_1_id`); |
105 | |||
136 | CREATE TABLE `specification` ( | 106 | CREATE TABLE `specification` ( |
137 | `general_id` INTEGER NOT NULL, | 107 | `general_id` INTEGER NOT NULL, |
138 | `specific_id` INTEGER NOT NULL, | 108 | `specific_id` INTEGER NOT NULL |
139 | FOREIGN KEY (`general_id`) REFERENCES `adjectives`(`adjective_id`), | ||
140 | FOREIGN KEY (`specific_id`) REFERENCES `adjectives`(`adjective_id`) | ||
141 | ); | 109 | ); |
142 | 110 | ||
143 | DROP TABLE IF EXISTS `pertainymy`; | 111 | CREATE INDEX `specification_of` ON `specification`(`general_id`); |
112 | CREATE INDEX `generalization_of` ON `specification`(`specific_id`); | ||
113 | |||
144 | CREATE TABLE `pertainymy` ( | 114 | CREATE TABLE `pertainymy` ( |
145 | `noun_id` INTEGER NOT NULL, | 115 | `noun_id` INTEGER NOT NULL, |
146 | `pertainym_id` INTEGER NOT NULL, | 116 | `pertainym_id` INTEGER NOT NULL |
147 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
148 | FOREIGN KEY (`pertainym_id`) REFERENCES `adjectives`(`adjective_id`) | ||
149 | ); | 117 | ); |
150 | 118 | ||
151 | DROP TABLE IF EXISTS `mannernymy`; | 119 | CREATE INDEX `pertainym_of` ON `pertainymy`(`noun_id`); |
120 | CREATE INDEX `anti_pertainym_of` ON `pertainymy`(`pertainym_id`); | ||
121 | |||
152 | CREATE TABLE `mannernymy` ( | 122 | CREATE TABLE `mannernymy` ( |
153 | `adjective_id` INTEGER NOT NULL, | 123 | `adjective_id` INTEGER NOT NULL, |
154 | `mannernym_id` INTEGER NOT NULL, | 124 | `mannernym_id` INTEGER NOT NULL |
155 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`), | ||
156 | FOREIGN KEY (`mannernym_id`) REFERENCES `adverbs`(`adverb_id`) | ||
157 | ); | 125 | ); |
158 | 126 | ||
159 | DROP TABLE IF EXISTS `noun_synonymy`; | 127 | CREATE INDEX `mannernym_of` ON `mannernymy`(`adjective_id`); |
160 | CREATE TABLE `noun_synonymy` ( | 128 | CREATE INDEX `anti_mannernym_of` ON `mannernymy`(`mannernym_id`); |
161 | `noun_1_id` INTEGER NOT NULL, | ||
162 | `noun_2_id` INTEGER NOT NULL, | ||
163 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`nouns_id`), | ||
164 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`nouns_id`) | ||
165 | ); | ||
166 | 129 | ||
167 | DROP TABLE IF EXISTS `adjective_synonymy`; | 130 | CREATE TABLE `usage` ( |
168 | CREATE TABLE `adjective_synonymy` ( | 131 | `domain_id` INTEGER NOT NULL, |
169 | `adjective_1_id` INTEGER NOT NULL, | 132 | `term_id` INTEGER NOT NULL |
170 | `adjective_2_id` INTEGER NOT NULL, | ||
171 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | ||
172 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | ||
173 | ); | 133 | ); |
174 | 134 | ||
175 | DROP TABLE IF EXISTS `adverb_synonymy`; | 135 | CREATE INDEX `usage_term_of` ON `usage`(`domain_id`); |
176 | CREATE TABLE `adverb_synonymy` ( | 136 | CREATE INDEX `usage_domain_of` ON `usage`(`term_id`); |
177 | `adverb_1_id` INTEGER NOT NULL, | ||
178 | `adverb_2_id` INTEGER NOT NULL, | ||
179 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | ||
180 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | ||
181 | ); | ||
182 | 137 | ||
183 | DROP TABLE IF EXISTS `noun_pronunciations`; | 138 | CREATE TABLE `topicality` ( |
184 | CREATE TABLE `noun_pronunciations` ( | 139 | `domain_id` INTEGER NOT NULL, |
185 | `noun_id` INTEGER NOT NULL, | 140 | `term_id` INTEGER NOT NULL |
186 | `pronunciation` VARCHAR(64) NOT NULL, | ||
187 | `prerhyme` VARCHAR(8), | ||
188 | `rhyme` VARCHAR(64), | ||
189 | `syllables` INT NOT NULL, | ||
190 | `stress` VARCHAR(64) NOT NULL, | ||
191 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`) | ||
192 | ); | 141 | ); |
193 | 142 | ||
194 | DROP TABLE IF EXISTS `verb_pronunciations`; | 143 | CREATE INDEX `topical_term_of` ON `topicality`(`domain_id`); |
195 | CREATE TABLE `verb_pronunciations` ( | 144 | CREATE INDEX `topical_domain_of` ON `topicality`(`term_id`); |
196 | `verb_id` INTEGER NOT NULL, | ||
197 | `pronunciation` VARCHAR(64) NOT NULL, | ||
198 | `prerhyme` VARCHAR(8), | ||
199 | `rhyme` VARCHAR(64), | ||
200 | `syllables` INT NOT NULL, | ||
201 | `stress` VARCHAR(64) NOT NULL, | ||
202 | FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`) | ||
203 | ); | ||
204 | 145 | ||
205 | DROP TABLE IF EXISTS `adjective_pronunciations`; | 146 | CREATE TABLE `regionality` ( |
206 | CREATE TABLE `adjective_pronunciations` ( | 147 | `domain_id` INTEGER NOT NULL, |
207 | `adjective_id` INTEGER NOT NULL, | 148 | `term_id` INTEGER NOT NULL |
208 | `pronunciation` VARCHAR(64) NOT NULL, | ||
209 | `prerhyme` VARCHAR(8), | ||
210 | `rhyme` VARCHAR(64), | ||
211 | `syllables` INT NOT NULL, | ||
212 | `stress` VARCHAR(64) NOT NULL, | ||
213 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
214 | ); | 149 | ); |
215 | 150 | ||
216 | DROP TABLE IF EXISTS `adverb_pronunciations`; | 151 | CREATE INDEX `regional_term_of` ON `regionality`(`domain_id`); |
217 | CREATE TABLE `adverb_pronunciations` ( | 152 | CREATE INDEX `regional_domain_of` ON `regionality`(`term_id`); |
218 | `adverb_id` INTEGER NOT NULL, | ||
219 | `pronunciation` VARCHAR(64) NOT NULL, | ||
220 | `prerhyme` VARCHAR(8), | ||
221 | `rhyme` VARCHAR(64), | ||
222 | `syllables` INT NOT NULL, | ||
223 | `stress` VARCHAR(64) NOT NULL, | ||
224 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`) | ||
225 | ); | ||
226 | 153 | ||
227 | DROP TABLE IF EXISTS `noun_noun_derivation`; | 154 | CREATE TABLE `forms` ( |
228 | CREATE TABLE `noun_noun_derivation` ( | 155 | `form_id` INTEGER PRIMARY KEY, |
229 | `noun_1_id` INTEGER NOT NULL, | 156 | `form` VARCHAR(32) NOT NULL, |
230 | `noun_2_id` INTEGER NOT NULL, | 157 | `complexity` SMALLINT NOT NULL, |
231 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), | 158 | `proper` SMALLINT NOT NULL |
232 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`) | ||
233 | ); | 159 | ); |
234 | 160 | ||
235 | DROP TABLE IF EXISTS `noun_adjective_derivation`; | 161 | CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); |
236 | CREATE TABLE `noun_adjective_derivation` ( | ||
237 | `noun_id` INTEGER NOT NULL, | ||
238 | `adjective_id` INTEGER NOT NULL, | ||
239 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
240 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
241 | ); | ||
242 | 162 | ||
243 | DROP TABLE IF EXISTS `noun_adverb_derivation`; | 163 | CREATE TABLE `lemmas_forms` ( |
244 | CREATE TABLE `noun_adverb_derivation` ( | 164 | `lemma_id` INTEGER NOT NULL, |
245 | `noun_id` INTEGER NOT NULL, | 165 | `form_id` INTEGER NOT NULL, |
246 | `adverb_id` INTEGER NOT NULL, | 166 | `category` SMALLINT NOT NULL |
247 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
248 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`) | ||
249 | ); | 167 | ); |
250 | 168 | ||
251 | DROP TABLE IF EXISTS `adjective_adjective_derivation`; | 169 | CREATE INDEX `form_of` ON `lemmas_forms`(`lemma_id`); |
252 | CREATE TABLE `adjective_adjective_derivation` ( | 170 | CREATE INDEX `lemma_of` ON `lemmas_forms`(`form_id`); |
253 | `adjective_1_id` INTEGER NOT NULL, | 171 | |
254 | `adjective_2_id` INTEGER NOT NULL, | 172 | CREATE TABLE `pronunciations` ( |
255 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | 173 | `pronunciation_id` INTEGER PRIMARY KEY, |
256 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | 174 | `phonemes` VARCHAR(64) NOT NULL, |
175 | `prerhyme` VARCHAR(8), | ||
176 | `rhyme` VARCHAR(64), | ||
177 | `syllables` INTEGER NOT NULL, | ||
178 | `stress` VARCHAR(64) NOT NULL | ||
257 | ); | 179 | ); |
258 | 180 | ||
259 | DROP TABLE IF EXISTS `adjective_adverb_derivation`; | 181 | CREATE TABLE `forms_pronunciations` ( |
260 | CREATE TABLE `adjective_adverb_derivation` ( | 182 | `form_id` INTEGER NOT NULL, |
261 | `adjective_id` INTEGER NOT NULL, | 183 | `pronunciation_id` INTEGER NOT NULL |
262 | `adverb_id` INTEGER NOT NULL, | ||
263 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`), | ||
264 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adjective_id`) | ||
265 | ); | 184 | ); |
266 | 185 | ||
267 | DROP TABLE IF EXISTS `adverb_adverb_derivation`; | 186 | CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`); |
268 | CREATE TABLE `adverb_adverb_derivation` ( | 187 | CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`); |
269 | `adverb_1_id` INTEGER NOT NULL, | 188 | |
270 | `adverb_2_id` INTEGER NOT NULL, | 189 | CREATE TABLE `groups` ( |
271 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | 190 | `group_id` INTEGER PRIMARY KEY, |
272 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | 191 | `data` BLOB NOT NULL |
273 | ); | 192 | ); |
274 | 193 | ||
275 | DROP TABLE IF EXISTS `prepositions`; | 194 | CREATE TABLE `frames` ( |
276 | CREATE TABLE `prepositions` ( | 195 | `frame_id` INTEGER PRIMARY KEY, |
277 | `preposition_id` INTEGER PRIMARY KEY, | 196 | `data` BLOB NOT NULL |
278 | `form` VARCHAR(32) NOT NULL | ||
279 | ); | 197 | ); |
280 | 198 | ||
281 | DROP TABLE IF EXISTS `preposition_groups`; | 199 | CREATE TABLE `groups_frames` ( |
282 | CREATE TABLE `preposition_groups` ( | 200 | `group_id` INTEGER NOT NULL, |
283 | `preposition_id` INTEGER NOT NULL, | 201 | `frame_id` INTEGER NOT NULL |
284 | `groupname` VARCHAR(32) NOT NULL, | ||
285 | FOREIGN KEY (`preposition_id`) REFERENCES `prepositions`(`preposition_id`) | ||
286 | ); | 202 | ); |
203 | |||
204 | CREATE INDEX `frames_in` ON `groups_frames`(`group_id`); | ||
diff --git a/generator/selrestr.cpp b/generator/selrestr.cpp new file mode 100644 index 0000000..8bdd3f6 --- /dev/null +++ b/generator/selrestr.cpp | |||
@@ -0,0 +1,288 @@ | |||
1 | #include "selrestr.h" | ||
2 | |||
3 | namespace verbly { | ||
4 | namespace generator { | ||
5 | |||
    // Copy constructor. The members of the anonymous union are
    // non-trivial, so the active member (selected by other.type_) must
    // be constructed explicitly with placement new rather than copied
    // memberwise.
    selrestr::selrestr(const selrestr& other)
    {
      type_ = other.type_;

      switch (type_)
      {
        case type::singleton:
        {
          singleton_.pos = other.singleton_.pos;
          new(&singleton_.restriction) std::string(other.singleton_.restriction);

          break;
        }

        case type::group:
        {
          new(&group_.children) std::list<selrestr>(other.group_.children);
          group_.orlogic = other.group_.orlogic;

          break;
        }

        case type::empty:
        {
          // Empty selrestrs have no active union member to copy.
          break;
        }
      }
    }
34 | |||
    // Move constructor: delegate to the default constructor to get a
    // valid empty state, then swap with the source, leaving it empty.
    selrestr::selrestr(selrestr&& other) : selrestr()
    {
      swap(*this, other);
    }
39 | |||
    // Copy-and-swap assignment. The parameter is taken by value, so
    // this single overload serves as both copy- and move-assignment.
    selrestr& selrestr::operator=(selrestr other)
    {
      swap(*this, other);

      return *this;
    }
46 | |||
    // Swaps two selrestrs. Because the union members are non-trivial,
    // this cannot be a simple memberwise swap: the active member of
    // each side is moved out, the object's destructor is run to end
    // that member's lifetime, and the other side's active member is
    // reconstructed in place with placement new.
    void swap(selrestr& first, selrestr& second)
    {
      using type = selrestr::type;

      // Stash first's state in temporaries. Only the temporaries that
      // correspond to tempType are ever written and later read back.
      type tempType = first.type_;
      int tempPos;
      std::string tempRestriction;
      std::list<selrestr> tempChildren;
      bool tempOrlogic;

      switch (tempType)
      {
        case type::singleton:
        {
          tempPos = first.singleton_.pos;
          tempRestriction = std::move(first.singleton_.restriction);

          break;
        }

        case type::group:
        {
          tempChildren = std::move(first.group_.children);
          tempOrlogic = first.group_.orlogic;

          break;
        }

        case type::empty:
        {
          break;
        }
      }

      // Destroy first's active member, then rebuild first from second.
      first.~selrestr();

      first.type_ = second.type_;

      switch (first.type_)
      {
        case type::singleton:
        {
          first.singleton_.pos = second.singleton_.pos;
          new(&first.singleton_.restriction) std::string(std::move(second.singleton_.restriction));

          break;
        }

        case type::group:
        {
          new(&first.group_.children) std::list<selrestr>(std::move(second.group_.children));
          first.group_.orlogic = second.group_.orlogic;

          break;
        }

        case type::empty:
        {
          break;
        }
      }

      // Destroy second's active member, then rebuild second from the
      // temporaries saved from first.
      second.~selrestr();

      second.type_ = tempType;

      switch (second.type_)
      {
        case type::singleton:
        {
          second.singleton_.pos = tempPos;
          new(&second.singleton_.restriction) std::string(std::move(tempRestriction));

          break;
        }

        case type::group:
        {
          new(&second.group_.children) std::list<selrestr>(std::move(tempChildren));
          second.group_.orlogic = tempOrlogic;

          break;
        }

        case type::empty:
        {
          break;
        }
      }
    }
137 | |||
    // Destructor: explicitly destroys whichever union member is active.
    // The local type aliases are needed because a pseudo-destructor
    // call cannot spell a name like std::list<selrestr> directly.
    selrestr::~selrestr()
    {
      switch (type_)
      {
        case type::singleton:
        {
          using string_type = std::string;
          singleton_.restriction.~string_type();

          break;
        }

        case type::group:
        {
          using list_type = std::list<selrestr>;
          group_.children.~list_type();

          break;
        }

        case type::empty:
        {
          // No active union member; nothing to destroy.
          break;
        }
      }
    }
164 | |||
    // Default constructor: an empty selrestr with no active union
    // member.
    selrestr::selrestr() : type_(type::empty)
    {
    }
168 | |||
    // Singleton constructor: one named restriction plus a flag
    // indicating whether the restriction is asserted positively or
    // negatively. The string member is activated via placement new.
    selrestr::selrestr(
      std::string restriction,
      bool pos) :
      type_(type::singleton)
    {
      new(&singleton_.restriction) std::string(std::move(restriction));
      singleton_.pos = pos;
    }
177 | |||
178 | std::string selrestr::getRestriction() const | ||
179 | { | ||
180 | if (type_ == type::singleton) | ||
181 | { | ||
182 | return singleton_.restriction; | ||
183 | } else { | ||
184 | throw std::domain_error("Only singleton selrestrs have restrictions"); | ||
185 | } | ||
186 | } | ||
187 | |||
188 | bool selrestr::getPos() const | ||
189 | { | ||
190 | if (type_ == type::singleton) | ||
191 | { | ||
192 | return singleton_.pos; | ||
193 | } else { | ||
194 | throw std::domain_error("Only singleton selrestrs have positivity flags"); | ||
195 | } | ||
196 | } | ||
197 | |||
    // Group constructor: a set of child selrestrs combined with
    // or-logic (orlogic == true) or and-logic. The list member is
    // activated via placement new.
    selrestr::selrestr(
      std::list<selrestr> children,
      bool orlogic) :
      type_(type::group)
    {
      new(&group_.children) std::list<selrestr>(std::move(children));
      group_.orlogic = orlogic;
    }
206 | |||
207 | std::list<selrestr> selrestr::getChildren() const | ||
208 | { | ||
209 | if (type_ == type::group) | ||
210 | { | ||
211 | return group_.children; | ||
212 | } else { | ||
213 | throw std::domain_error("Only group selrestrs have children"); | ||
214 | } | ||
215 | } | ||
216 | |||
217 | std::list<selrestr>::const_iterator selrestr::begin() const | ||
218 | { | ||
219 | if (type_ == type::group) | ||
220 | { | ||
221 | return std::begin(group_.children); | ||
222 | } else { | ||
223 | throw std::domain_error("Only group selrestrs have children"); | ||
224 | } | ||
225 | } | ||
226 | |||
227 | std::list<selrestr>::const_iterator selrestr::end() const | ||
228 | { | ||
229 | if (type_ == type::group) | ||
230 | { | ||
231 | return std::end(group_.children); | ||
232 | } else { | ||
233 | throw std::domain_error("Only group selrestrs have children"); | ||
234 | } | ||
235 | } | ||
236 | |||
237 | bool selrestr::getOrlogic() const | ||
238 | { | ||
239 | if (type_ == type::group) | ||
240 | { | ||
241 | return group_.orlogic; | ||
242 | } else { | ||
243 | throw std::domain_error("Only group selrestrs have logic"); | ||
244 | } | ||
245 | } | ||
246 | |||
247 | nlohmann::json selrestr::toJson() const | ||
248 | { | ||
249 | switch (type_) | ||
250 | { | ||
251 | case type::empty: | ||
252 | { | ||
253 | return {}; | ||
254 | } | ||
255 | |||
256 | case type::singleton: | ||
257 | { | ||
258 | return { | ||
259 | {"type", singleton_.restriction}, | ||
260 | {"pos", singleton_.pos} | ||
261 | }; | ||
262 | } | ||
263 | |||
264 | case type::group: | ||
265 | { | ||
266 | std::string logic; | ||
267 | if (group_.orlogic) | ||
268 | { | ||
269 | logic = "or"; | ||
270 | } else { | ||
271 | logic = "and"; | ||
272 | } | ||
273 | |||
274 | std::list<nlohmann::json> children; | ||
275 | std::transform(std::begin(group_.children), std::end(group_.children), std::back_inserter(children), [] (const selrestr& child) { | ||
276 | return child.toJson(); | ||
277 | }); | ||
278 | |||
279 | return { | ||
280 | {"logic", logic}, | ||
281 | {"children", children} | ||
282 | }; | ||
283 | } | ||
284 | } | ||
285 | } | ||
286 | |||
287 | }; | ||
288 | }; | ||
diff --git a/generator/selrestr.h b/generator/selrestr.h new file mode 100644 index 0000000..5000970 --- /dev/null +++ b/generator/selrestr.h | |||
@@ -0,0 +1,88 @@ | |||
1 | #ifndef SELRESTR_H_50652FB7 | ||
2 | #define SELRESTR_H_50652FB7 | ||
3 | |||
4 | #include <list> | ||
5 | #include <string> | ||
6 | #include <json.hpp> | ||
7 | |||
8 | namespace verbly { | ||
9 | namespace generator { | ||
10 | |||
    // A selectional restriction: a tagged union that is either empty,
    // a singleton (one named restriction with a positive/negative
    // flag), or a group of child selrestrs combined with and- or
    // or-logic.
    //
    // The anonymous union holds non-trivial members, so construction,
    // destruction, copying, and swapping are all managed manually in
    // selrestr.cpp using placement new and explicit destructor calls.
    class selrestr {
    public:
      // Discriminator for the anonymous union below.
      enum class type {
        empty,
        singleton,
        group
      };

      // Copy and move constructors

      selrestr(const selrestr& other);
      selrestr(selrestr&& other);

      // Assignment (by-value parameter: copy-and-swap handles both
      // copy- and move-assignment)

      selrestr& operator=(selrestr other);

      // Swap

      friend void swap(selrestr& first, selrestr& second);

      // Destructor

      ~selrestr();

      // Generic accessors

      type getType() const
      {
        return type_;
      }

      // Empty

      selrestr();

      // Singleton

      selrestr(std::string restriction, bool pos);

      // Throws std::domain_error unless this is a singleton.
      std::string getRestriction() const;

      // Throws std::domain_error unless this is a singleton.
      bool getPos() const;

      // Group

      selrestr(std::list<selrestr> children, bool orlogic);

      // The group accessors below throw std::domain_error unless this
      // is a group.
      std::list<selrestr> getChildren() const;

      std::list<selrestr>::const_iterator begin() const;

      std::list<selrestr>::const_iterator end() const;

      bool getOrlogic() const;

      // Helpers

      // Serializes to JSON; see selrestr.cpp for the exact shape.
      nlohmann::json toJson() const;

    private:
      // Exactly one member is active, as indicated by type_ (none for
      // type::empty). Layout is relied upon by selrestr.cpp.
      union {
        struct {
          bool pos;
          std::string restriction;
        } singleton_;
        struct {
          std::list<selrestr> children;
          bool orlogic;
        } group_;
      };
      type type_;
    };
84 | |||
85 | }; | ||
86 | }; | ||
87 | |||
88 | #endif /* end of include guard: SELRESTR_H_50652FB7 */ | ||
diff --git a/generator/word.cpp b/generator/word.cpp new file mode 100644 index 0000000..8ba3ce2 --- /dev/null +++ b/generator/word.cpp | |||
@@ -0,0 +1,77 @@ | |||
1 | #include "word.h" | ||
2 | #include <list> | ||
3 | #include <string> | ||
4 | #include "database.h" | ||
5 | #include "notion.h" | ||
6 | #include "lemma.h" | ||
7 | #include "field.h" | ||
8 | #include "group.h" | ||
9 | |||
10 | namespace verbly { | ||
11 | namespace generator { | ||
12 | |||
13 | int word::nextId_ = 0; | ||
14 | |||
    // Constructs a word joining the given notion and lemma, with no
    // tag count. Ids are assigned sequentially from a static counter.
    word::word(
      notion& n,
      lemma& l) :
      id_(nextId_++),
      notion_(n),
      lemma_(l)
    {
    }
23 | |||
    // Constructs a word joining the given notion and lemma, and
    // records the given tag count (serialized by operator<< below),
    // marking it as present.
    word::word(
      notion& n,
      lemma& l,
      int tagCount) :
      id_(nextId_++),
      notion_(n),
      lemma_(l),
      tagCount_(tagCount),
      hasTagCount_(true)
    {
    }
35 | |||
    // Records the positioning for this word; only serialized when the
    // word's notion is an adjective (see operator<< below).
    void word::setAdjectivePosition(positioning adjectivePosition)
    {
      adjectivePosition_ = adjectivePosition;
    }
40 | |||
    // Records a non-owning pointer to the verb group; the group must
    // outlive this word. Only serialized when the word's notion is a
    // verb (see operator<< below).
    void word::setVerbGroup(const group& verbGroup)
    {
      verbGroup_ = &verbGroup;
    }
45 | |||
46 | database& operator<<(database& db, const word& arg) | ||
47 | { | ||
48 | std::list<field> fields; | ||
49 | |||
50 | fields.emplace_back("word_id", arg.getId()); | ||
51 | fields.emplace_back("notion_id", arg.getNotion().getId()); | ||
52 | fields.emplace_back("lemma_id", arg.getLemma().getId()); | ||
53 | |||
54 | if (arg.hasTagCount()) | ||
55 | { | ||
56 | fields.emplace_back("tag_count", arg.getTagCount()); | ||
57 | } | ||
58 | |||
59 | if ((arg.getNotion().getPartOfSpeech() == part_of_speech::adjective) | ||
60 | && (arg.getAdjectivePosition() != positioning::undefined)) | ||
61 | { | ||
62 | fields.emplace_back("position", static_cast<int>(arg.getAdjectivePosition())); | ||
63 | } | ||
64 | |||
65 | if ((arg.getNotion().getPartOfSpeech() == part_of_speech::verb) | ||
66 | && (arg.hasVerbGroup())) | ||
67 | { | ||
68 | fields.emplace_back("group_id", arg.getVerbGroup().getId()); | ||
69 | } | ||
70 | |||
71 | db.insertIntoTable("words", std::move(fields)); | ||
72 | |||
73 | return db; | ||
74 | } | ||
75 | |||
76 | }; | ||
77 | }; | ||
diff --git a/generator/word.h b/generator/word.h new file mode 100644 index 0000000..bfed586 --- /dev/null +++ b/generator/word.h | |||
@@ -0,0 +1,110 @@ | |||
1 | #ifndef WORD_H_91F99D46 | ||
2 | #define WORD_H_91F99D46 | ||
3 | |||
4 | #include <cassert> | ||
5 | #include "enums.h" | ||
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
10 | class notion; | ||
11 | class lemma; | ||
12 | class database; | ||
13 | class group; | ||
14 | |||
    // A word pairs a notion (a meaning) with a lemma (a base spelling),
    // and carries per-word metadata: an optional tag count, an optional
    // adjective positioning, and an optional verb group. Each instance
    // receives a unique sequential ID used as its database primary key.
    // Holds references to its notion and lemma, which must outlive it.
    class word {
    public:

      // Constructors

      // Joins a notion and a lemma, with no tag count.
      word(notion& n, lemma& l);

      // Joins a notion and a lemma, additionally recording a tag count.
      word(notion& n, lemma& l, int tagCount);

      // Mutators

      // Records the adjective positioning; meaningful only for adjectives.
      void setAdjectivePosition(positioning adjectivePosition);

      // Associates a verb group (non-owning; the group must outlive this
      // word); meaningful only for verbs.
      void setVerbGroup(const group& verbGroup);

      // Accessors

      // Returns the unique sequential ID assigned at construction.
      int getId() const
      {
        return id_;
      }

      notion& getNotion()
      {
        return notion_;
      }

      const notion& getNotion() const
      {
        return notion_;
      }

      lemma& getLemma()
      {
        return lemma_;
      }

      const lemma& getLemma() const
      {
        return lemma_;
      }

      // Whether a tag count was supplied at construction.
      bool hasTagCount() const
      {
        return hasTagCount_;
      }

      int getTagCount() const
      {
        // Calling code should always call hasTagCount first.
        assert(hasTagCount_);

        return tagCount_;
      }

      // Returns positioning::undefined unless setAdjectivePosition was called.
      positioning getAdjectivePosition() const
      {
        return adjectivePosition_;
      }

      // Whether setVerbGroup has been called.
      bool hasVerbGroup() const
      {
        return (verbGroup_ != nullptr);
      }

      const group& getVerbGroup() const
      {
        // Calling code should always call hasVerbGroup first.
        assert(verbGroup_ != nullptr);

        return *verbGroup_;
      }

    private:

      // Source of sequential IDs; incremented by each constructor.
      static int nextId_;

      const int id_;
      notion& notion_;
      lemma& lemma_;
      const int tagCount_ = 0;       // valid only when hasTagCount_ is true
      const bool hasTagCount_ = false;

      positioning adjectivePosition_ = positioning::undefined;
      const group* verbGroup_ = nullptr;  // non-owning; null until setVerbGroup

    };
102 | |||
    // Serializer

    // Writes the word as one row of the "words" table; returns db for chaining.
    database& operator<<(database& db, const word& arg);
106 | |||
107 | }; | ||
108 | }; | ||
109 | |||
110 | #endif /* end of include guard: WORD_H_91F99D46 */ | ||