diff options
Diffstat (limited to 'generator')
-rw-r--r-- | generator/CMakeLists.txt | 6 | ||||
-rw-r--r-- | generator/database.cpp | 173 | ||||
-rw-r--r-- | generator/database.h | 73 | ||||
-rw-r--r-- | generator/field.cpp | 193 | ||||
-rw-r--r-- | generator/field.h | 76 | ||||
-rw-r--r-- | generator/form.cpp | 53 | ||||
-rw-r--r-- | generator/form.h | 71 | ||||
-rw-r--r-- | generator/frame.cpp | 83 | ||||
-rw-r--r-- | generator/frame.h | 59 | ||||
-rw-r--r-- | generator/generator.cpp | 3145 | ||||
-rw-r--r-- | generator/generator.h | 151 | ||||
-rw-r--r-- | generator/group.cpp | 119 | ||||
-rw-r--r-- | generator/group.h | 80 | ||||
-rw-r--r-- | generator/lemma.cpp | 65 | ||||
-rw-r--r-- | generator/lemma.h | 58 | ||||
-rw-r--r-- | generator/main.cpp | 40 | ||||
-rw-r--r-- | generator/notion.cpp | 85 | ||||
-rw-r--r-- | generator/notion.h | 91 | ||||
-rw-r--r-- | generator/part.cpp | 336 | ||||
-rw-r--r-- | generator/part.h | 114 | ||||
-rw-r--r-- | generator/progress.h | 78 | ||||
-rw-r--r-- | generator/pronunciation.cpp | 87 | ||||
-rw-r--r-- | generator/pronunciation.h | 82 | ||||
-rw-r--r-- | generator/role.h | 35 | ||||
-rw-r--r-- | generator/schema.sql | 352 | ||||
-rw-r--r-- | generator/selrestr.cpp | 288 | ||||
-rw-r--r-- | generator/selrestr.h | 88 | ||||
-rw-r--r-- | generator/word.cpp | 77 | ||||
-rw-r--r-- | generator/word.h | 110 |
29 files changed, 4018 insertions, 2250 deletions
diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt index 552526d..4f78eb8 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt | |||
@@ -1,12 +1,12 @@ | |||
1 | cmake_minimum_required (VERSION 2.6) | 1 | cmake_minimum_required (VERSION 3.1) |
2 | project (generator) | 2 | project (generator) |
3 | 3 | ||
4 | find_package(PkgConfig) | 4 | find_package(PkgConfig) |
5 | pkg_check_modules(sqlite3 sqlite3 REQUIRED) | 5 | pkg_check_modules(sqlite3 sqlite3 REQUIRED) |
6 | find_package(libxml2 REQUIRED) | 6 | find_package(libxml2 REQUIRED) |
7 | 7 | ||
8 | include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json/src) | 8 | include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json) |
9 | add_executable(generator generator.cpp) | 9 | add_executable(generator notion.cpp word.cpp lemma.cpp form.cpp pronunciation.cpp group.cpp frame.cpp part.cpp selrestr.cpp database.cpp field.cpp generator.cpp main.cpp) |
10 | set_property(TARGET generator PROPERTY CXX_STANDARD 11) | 10 | set_property(TARGET generator PROPERTY CXX_STANDARD 11) |
11 | set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) | 11 | set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) |
12 | target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) | 12 | target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) |
diff --git a/generator/database.cpp b/generator/database.cpp new file mode 100644 index 0000000..c7e4cfa --- /dev/null +++ b/generator/database.cpp | |||
@@ -0,0 +1,173 @@ | |||
1 | #include "database.h" | ||
2 | #include <sqlite3.h> | ||
3 | #include <cassert> | ||
4 | #include <fstream> | ||
5 | #include <stdexcept> | ||
6 | #include <cstdio> | ||
7 | #include <sstream> | ||
8 | #include "field.h" | ||
9 | #include "../lib/util.h" | ||
10 | |||
11 | namespace verbly { | ||
12 | namespace generator { | ||
13 | |||
14 | sqlite3_error::sqlite3_error( | ||
15 | const std::string& what, | ||
16 | const std::string& db_err) : | ||
17 | what_(what + " (" + db_err + ")"), | ||
18 | db_err_(db_err) | ||
19 | { | ||
20 | } | ||
21 | |||
22 | const char* sqlite3_error::what() const noexcept | ||
23 | { | ||
24 | return what_.c_str(); | ||
25 | } | ||
26 | |||
27 | const char* sqlite3_error::db_err() const noexcept | ||
28 | { | ||
29 | return db_err_.c_str(); | ||
30 | } | ||
31 | |||
32 | database::database(std::string path) | ||
33 | { | ||
34 | // If there is already a file at this path, overwrite it. | ||
35 | if (std::ifstream(path)) | ||
36 | { | ||
37 | if (std::remove(path.c_str())) | ||
38 | { | ||
39 | throw std::logic_error("Could not overwrite file at path"); | ||
40 | } | ||
41 | } | ||
42 | |||
43 | if (sqlite3_open_v2(path.c_str(), &ppdb_, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | ||
44 | { | ||
45 | // We still have to free the resources allocated. In the event that | ||
46 | // allocation failed, ppdb will be null and sqlite3_close_v2 will just | ||
47 | // ignore it. | ||
48 | std::string errmsg(sqlite3_errmsg(ppdb_)); | ||
49 | sqlite3_close_v2(ppdb_); | ||
50 | |||
51 | throw sqlite3_error("Could not create output datafile", errmsg); | ||
52 | } | ||
53 | } | ||
54 | |||
55 | database::database(database&& other) : database() | ||
56 | { | ||
57 | swap(*this, other); | ||
58 | } | ||
59 | |||
60 | database& database::operator=(database&& other) | ||
61 | { | ||
62 | swap(*this, other); | ||
63 | |||
64 | return *this; | ||
65 | } | ||
66 | |||
67 | void swap(database& first, database& second) | ||
68 | { | ||
69 | std::swap(first.ppdb_, second.ppdb_); | ||
70 | } | ||
71 | |||
72 | database::~database() | ||
73 | { | ||
74 | sqlite3_close_v2(ppdb_); | ||
75 | } | ||
76 | |||
77 | void database::runQuery(std::string query) | ||
78 | { | ||
79 | // This can only happen when doing bad things with move semantics. | ||
80 | assert(ppdb_ != nullptr); | ||
81 | |||
82 | sqlite3_stmt* ppstmt; | ||
83 | |||
84 | if (sqlite3_prepare_v2(ppdb_, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
85 | { | ||
86 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
87 | } | ||
88 | |||
89 | int result = sqlite3_step(ppstmt); | ||
90 | sqlite3_finalize(ppstmt); | ||
91 | |||
92 | if (result != SQLITE_DONE) | ||
93 | { | ||
94 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
95 | } | ||
96 | } | ||
97 | |||
98 | void database::insertIntoTable(std::string table, std::list<field> fields) | ||
99 | { | ||
100 | // This can only happen when doing bad things with move semantics. | ||
101 | assert(ppdb_ != nullptr); | ||
102 | |||
103 | // This shouldn't happen. | ||
104 | assert(!fields.empty()); | ||
105 | |||
106 | std::list<std::string> fieldNames; | ||
107 | std::list<std::string> qs; | ||
108 | for (field& f : fields) | ||
109 | { | ||
110 | fieldNames.push_back(f.getName()); | ||
111 | qs.push_back("?"); | ||
112 | } | ||
113 | |||
114 | std::ostringstream query; | ||
115 | query << "INSERT INTO "; | ||
116 | query << table; | ||
117 | query << " ("; | ||
118 | query << implode(std::begin(fieldNames), std::end(fieldNames), ", "); | ||
119 | query << ") VALUES ("; | ||
120 | query << implode(std::begin(qs), std::end(qs), ", "); | ||
121 | query << ")"; | ||
122 | |||
123 | std::string query_str = query.str(); | ||
124 | |||
125 | sqlite3_stmt* ppstmt; | ||
126 | |||
127 | if (sqlite3_prepare_v2(ppdb_, query_str.c_str(), query_str.length(), &ppstmt, NULL) != SQLITE_OK) | ||
128 | { | ||
129 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
130 | } | ||
131 | |||
132 | int i = 1; | ||
133 | for (field& f : fields) | ||
134 | { | ||
135 | switch (f.getType()) | ||
136 | { | ||
137 | case field::type::integer: | ||
138 | { | ||
139 | sqlite3_bind_int(ppstmt, i, f.getInteger()); | ||
140 | |||
141 | break; | ||
142 | } | ||
143 | |||
144 | case field::type::string: | ||
145 | { | ||
146 | sqlite3_bind_text(ppstmt, i, f.getString().c_str(), f.getString().length(), SQLITE_TRANSIENT); | ||
147 | |||
148 | break; | ||
149 | } | ||
150 | |||
151 | case field::type::invalid: | ||
152 | { | ||
153 | // Fields can only be invalid when doing bad things with move semantics. | ||
154 | assert(false); | ||
155 | |||
156 | break; | ||
157 | } | ||
158 | } | ||
159 | |||
160 | i++; | ||
161 | } | ||
162 | |||
163 | int result = sqlite3_step(ppstmt); | ||
164 | sqlite3_finalize(ppstmt); | ||
165 | |||
166 | if (result != SQLITE_DONE) | ||
167 | { | ||
168 | throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_)); | ||
169 | } | ||
170 | } | ||
171 | |||
172 | }; | ||
173 | }; | ||
diff --git a/generator/database.h b/generator/database.h new file mode 100644 index 0000000..15cdff5 --- /dev/null +++ b/generator/database.h | |||
@@ -0,0 +1,73 @@ | |||
1 | #ifndef DATABASE_H_0B0A47D2 | ||
2 | #define DATABASE_H_0B0A47D2 | ||
3 | |||
4 | #include <string> | ||
5 | #include <exception> | ||
6 | #include <list> | ||
7 | |||
8 | struct sqlite3; | ||
9 | |||
10 | namespace verbly { | ||
11 | namespace generator { | ||
12 | |||
13 | class field; | ||
14 | |||
15 | class sqlite3_error : public std::exception { | ||
16 | public: | ||
17 | |||
18 | sqlite3_error(const std::string& what, const std::string& db_err); | ||
19 | |||
20 | const char* what() const noexcept override; | ||
21 | const char* db_err() const noexcept; | ||
22 | |||
23 | private: | ||
24 | std::string what_; | ||
25 | std::string db_err_; | ||
26 | |||
27 | }; | ||
28 | |||
29 | class database { | ||
30 | public: | ||
31 | |||
32 | // Constructor | ||
33 | |||
34 | explicit database(std::string path); | ||
35 | |||
36 | // Disable copying | ||
37 | |||
38 | database(const database& other) = delete; | ||
39 | database& operator=(const database& other) = delete; | ||
40 | |||
41 | // Move constructor and move assignment | ||
42 | |||
43 | database(database&& other); | ||
44 | database& operator=(database&& other); | ||
45 | |||
46 | // Swap | ||
47 | |||
48 | friend void swap(database& first, database& second); | ||
49 | |||
50 | // Destructor | ||
51 | |||
52 | ~database(); | ||
53 | |||
54 | // Actions | ||
55 | |||
56 | void runQuery(std::string query); | ||
57 | |||
58 | void insertIntoTable(std::string table, std::list<field> fields); | ||
59 | |||
60 | private: | ||
61 | |||
62 | database() | ||
63 | { | ||
64 | } | ||
65 | |||
66 | sqlite3* ppdb_ = nullptr; | ||
67 | |||
68 | }; | ||
69 | |||
70 | }; | ||
71 | }; | ||
72 | |||
73 | #endif /* end of include guard: DATABASE_H_0B0A47D2 */ | ||
diff --git a/generator/field.cpp b/generator/field.cpp new file mode 100644 index 0000000..84b2f91 --- /dev/null +++ b/generator/field.cpp | |||
@@ -0,0 +1,193 @@ | |||
1 | #include "field.h" | ||
2 | #include <stdexcept> | ||
3 | #include <utility> | ||
4 | |||
5 | namespace verbly { | ||
6 | namespace generator { | ||
7 | |||
8 | field::field(const field& other) | ||
9 | { | ||
10 | type_ = other.type_; | ||
11 | name_ = other.name_; | ||
12 | |||
13 | switch (type_) | ||
14 | { | ||
15 | case type::integer: | ||
16 | { | ||
17 | integer_ = other.integer_; | ||
18 | |||
19 | break; | ||
20 | } | ||
21 | |||
22 | case type::string: | ||
23 | { | ||
24 | new(&string_) std::string(other.string_); | ||
25 | |||
26 | break; | ||
27 | } | ||
28 | |||
29 | case type::invalid: | ||
30 | { | ||
31 | break; | ||
32 | } | ||
33 | } | ||
34 | } | ||
35 | |||
36 | field::field(field&& other) : field() | ||
37 | { | ||
38 | swap(*this, other); | ||
39 | } | ||
40 | |||
41 | field& field::operator=(field other) | ||
42 | { | ||
43 | swap(*this, other); | ||
44 | |||
45 | return *this; | ||
46 | } | ||
47 | |||
48 | void swap(field& first, field& second) | ||
49 | { | ||
50 | using type = field::type; | ||
51 | |||
52 | type tempType = first.type_; | ||
53 | std::string tempName = std::move(first.name_); | ||
54 | int tempInteger; | ||
55 | std::string tempString; | ||
56 | |||
57 | switch (first.type_) | ||
58 | { | ||
59 | case type::integer: | ||
60 | { | ||
61 | tempInteger = first.integer_; | ||
62 | |||
63 | break; | ||
64 | } | ||
65 | |||
66 | case type::string: | ||
67 | { | ||
68 | tempString = std::move(tempString); | ||
69 | |||
70 | break; | ||
71 | } | ||
72 | |||
73 | case type::invalid: | ||
74 | { | ||
75 | break; | ||
76 | } | ||
77 | } | ||
78 | |||
79 | first.~field(); | ||
80 | |||
81 | first.type_ = second.type_; | ||
82 | first.name_ = std::move(second.name_); | ||
83 | |||
84 | switch (second.type_) | ||
85 | { | ||
86 | case type::integer: | ||
87 | { | ||
88 | first.integer_ = second.integer_; | ||
89 | |||
90 | break; | ||
91 | } | ||
92 | |||
93 | case type::string: | ||
94 | { | ||
95 | new(&first.string_) std::string(std::move(second.string_)); | ||
96 | |||
97 | break; | ||
98 | } | ||
99 | |||
100 | case type::invalid: | ||
101 | { | ||
102 | break; | ||
103 | } | ||
104 | } | ||
105 | |||
106 | second.~field(); | ||
107 | |||
108 | second.type_ = tempType; | ||
109 | second.name_ = std::move(tempName); | ||
110 | |||
111 | switch (tempType) | ||
112 | { | ||
113 | case type::integer: | ||
114 | { | ||
115 | second.integer_ = tempInteger; | ||
116 | |||
117 | break; | ||
118 | } | ||
119 | |||
120 | case type::string: | ||
121 | { | ||
122 | new(&second.string_) std::string(std::move(tempString)); | ||
123 | |||
124 | break; | ||
125 | } | ||
126 | |||
127 | case type::invalid: | ||
128 | { | ||
129 | break; | ||
130 | } | ||
131 | } | ||
132 | } | ||
133 | |||
134 | field::~field() | ||
135 | { | ||
136 | switch (type_) | ||
137 | { | ||
138 | case type::string: | ||
139 | { | ||
140 | using string_type = std::string; | ||
141 | string_.~string_type(); | ||
142 | |||
143 | break; | ||
144 | } | ||
145 | |||
146 | case type::integer: | ||
147 | case type::invalid: | ||
148 | { | ||
149 | break; | ||
150 | } | ||
151 | } | ||
152 | } | ||
153 | |||
154 | field::field( | ||
155 | std::string name, | ||
156 | int arg) : | ||
157 | type_(type::integer), | ||
158 | name_(name), | ||
159 | integer_(arg) | ||
160 | { | ||
161 | } | ||
162 | |||
163 | int field::getInteger() const | ||
164 | { | ||
165 | if (type_ != type::integer) | ||
166 | { | ||
167 | throw std::domain_error("field::getInteger called on non-integer field"); | ||
168 | } | ||
169 | |||
170 | return integer_; | ||
171 | } | ||
172 | |||
173 | field::field( | ||
174 | std::string name, | ||
175 | std::string arg) : | ||
176 | type_(type::string), | ||
177 | name_(name) | ||
178 | { | ||
179 | new(&string_) std::string(arg); | ||
180 | } | ||
181 | |||
182 | std::string field::getString() const | ||
183 | { | ||
184 | if (type_ != type::string) | ||
185 | { | ||
186 | throw std::domain_error("field::getString called on non-string field"); | ||
187 | } | ||
188 | |||
189 | return string_; | ||
190 | } | ||
191 | |||
192 | }; | ||
193 | }; | ||
diff --git a/generator/field.h b/generator/field.h new file mode 100644 index 0000000..1fbabfc --- /dev/null +++ b/generator/field.h | |||
@@ -0,0 +1,76 @@ | |||
1 | #ifndef BINDING_H_CAE0B18E | ||
2 | #define BINDING_H_CAE0B18E | ||
3 | |||
4 | #include <string> | ||
5 | |||
6 | namespace verbly { | ||
7 | namespace generator { | ||
8 | |||
9 | class field { | ||
10 | public: | ||
11 | enum class type { | ||
12 | invalid, | ||
13 | integer, | ||
14 | string | ||
15 | }; | ||
16 | |||
17 | // Copy and move constructors | ||
18 | |||
19 | field(const field& other); | ||
20 | field(field&& other); | ||
21 | |||
22 | // Assignment | ||
23 | |||
24 | field& operator=(field other); | ||
25 | |||
26 | // Swap | ||
27 | |||
28 | friend void swap(field& first, field& second); | ||
29 | |||
30 | // Destructor | ||
31 | |||
32 | ~field(); | ||
33 | |||
34 | // Generic accessors | ||
35 | |||
36 | type getType() const | ||
37 | { | ||
38 | return type_; | ||
39 | } | ||
40 | |||
41 | std::string getName() const | ||
42 | { | ||
43 | return name_; | ||
44 | } | ||
45 | |||
46 | // Integer | ||
47 | |||
48 | field(std::string name, int arg); | ||
49 | |||
50 | int getInteger() const; | ||
51 | |||
52 | // String | ||
53 | |||
54 | field(std::string name, std::string arg); | ||
55 | |||
56 | std::string getString() const; | ||
57 | |||
58 | private: | ||
59 | |||
60 | field() | ||
61 | { | ||
62 | } | ||
63 | |||
64 | union { | ||
65 | int integer_; | ||
66 | std::string string_; | ||
67 | }; | ||
68 | |||
69 | type type_ = type::invalid; | ||
70 | std::string name_; | ||
71 | }; | ||
72 | |||
73 | }; | ||
74 | }; | ||
75 | |||
76 | #endif /* end of include guard: BINDING_H_CAE0B18E */ | ||
diff --git a/generator/form.cpp b/generator/form.cpp new file mode 100644 index 0000000..6be9d47 --- /dev/null +++ b/generator/form.cpp | |||
@@ -0,0 +1,53 @@ | |||
1 | #include "form.h" | ||
2 | #include <algorithm> | ||
3 | #include <list> | ||
4 | #include "database.h" | ||
5 | #include "field.h" | ||
6 | #include "pronunciation.h" | ||
7 | |||
8 | namespace verbly { | ||
9 | namespace generator { | ||
10 | |||
11 | int form::nextId_ = 0; | ||
12 | |||
13 | form::form(std::string text) : | ||
14 | id_(nextId_++), | ||
15 | text_(text), | ||
16 | complexity_(std::count(std::begin(text), std::end(text), ' ') + 1), | ||
17 | proper_(std::any_of(std::begin(text), std::end(text), std::isupper)) | ||
18 | { | ||
19 | } | ||
20 | |||
21 | void form::addPronunciation(const pronunciation& p) | ||
22 | { | ||
23 | pronunciations_.insert(&p); | ||
24 | } | ||
25 | |||
26 | database& operator<<(database& db, const form& arg) | ||
27 | { | ||
28 | // Serialize the form first. | ||
29 | { | ||
30 | std::list<field> fields; | ||
31 | fields.emplace_back("form_id", arg.getId()); | ||
32 | fields.emplace_back("form", arg.getText()); | ||
33 | fields.emplace_back("complexity", arg.getComplexity()); | ||
34 | fields.emplace_back("proper", arg.isProper()); | ||
35 | |||
36 | db.insertIntoTable("forms", std::move(fields)); | ||
37 | } | ||
38 | |||
39 | // Then, serialize the form/pronunciation relationship. | ||
40 | for (const pronunciation* p : arg.getPronunciations()) | ||
41 | { | ||
42 | std::list<field> fields; | ||
43 | fields.emplace_back("form_id", arg.getId()); | ||
44 | fields.emplace_back("pronunciation_id", p->getId()); | ||
45 | |||
46 | db.insertIntoTable("forms_pronunciations", std::move(fields)); | ||
47 | } | ||
48 | |||
49 | return db; | ||
50 | } | ||
51 | |||
52 | }; | ||
53 | }; | ||
diff --git a/generator/form.h b/generator/form.h new file mode 100644 index 0000000..5576035 --- /dev/null +++ b/generator/form.h | |||
@@ -0,0 +1,71 @@ | |||
1 | #ifndef FORM_H_7EFBC970 | ||
2 | #define FORM_H_7EFBC970 | ||
3 | |||
4 | #include <string> | ||
5 | #include <set> | ||
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
10 | class pronunciation; | ||
11 | class database; | ||
12 | |||
13 | class form { | ||
14 | public: | ||
15 | |||
16 | // Constructor | ||
17 | |||
18 | explicit form(std::string text); | ||
19 | |||
20 | // Mutators | ||
21 | |||
22 | void addPronunciation(const pronunciation& p); | ||
23 | |||
24 | // Accessors | ||
25 | |||
26 | int getId() const | ||
27 | { | ||
28 | return id_; | ||
29 | } | ||
30 | |||
31 | std::string getText() const | ||
32 | { | ||
33 | return text_; | ||
34 | } | ||
35 | |||
36 | int getComplexity() const | ||
37 | { | ||
38 | return complexity_; | ||
39 | } | ||
40 | |||
41 | bool isProper() const | ||
42 | { | ||
43 | return proper_; | ||
44 | } | ||
45 | |||
46 | std::set<const pronunciation*> getPronunciations() const | ||
47 | { | ||
48 | return pronunciations_; | ||
49 | } | ||
50 | |||
51 | private: | ||
52 | |||
53 | static int nextId_; | ||
54 | |||
55 | const int id_; | ||
56 | const std::string text_; | ||
57 | const int complexity_; | ||
58 | const bool proper_; | ||
59 | |||
60 | std::set<const pronunciation*> pronunciations_; | ||
61 | |||
62 | }; | ||
63 | |||
64 | // Serializer | ||
65 | |||
66 | database& operator<<(database& db, const form& arg); | ||
67 | |||
68 | }; | ||
69 | }; | ||
70 | |||
71 | #endif /* end of include guard: FORM_H_7EFBC970 */ | ||
diff --git a/generator/frame.cpp b/generator/frame.cpp new file mode 100644 index 0000000..9f0653f --- /dev/null +++ b/generator/frame.cpp | |||
@@ -0,0 +1,83 @@ | |||
1 | #include "frame.h" | ||
2 | #include "database.h" | ||
3 | #include "field.h" | ||
4 | |||
5 | namespace verbly { | ||
6 | namespace generator { | ||
7 | |||
8 | int frame::nextId_ = 0; | ||
9 | |||
10 | frame::frame() : id_(nextId_++) | ||
11 | { | ||
12 | } | ||
13 | |||
14 | void frame::push_back(part fp) | ||
15 | { | ||
16 | parts_.push_back(std::move(fp)); | ||
17 | } | ||
18 | |||
19 | database& operator<<(database& db, const frame& arg) | ||
20 | { | ||
21 | std::list<field> fields; | ||
22 | fields.emplace_back("frame_id", arg.getId()); | ||
23 | |||
24 | nlohmann::json jsonParts; | ||
25 | for (const part& p : arg) | ||
26 | { | ||
27 | nlohmann::json jsonPart; | ||
28 | jsonPart["type"] = static_cast<int>(p.getType()); | ||
29 | |||
30 | switch (p.getType()) | ||
31 | { | ||
32 | case part::type::noun_phrase: | ||
33 | { | ||
34 | jsonPart["role"] = p.getNounRole(); | ||
35 | jsonPart["selrestrs"] = p.getNounSelrestrs().toJson(); | ||
36 | jsonPart["synrestrs"] = p.getNounSynrestrs(); | ||
37 | |||
38 | break; | ||
39 | } | ||
40 | |||
41 | case part::type::preposition: | ||
42 | { | ||
43 | jsonPart["choices"] = p.getPrepositionChoices(); | ||
44 | jsonPart["literal"] = p.isPrepositionLiteral(); | ||
45 | |||
46 | break; | ||
47 | } | ||
48 | |||
49 | case part::type::literal: | ||
50 | { | ||
51 | jsonPart["value"] = p.getLiteralValue(); | ||
52 | |||
53 | break; | ||
54 | } | ||
55 | |||
56 | case part::type::verb: | ||
57 | case part::type::adjective: | ||
58 | case part::type::adverb: | ||
59 | { | ||
60 | break; | ||
61 | } | ||
62 | |||
63 | case part::type::invalid: | ||
64 | { | ||
65 | // Invalid parts should not be serialized. | ||
66 | assert(false); | ||
67 | |||
68 | break; | ||
69 | } | ||
70 | } | ||
71 | |||
72 | jsonParts.emplace_back(std::move(jsonPart)); | ||
73 | } | ||
74 | |||
75 | fields.emplace_back("data", jsonParts.dump()); | ||
76 | |||
77 | db.insertIntoTable("frames", std::move(fields)); | ||
78 | |||
79 | return db; | ||
80 | } | ||
81 | |||
82 | }; | ||
83 | }; | ||
diff --git a/generator/frame.h b/generator/frame.h new file mode 100644 index 0000000..411ce6c --- /dev/null +++ b/generator/frame.h | |||
@@ -0,0 +1,59 @@ | |||
1 | #ifndef FRAME_H_26770FF1 | ||
2 | #define FRAME_H_26770FF1 | ||
3 | |||
4 | #include <list> | ||
5 | #include "part.h" | ||
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
10 | class database; | ||
11 | |||
12 | class frame { | ||
13 | public: | ||
14 | |||
15 | // Aliases | ||
16 | |||
17 | using const_iterator = std::list<part>::const_iterator; | ||
18 | |||
19 | // Constructor | ||
20 | |||
21 | frame(); | ||
22 | |||
23 | // Mutators | ||
24 | |||
25 | void push_back(part fp); | ||
26 | |||
27 | // Accessors | ||
28 | |||
29 | int getId() const | ||
30 | { | ||
31 | return id_; | ||
32 | } | ||
33 | |||
34 | const_iterator begin() const | ||
35 | { | ||
36 | return std::begin(parts_); | ||
37 | } | ||
38 | |||
39 | const_iterator end() const | ||
40 | { | ||
41 | return std::end(parts_); | ||
42 | } | ||
43 | |||
44 | private: | ||
45 | |||
46 | static int nextId_; | ||
47 | |||
48 | const int id_; | ||
49 | |||
50 | std::list<part> parts_; | ||
51 | |||
52 | }; | ||
53 | |||
54 | database& operator<<(database& db, const frame& arg); | ||
55 | |||
56 | }; | ||
57 | }; | ||
58 | |||
59 | #endif /* end of include guard: FRAME_H_26770FF1 */ | ||
diff --git a/generator/generator.cpp b/generator/generator.cpp index 6a16467..d88cb31 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -1,2320 +1,1477 @@ | |||
1 | #include <libxml/parser.h> | 1 | #include "generator.h" |
2 | #include <cassert> | ||
3 | #include <stdexcept> | ||
2 | #include <iostream> | 4 | #include <iostream> |
5 | #include <regex> | ||
3 | #include <dirent.h> | 6 | #include <dirent.h> |
4 | #include <set> | ||
5 | #include <map> | ||
6 | #include <string> | ||
7 | #include <vector> | ||
8 | #include <fstream> | 7 | #include <fstream> |
9 | #include <sqlite3.h> | 8 | #include "enums.h" |
10 | #include <sstream> | ||
11 | #include <regex> | ||
12 | #include <list> | ||
13 | #include <algorithm> | ||
14 | #include <json.hpp> | ||
15 | #include "progress.h" | 9 | #include "progress.h" |
10 | #include "selrestr.h" | ||
11 | #include "role.h" | ||
12 | #include "part.h" | ||
13 | #include "field.h" | ||
16 | #include "../lib/util.h" | 14 | #include "../lib/util.h" |
17 | 15 | ||
18 | using json = nlohmann::json; | 16 | namespace verbly { |
19 | 17 | namespace generator { | |
20 | struct verb_t { | ||
21 | std::string infinitive; | ||
22 | std::string past_tense; | ||
23 | std::string past_participle; | ||
24 | std::string ing_form; | ||
25 | std::string s_form; | ||
26 | int id; | ||
27 | }; | ||
28 | |||
29 | struct adjective_t { | ||
30 | std::string base; | ||
31 | std::string comparative; | ||
32 | std::string superlative; | ||
33 | }; | ||
34 | |||
35 | struct noun_t { | ||
36 | std::string singular; | ||
37 | std::string plural; | ||
38 | }; | ||
39 | |||
40 | struct selrestr_t { | ||
41 | enum class type_t { | ||
42 | singleton, | ||
43 | andlogic, | ||
44 | orlogic, | ||
45 | empty | ||
46 | }; | ||
47 | type_t type; | ||
48 | std::string restriction; | ||
49 | bool pos; | ||
50 | std::list<selrestr_t> subordinates; | ||
51 | }; | ||
52 | |||
53 | struct framepart_t { | ||
54 | enum class type_t { | ||
55 | np, | ||
56 | v, | ||
57 | pp, | ||
58 | adj, | ||
59 | adv, | ||
60 | lex | ||
61 | }; | ||
62 | type_t type; | ||
63 | std::string role; | ||
64 | selrestr_t selrestrs; | ||
65 | std::set<std::string> preprestrs; | ||
66 | std::set<std::string> synrestrs; | ||
67 | std::list<std::string> choices; | ||
68 | std::string lexval; | ||
69 | }; | ||
70 | |||
71 | struct group_t { | ||
72 | std::string id; | ||
73 | std::string parent; | ||
74 | std::set<std::string> members; | ||
75 | std::map<std::string, selrestr_t> roles; | ||
76 | std::list<std::list<framepart_t>> frames; | ||
77 | }; | ||
78 | |||
79 | struct pronunciation_t { | ||
80 | std::string phonemes; | ||
81 | std::string prerhyme; | ||
82 | std::string rhyme; | ||
83 | int syllables = 0; | ||
84 | std::string stress; | ||
85 | |||
86 | bool operator<(const pronunciation_t& other) const | ||
87 | { | ||
88 | return phonemes < other.phonemes; | ||
89 | } | ||
90 | }; | ||
91 | |||
92 | std::map<std::string, group_t> groups; | ||
93 | std::map<std::string, verb_t> verbs; | ||
94 | std::map<std::string, adjective_t> adjectives; | ||
95 | std::map<std::string, noun_t> nouns; | ||
96 | std::map<int, std::map<int, int>> wn; | ||
97 | std::map<int, int> images; | ||
98 | std::map<std::string, std::set<pronunciation_t>> pronunciations; | ||
99 | |||
100 | void print_usage() | ||
101 | { | ||
102 | std::cout << "Verbly Datafile Generator" << std::endl; | ||
103 | std::cout << "-------------------------" << std::endl; | ||
104 | std::cout << "Requires exactly six arguments." << std::endl; | ||
105 | std::cout << "1. The path to a VerbNet data directory." << std::endl; | ||
106 | std::cout << "2. The path to an AGID infl.txt file." << std::endl; | ||
107 | std::cout << "3. The path to a WordNet prolog data directory." << std::endl; | ||
108 | std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl; | ||
109 | std::cout << "5. The path to an ImageNet urls.txt file." << std::endl; | ||
110 | std::cout << "6. Datafile output path." << std::endl; | ||
111 | |||
112 | exit(1); | ||
113 | } | ||
114 | |||
115 | void db_error(sqlite3* ppdb, std::string query) | ||
116 | { | ||
117 | std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl; | ||
118 | std::cout << query << std::endl; | ||
119 | sqlite3_close_v2(ppdb); | ||
120 | print_usage(); | ||
121 | } | ||
122 | |||
123 | json export_selrestrs(selrestr_t r) | ||
124 | { | ||
125 | if (r.type == selrestr_t::type_t::empty) | ||
126 | { | ||
127 | return {}; | ||
128 | } else if (r.type == selrestr_t::type_t::singleton) | ||
129 | { | ||
130 | json result; | ||
131 | result["type"] = r.restriction; | ||
132 | result["pos"] = r.pos; | ||
133 | return result; | ||
134 | } else { | ||
135 | json result; | ||
136 | if (r.type == selrestr_t::type_t::andlogic) | ||
137 | { | ||
138 | result["logic"] = "and"; | ||
139 | } else { | ||
140 | result["logic"] = "or"; | ||
141 | } | ||
142 | |||
143 | std::list<json> outlist; | ||
144 | std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs); | ||
145 | result["children"] = outlist; | ||
146 | 18 | ||
147 | return result; | 19 | generator::generator( |
148 | } | 20 | std::string verbNetPath, |
149 | } | 21 | std::string agidPath, |
150 | 22 | std::string wordNetPath, | |
151 | selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) | 23 | std::string cmudictPath, |
152 | { | 24 | std::string imageNetPath, |
153 | selrestr_t r; | 25 | std::string outputPath) : |
154 | xmlChar* key; | 26 | verbNetPath_(verbNetPath), |
155 | 27 | agidPath_(agidPath), | |
156 | if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) | 28 | wordNetPath_(wordNetPath), |
157 | { | 29 | cmudictPath_(cmudictPath), |
158 | if (xmlChildElementCount(top) == 0) | 30 | imageNetPath_(imageNetPath), |
31 | db_(outputPath) | ||
159 | { | 32 | { |
160 | r.type = selrestr_t::type_t::empty; | 33 | // Ensure VerbNet directory exists |
161 | } else if (xmlChildElementCount(top) == 1) | 34 | DIR* dir; |
162 | { | 35 | if ((dir = opendir(verbNetPath_.c_str())) == nullptr) |
163 | r = parse_selrestrs(xmlFirstElementChild(top), filename); | ||
164 | } else { | ||
165 | r.type = selrestr_t::type_t::andlogic; | ||
166 | |||
167 | if (xmlHasProp(top, (const xmlChar*) "logic")) | ||
168 | { | 36 | { |
169 | key = xmlGetProp(top, (const xmlChar*) "logic"); | 37 | throw std::invalid_argument("Invalid VerbNet data directory"); |
170 | if (!xmlStrcmp(key, (const xmlChar*) "or")) | ||
171 | { | ||
172 | r.type = selrestr_t::type_t::orlogic; | ||
173 | } | ||
174 | xmlFree(key); | ||
175 | } | 38 | } |
176 | 39 | ||
177 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | 40 | closedir(dir); |
41 | |||
42 | // Ensure AGID infl.txt exists | ||
43 | if (!std::ifstream(agidPath_)) | ||
178 | { | 44 | { |
179 | if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) | 45 | throw std::invalid_argument("AGID infl.txt file not found"); |
180 | { | ||
181 | r.subordinates.push_back(parse_selrestrs(selrestr, filename)); | ||
182 | } | ||
183 | } | 46 | } |
184 | } | 47 | |
185 | } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) | 48 | // Add directory separator to WordNet path |
186 | { | 49 | if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) |
187 | r.type = selrestr_t::type_t::singleton; | ||
188 | |||
189 | key = xmlGetProp(top, (xmlChar*) "Value"); | ||
190 | r.pos = (std::string((const char*)key) == "+"); | ||
191 | xmlFree(key); | ||
192 | |||
193 | key = xmlGetProp(top, (xmlChar*) "type"); | ||
194 | r.restriction = (const char*) key; | ||
195 | xmlFree(key); | ||
196 | } else { | ||
197 | // Invalid | ||
198 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
199 | print_usage(); | ||
200 | } | ||
201 | |||
202 | return r; | ||
203 | } | ||
204 | |||
205 | group_t& parse_group(xmlNodePtr top, std::string filename) | ||
206 | { | ||
207 | xmlChar* key = xmlGetProp(top, (xmlChar*) "ID"); | ||
208 | if (key == 0) | ||
209 | { | ||
210 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
211 | print_usage(); | ||
212 | } | ||
213 | std::string vnid = (const char*)key; | ||
214 | vnid = vnid.substr(vnid.find_first_of("-")+1); | ||
215 | xmlFree(key); | ||
216 | |||
217 | group_t g; | ||
218 | g.id = vnid; | ||
219 | |||
220 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
221 | { | ||
222 | if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES")) | ||
223 | { | ||
224 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) | ||
225 | { | 50 | { |
226 | if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) | 51 | wordNetPath_ += '/'; |
227 | { | ||
228 | auto& sg = parse_group(subclass, filename); | ||
229 | sg.parent = vnid; | ||
230 | |||
231 | for (auto member : sg.members) | ||
232 | { | ||
233 | g.members.insert(member); | ||
234 | } | ||
235 | |||
236 | // The schema requires that subclasses appear after role definitions, so we can do this now | ||
237 | for (auto role : g.roles) | ||
238 | { | ||
239 | if (sg.roles.count(role.first) == 0) | ||
240 | { | ||
241 | sg.roles[role.first] = role.second; | ||
242 | } | ||
243 | } | ||
244 | } | ||
245 | } | 52 | } |
246 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) | 53 | |
247 | { | 54 | // Ensure WordNet tables exist |
248 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) | 55 | for (std::string table : { |
56 | "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax" | ||
57 | }) | ||
249 | { | 58 | { |
250 | if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) | 59 | if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) |
251 | { | 60 | { |
252 | key = xmlGetProp(member, (xmlChar*) "name"); | 61 | throw std::invalid_argument("WordNet " + table + " table not found"); |
253 | g.members.insert((const char*)key); | ||
254 | xmlFree(key); | ||
255 | } | 62 | } |
256 | } | 63 | } |
257 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) | 64 | |
258 | { | 65 | // Ensure CMUDICT file exists |
259 | for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) | 66 | if (!std::ifstream(cmudictPath_)) |
260 | { | 67 | { |
261 | if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) | 68 | throw std::invalid_argument("CMUDICT file not found"); |
262 | { | ||
263 | selrestr_t r; | ||
264 | r.type = selrestr_t::type_t::empty; | ||
265 | |||
266 | key = xmlGetProp(role, (const xmlChar*) "type"); | ||
267 | std::string type = (const char*)key; | ||
268 | xmlFree(key); | ||
269 | |||
270 | for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
271 | { | ||
272 | if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS")) | ||
273 | { | ||
274 | r = parse_selrestrs(rolenode, filename); | ||
275 | } | ||
276 | } | ||
277 | |||
278 | g.roles[type] = r; | ||
279 | } | ||
280 | } | 69 | } |
281 | } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) | 70 | |
282 | { | 71 | // Ensure ImageNet urls.txt exists |
283 | for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) | 72 | if (!std::ifstream(imageNetPath_)) |
284 | { | 73 | { |
285 | if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) | 74 | throw std::invalid_argument("ImageNet urls.txt file not found"); |
286 | { | ||
287 | std::list<framepart_t> f; | ||
288 | |||
289 | for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) | ||
290 | { | ||
291 | if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX")) | ||
292 | { | ||
293 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
294 | { | ||
295 | framepart_t fp; | ||
296 | |||
297 | if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP")) | ||
298 | { | ||
299 | fp.type = framepart_t::type_t::np; | ||
300 | |||
301 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
302 | fp.role = (const char*)key; | ||
303 | xmlFree(key); | ||
304 | |||
305 | fp.selrestrs.type = selrestr_t::type_t::empty; | ||
306 | |||
307 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
308 | { | ||
309 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS")) | ||
310 | { | ||
311 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
312 | { | ||
313 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR")) | ||
314 | { | ||
315 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
316 | fp.synrestrs.insert(std::string((const char*)key)); | ||
317 | xmlFree(key); | ||
318 | } | ||
319 | } | ||
320 | } | ||
321 | |||
322 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
323 | { | ||
324 | fp.selrestrs = parse_selrestrs(npnode, filename); | ||
325 | } | ||
326 | } | ||
327 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB")) | ||
328 | { | ||
329 | fp.type = framepart_t::type_t::v; | ||
330 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP")) | ||
331 | { | ||
332 | fp.type = framepart_t::type_t::pp; | ||
333 | |||
334 | if (xmlHasProp(syntaxnode, (xmlChar*) "value")) | ||
335 | { | ||
336 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
337 | std::string choices = (const char*)key; | ||
338 | xmlFree(key); | ||
339 | |||
340 | fp.choices = verbly::split<std::list<std::string>>(choices, " "); | ||
341 | } | ||
342 | |||
343 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
344 | { | ||
345 | if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS")) | ||
346 | { | ||
347 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
348 | { | ||
349 | if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR")) | ||
350 | { | ||
351 | key = xmlGetProp(synrestr, (xmlChar*) "type"); | ||
352 | fp.preprestrs.insert(std::string((const char*)key)); | ||
353 | xmlFree(key); | ||
354 | } | ||
355 | } | ||
356 | } | ||
357 | } | ||
358 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ")) | ||
359 | { | ||
360 | fp.type = framepart_t::type_t::adj; | ||
361 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV")) | ||
362 | { | ||
363 | fp.type = framepart_t::type_t::adv; | ||
364 | } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX")) | ||
365 | { | ||
366 | fp.type = framepart_t::type_t::lex; | ||
367 | |||
368 | key = xmlGetProp(syntaxnode, (xmlChar*) "value"); | ||
369 | fp.lexval = (const char*)key; | ||
370 | xmlFree(key); | ||
371 | } else { | ||
372 | continue; | ||
373 | } | ||
374 | |||
375 | f.push_back(fp); | ||
376 | } | ||
377 | |||
378 | g.frames.push_back(f); | ||
379 | } | ||
380 | } | ||
381 | } | ||
382 | } | 75 | } |
383 | } | 76 | } |
384 | } | ||
385 | |||
386 | groups[vnid] = g; | ||
387 | |||
388 | return groups[vnid]; | ||
389 | } | ||
390 | |||
391 | int main(int argc, char** argv) | ||
392 | { | ||
393 | if (argc != 7) | ||
394 | { | ||
395 | print_usage(); | ||
396 | } | ||
397 | |||
398 | // VerbNet data | ||
399 | std::cout << "Reading verb frames..." << std::endl; | ||
400 | |||
401 | DIR* dir; | ||
402 | if ((dir = opendir(argv[1])) == nullptr) | ||
403 | { | ||
404 | std::cout << "Invalid VerbNet data directory." << std::endl; | ||
405 | |||
406 | print_usage(); | ||
407 | } | ||
408 | |||
409 | struct dirent* ent; | ||
410 | while ((ent = readdir(dir)) != nullptr) | ||
411 | { | ||
412 | std::string filename(argv[1]); | ||
413 | if (filename.back() != '/') | ||
414 | { | ||
415 | filename += '/'; | ||
416 | } | ||
417 | 77 | ||
418 | filename += ent->d_name; | 78 | void generator::run() |
419 | //std::cout << ent->d_name << std::endl; | ||
420 | |||
421 | if (filename.rfind(".xml") != filename.size() - 4) | ||
422 | { | ||
423 | continue; | ||
424 | } | ||
425 | |||
426 | xmlDocPtr doc = xmlParseFile(filename.c_str()); | ||
427 | if (doc == nullptr) | ||
428 | { | ||
429 | std::cout << "Error opening " << filename << std::endl; | ||
430 | print_usage(); | ||
431 | } | ||
432 | |||
433 | xmlNodePtr top = xmlDocGetRootElement(doc); | ||
434 | if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS"))) | ||
435 | { | ||
436 | std::cout << "Bad VerbNet file format: " << filename << std::endl; | ||
437 | print_usage(); | ||
438 | } | ||
439 | |||
440 | parse_group(top, filename); | ||
441 | } | ||
442 | |||
443 | closedir(dir); | ||
444 | |||
445 | // Get verbs from AGID | ||
446 | std::cout << "Reading inflections..." << std::endl; | ||
447 | |||
448 | std::ifstream agidfile(argv[2]); | ||
449 | if (!agidfile.is_open()) | ||
450 | { | ||
451 | std::cout << "Could not open AGID file: " << argv[2] << std::endl; | ||
452 | print_usage(); | ||
453 | } | ||
454 | |||
455 | for (;;) | ||
456 | { | ||
457 | std::string line; | ||
458 | if (!getline(agidfile, line)) | ||
459 | { | ||
460 | break; | ||
461 | } | ||
462 | |||
463 | if (line.back() == '\r') | ||
464 | { | 79 | { |
465 | line.pop_back(); | 80 | // Create notions, words, lemmas, and forms from WordNet synsets |
466 | } | 81 | readWordNetSynsets(); |
467 | 82 | ||
468 | int divider = line.find_first_of(" "); | 83 | // Reads adjective positioning WordNet data |
469 | std::string word = line.substr(0, divider); | 84 | readAdjectivePositioning(); |
470 | line = line.substr(divider+1); | 85 | |
471 | char type = line[0]; | 86 | // Counts the number of URLs ImageNet has per notion |
472 | 87 | readImageNetUrls(); | |
473 | if (line[1] == '?') | 88 | |
474 | { | 89 | // Creates a word by WordNet sense key lookup table |
475 | line.erase(0, 4); | 90 | readWordNetSenseKeys(); |
476 | } else { | 91 | |
477 | line.erase(0, 3); | 92 | // Creates groups and frames from VerbNet data |
478 | } | 93 | readVerbNet(); |
479 | 94 | ||
480 | std::vector<std::string> forms; | 95 | // Creates forms and inflections from AGID. To reduce the amount of forms |
481 | while (!line.empty()) | 96 | // created, we do this after most lemmas that need inflecting have been |
482 | { | 97 | // created through other means, and then only generate forms for |
483 | std::string inflection; | 98 | // inflections of already-existing lemmas. The exception to this regards |
484 | if ((divider = line.find(" | ")) != std::string::npos) | 99 | // verb lemmas. If a verb lemma in AGID either does not exist yet, or does |
485 | { | 100 | // exist but is not related to any words that are related to verb notions, |
486 | inflection = line.substr(0, divider); | 101 | // then a notion and a word is generated and the form generation proceeds |
487 | line = line.substr(divider + 3); | 102 | // as usual. |
488 | } else { | 103 | readAgidInflections(); |
489 | inflection = line; | 104 | |
490 | line = ""; | 105 | // Reads in prepositions and the is_a relationship |
491 | } | 106 | readPrepositions(); |
492 | 107 | ||
493 | if ((divider = inflection.find_first_of(",?")) != std::string::npos) | 108 | // Creates pronunciations from CMUDICT. To reduce the amount of |
494 | { | 109 | // pronunciations created, we do this after all forms have been created, |
495 | inflection = inflection.substr(0, divider); | 110 | // and then only generate pronunciations for already-exisiting forms. |
496 | } | 111 | readCmudictPronunciations(); |
497 | 112 | ||
498 | forms.push_back(inflection); | 113 | // Writes the database schema |
114 | writeSchema(); | ||
115 | |||
116 | // Dumps data to the database | ||
117 | dumpObjects(); | ||
118 | |||
119 | // Populates the antonymy relationship from WordNet | ||
120 | readWordNetAntonymy(); | ||
121 | |||
122 | // Populates the variation relationship from WordNet | ||
123 | readWordNetVariation(); | ||
124 | |||
125 | // Populates the usage, topicality, and regionality relationships from | ||
126 | // WordNet | ||
127 | readWordNetClasses(); | ||
128 | |||
129 | // Populates the causality relationship from WordNet | ||
130 | readWordNetCausality(); | ||
131 | |||
132 | // Populates the entailment relationship from WordNet | ||
133 | readWordNetEntailment(); | ||
134 | |||
135 | // Populates the hypernymy relationship from WordNet | ||
136 | readWordNetHypernymy(); | ||
137 | |||
138 | // Populates the instantiation relationship from WordNet | ||
139 | readWordNetInstantiation(); | ||
140 | |||
141 | // Populates the member meronymy relationship from WordNet | ||
142 | readWordNetMemberMeronymy(); | ||
143 | |||
144 | // Populates the part meronymy relationship from WordNet | ||
145 | readWordNetPartMeronymy(); | ||
146 | |||
147 | // Populates the substance meronymy relationship from WordNet | ||
148 | readWordNetSubstanceMeronymy(); | ||
149 | |||
150 | // Populates the pertainymy and mannernymy relationships from WordNet | ||
151 | readWordNetPertainymy(); | ||
152 | |||
153 | // Populates the specification relationship from WordNet | ||
154 | readWordNetSpecification(); | ||
155 | |||
156 | // Populates the adjective similarity relationship from WordNet | ||
157 | readWordNetSimilarity(); | ||
158 | |||
159 | |||
160 | |||
161 | |||
162 | |||
163 | |||
164 | |||
165 | |||
499 | } | 166 | } |
500 | 167 | ||
501 | switch (type) | 168 | void generator::readWordNetSynsets() |
502 | { | 169 | { |
503 | case 'V': | 170 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); |
171 | progress ppgs("Reading synsets from WordNet...", lines.size()); | ||
172 | |||
173 | for (std::string line : lines) | ||
504 | { | 174 | { |
505 | verb_t v; | 175 | ppgs.update(); |
506 | v.infinitive = word; | 176 | |
507 | if (forms.size() == 4) | 177 | std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); |
508 | { | 178 | std::smatch relation_data; |
509 | v.past_tense = forms[0]; | 179 | if (!std::regex_search(line, relation_data, relation)) |
510 | v.past_participle = forms[1]; | 180 | { |
511 | v.ing_form = forms[2]; | 181 | continue; |
512 | v.s_form = forms[3]; | ||
513 | } else if (forms.size() == 3) | ||
514 | { | ||
515 | v.past_tense = forms[0]; | ||
516 | v.past_participle = forms[0]; | ||
517 | v.ing_form = forms[1]; | ||
518 | v.s_form = forms[2]; | ||
519 | } else if (forms.size() == 8) | ||
520 | { | ||
521 | // As of AGID 2014.08.11, this is only "to be" | ||
522 | v.past_tense = forms[0]; | ||
523 | v.past_participle = forms[2]; | ||
524 | v.ing_form = forms[3]; | ||
525 | v.s_form = forms[4]; | ||
526 | } else { | ||
527 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
528 | // - may and shall do not conjugate the way we want them to | ||
529 | // - methinks only has a past tense and is an outlier | ||
530 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
531 | std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
532 | } | 182 | } |
533 | 183 | ||
534 | verbs[word] = v; | 184 | int synset_id = std::stoi(relation_data[1]); |
535 | 185 | int wnum = std::stoi(relation_data[2]); | |
536 | break; | 186 | std::string text = relation_data[3]; |
537 | } | 187 | int tag_count = std::stoi(relation_data[4]); |
538 | 188 | size_t word_it; | |
539 | case 'A': | 189 | while ((word_it = text.find("''")) != std::string::npos) |
540 | { | ||
541 | adjective_t adj; | ||
542 | adj.base = word; | ||
543 | if (forms.size() == 2) | ||
544 | { | 190 | { |
545 | adj.comparative = forms[0]; | 191 | text.erase(word_it, 1); |
546 | adj.superlative = forms[1]; | ||
547 | } else { | ||
548 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | ||
549 | std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl; | ||
550 | } | 192 | } |
551 | 193 | ||
552 | adjectives[word] = adj; | 194 | // The WordNet data does contain duplicates, so we need to check that we |
553 | 195 | // haven't already created this word. | |
554 | break; | 196 | std::pair<int, int> lookup(synset_id, wnum); |
555 | } | 197 | if (!wordByWnidAndWnum_.count(lookup)) |
556 | |||
557 | case 'N': | ||
558 | { | ||
559 | noun_t n; | ||
560 | n.singular = word; | ||
561 | if (forms.size() == 1) | ||
562 | { | 198 | { |
563 | n.plural = forms[0]; | 199 | notion& synset = lookupOrCreateNotion(synset_id); |
564 | } else { | 200 | lemma& lex = lookupOrCreateLemma(text); |
565 | // As of AGID 2014.08.11, this is non-existent. | 201 | word& entry = createWord(synset, lex, tag_count); |
566 | std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; | 202 | |
203 | wordByWnidAndWnum_[lookup] = &entry; | ||
567 | } | 204 | } |
568 | |||
569 | nouns[word] = n; | ||
570 | |||
571 | break; | ||
572 | } | 205 | } |
573 | } | 206 | } |
574 | } | ||
575 | |||
576 | // Pronounciations | ||
577 | std::cout << "Reading pronunciations..." << std::endl; | ||
578 | |||
579 | std::ifstream pronfile(argv[4]); | ||
580 | if (!pronfile.is_open()) | ||
581 | { | ||
582 | std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl; | ||
583 | print_usage(); | ||
584 | } | ||
585 | |||
586 | for (;;) | ||
587 | { | ||
588 | std::string line; | ||
589 | if (!getline(pronfile, line)) | ||
590 | { | ||
591 | break; | ||
592 | } | ||
593 | |||
594 | if (line.back() == '\r') | ||
595 | { | ||
596 | line.pop_back(); | ||
597 | } | ||
598 | 207 | ||
599 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | 208 | void generator::readAdjectivePositioning() |
600 | std::smatch phoneme_data; | ||
601 | if (std::regex_search(line, phoneme_data, phoneme)) | ||
602 | { | 209 | { |
603 | std::string canonical(phoneme_data[1]); | 210 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl")); |
604 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | 211 | progress ppgs("Reading adjective positionings from WordNet...", lines.size()); |
605 | |||
606 | std::string phonemes = phoneme_data[2]; | ||
607 | auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " "); | ||
608 | auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) { | ||
609 | return phoneme.find("1") != std::string::npos; | ||
610 | }); | ||
611 | 212 | ||
612 | pronunciation_t p; | 213 | for (std::string line : lines) |
613 | p.phonemes = phonemes; | ||
614 | |||
615 | // Rhyme detection | ||
616 | if (phemstrt != std::end(phoneme_set)) | ||
617 | { | 214 | { |
618 | std::stringstream rhymer; | 215 | ppgs.update(); |
619 | for (auto it = phemstrt; it != std::end(phoneme_set); it++) | ||
620 | { | ||
621 | std::string naked; | ||
622 | std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) { | ||
623 | return isdigit(ch); | ||
624 | }); | ||
625 | |||
626 | if (it != phemstrt) | ||
627 | { | ||
628 | rhymer << " "; | ||
629 | } | ||
630 | |||
631 | rhymer << naked; | ||
632 | } | ||
633 | 216 | ||
634 | p.rhyme = rhymer.str(); | 217 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); |
635 | 218 | std::smatch relation_data; | |
636 | if (phemstrt != std::begin(phoneme_set)) | 219 | if (!std::regex_search(line, relation_data, relation)) |
637 | { | 220 | { |
638 | phemstrt--; | 221 | continue; |
639 | p.prerhyme = *phemstrt; | ||
640 | } else { | ||
641 | p.prerhyme = ""; | ||
642 | } | 222 | } |
643 | } else { | ||
644 | p.prerhyme = ""; | ||
645 | p.rhyme = ""; | ||
646 | } | ||
647 | 223 | ||
648 | // Syllable/stress | 224 | int synset_id = stoi(relation_data[1]); |
649 | for (auto phm : phoneme_set) | 225 | int wnum = stoi(relation_data[2]); |
650 | { | 226 | std::string adjpos_str = relation_data[3]; |
651 | if (isdigit(phm.back())) | ||
652 | { | ||
653 | // It's a vowel! | ||
654 | p.syllables++; | ||
655 | 227 | ||
656 | if (phm.back() == '1') | 228 | std::pair<int, int> lookup(synset_id, wnum); |
229 | if (wordByWnidAndWnum_.count(lookup)) | ||
230 | { | ||
231 | word& adj = *wordByWnidAndWnum_.at(lookup); | ||
232 | |||
233 | if (adjpos_str == "p") | ||
234 | { | ||
235 | adj.setAdjectivePosition(positioning::predicate); | ||
236 | } else if (adjpos_str == "a") | ||
237 | { | ||
238 | adj.setAdjectivePosition(positioning::attributive); | ||
239 | } else if (adjpos_str == "i") | ||
657 | { | 240 | { |
658 | p.stress.push_back('1'); | 241 | adj.setAdjectivePosition(positioning::postnominal); |
659 | } else { | 242 | } else { |
660 | p.stress.push_back('0'); | 243 | // Can't happen because of how we specified the regex. |
244 | assert(false); | ||
661 | } | 245 | } |
662 | } | 246 | } |
663 | } | 247 | } |
664 | |||
665 | pronunciations[canonical].insert(p); | ||
666 | } | ||
667 | } | ||
668 | |||
669 | // Images | ||
670 | std::cout << "Reading images..." << std::endl; | ||
671 | |||
672 | std::ifstream imagefile(argv[5]); | ||
673 | if (!imagefile.is_open()) | ||
674 | { | ||
675 | std::cout << "Could not open ImageNet file: " << argv[5] << std::endl; | ||
676 | print_usage(); | ||
677 | } | ||
678 | |||
679 | for (;;) | ||
680 | { | ||
681 | std::string line; | ||
682 | if (!getline(imagefile, line)) | ||
683 | { | ||
684 | break; | ||
685 | } | ||
686 | |||
687 | if (line.back() == '\r') | ||
688 | { | ||
689 | line.pop_back(); | ||
690 | } | ||
691 | |||
692 | std::string wnid_s = line.substr(1, 8); | ||
693 | int wnid = stoi(wnid_s) + 100000000; | ||
694 | images[wnid]++; | ||
695 | } | ||
696 | |||
697 | imagefile.close(); | ||
698 | |||
699 | // Start writing output | ||
700 | std::cout << "Writing schema..." << std::endl; | ||
701 | |||
702 | sqlite3* ppdb; | ||
703 | if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK) | ||
704 | { | ||
705 | std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl; | ||
706 | print_usage(); | ||
707 | } | ||
708 | |||
709 | std::ifstream schemafile("schema.sql"); | ||
710 | if (!schemafile.is_open()) | ||
711 | { | ||
712 | std::cout << "Could not find schema file" << std::endl; | ||
713 | print_usage(); | ||
714 | } | ||
715 | |||
716 | std::stringstream schemabuilder; | ||
717 | for (;;) | ||
718 | { | ||
719 | std::string line; | ||
720 | if (!getline(schemafile, line)) | ||
721 | { | ||
722 | break; | ||
723 | } | ||
724 | |||
725 | if (line.back() == '\r') | ||
726 | { | ||
727 | line.pop_back(); | ||
728 | } | ||
729 | |||
730 | schemabuilder << line << std::endl; | ||
731 | } | ||
732 | |||
733 | std::string schema = schemabuilder.str(); | ||
734 | while (!schema.empty()) | ||
735 | { | ||
736 | std::string query; | ||
737 | int divider = schema.find(";"); | ||
738 | if (divider != std::string::npos) | ||
739 | { | ||
740 | query = schema.substr(0, divider+1); | ||
741 | schema = schema.substr(divider+2); | ||
742 | } else { | ||
743 | break; | ||
744 | } | 248 | } |
745 | 249 | ||
746 | sqlite3_stmt* schmstmt; | 250 | void generator::readImageNetUrls() |
747 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK) | ||
748 | { | 251 | { |
749 | db_error(ppdb, query); | 252 | // The ImageNet datafile is so large that it is unreasonable and |
750 | } | 253 | // unnecessary to read it into memory; instead, we will parse each line as |
751 | 254 | // we read it. This has the caveat that we cannot display a progress bar. | |
752 | if (sqlite3_step(schmstmt) != SQLITE_DONE) | 255 | std::cout << "Reading image counts from ImageNet..." << std::endl; |
753 | { | ||
754 | db_error(ppdb, query); | ||
755 | } | ||
756 | |||
757 | sqlite3_finalize(schmstmt); | ||
758 | } | ||
759 | |||
760 | std::cout << "Writing prepositions..." << std::endl; | ||
761 | std::ifstream prepfile("prepositions.txt"); | ||
762 | if (!prepfile.is_open()) | ||
763 | { | ||
764 | std::cout << "Could not find prepositions file" << std::endl; | ||
765 | print_usage(); | ||
766 | } | ||
767 | |||
768 | for (;;) | ||
769 | { | ||
770 | std::string line; | ||
771 | if (!getline(prepfile, line)) | ||
772 | { | ||
773 | break; | ||
774 | } | ||
775 | |||
776 | if (line.back() == '\r') | ||
777 | { | ||
778 | line.pop_back(); | ||
779 | } | ||
780 | |||
781 | std::regex relation("^([^:]+): (.+)"); | ||
782 | std::smatch relation_data; | ||
783 | std::regex_search(line, relation_data, relation); | ||
784 | std::string prep = relation_data[1]; | ||
785 | std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", "); | ||
786 | |||
787 | std::string query("INSERT INTO prepositions (form) VALUES (?)"); | ||
788 | sqlite3_stmt* ppstmt; | ||
789 | |||
790 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
791 | { | ||
792 | db_error(ppdb, query); | ||
793 | } | ||
794 | |||
795 | sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT); | ||
796 | |||
797 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
798 | { | ||
799 | db_error(ppdb, query); | ||
800 | } | ||
801 | |||
802 | sqlite3_finalize(ppstmt); | ||
803 | |||
804 | query = "SELECT last_insert_rowid()"; | ||
805 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
806 | { | ||
807 | db_error(ppdb, query); | ||
808 | } | ||
809 | |||
810 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
811 | { | ||
812 | db_error(ppdb, query); | ||
813 | } | ||
814 | |||
815 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
816 | sqlite3_finalize(ppstmt); | ||
817 | |||
818 | for (auto group : groups) | ||
819 | { | ||
820 | query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)"; | ||
821 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
822 | { | ||
823 | db_error(ppdb, query); | ||
824 | } | ||
825 | 256 | ||
826 | sqlite3_bind_int(ppstmt, 1, rowid); | 257 | std::ifstream file(imageNetPath_); |
827 | sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); | 258 | if (!file) |
828 | |||
829 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
830 | { | 259 | { |
831 | db_error(ppdb, query); | 260 | throw std::invalid_argument("Could not find file " + imageNetPath_); |
832 | } | 261 | } |
833 | |||
834 | sqlite3_finalize(ppstmt); | ||
835 | } | ||
836 | } | ||
837 | |||
838 | 262 | ||
839 | { | 263 | std::string line; |
840 | progress ppgs("Writing verbs...", verbs.size()); | 264 | while (std::getline(file, line)) |
841 | for (auto& mapping : verbs) | ||
842 | { | ||
843 | sqlite3_stmt* ppstmt; | ||
844 | std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)"); | ||
845 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
846 | { | ||
847 | db_error(ppdb, query); | ||
848 | } | ||
849 | |||
850 | sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT); | ||
851 | sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT); | ||
852 | sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT); | ||
853 | sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT); | ||
854 | sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT); | ||
855 | |||
856 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
857 | { | ||
858 | db_error(ppdb, query); | ||
859 | } | ||
860 | |||
861 | sqlite3_finalize(ppstmt); | ||
862 | |||
863 | std::string canonical(mapping.second.infinitive); | ||
864 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
865 | if (pronunciations.count(canonical) == 1) | ||
866 | { | 265 | { |
867 | query = "SELECT last_insert_rowid()"; | 266 | if (line.back() == '\r') |
868 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
869 | { | 267 | { |
870 | db_error(ppdb, query); | 268 | line.pop_back(); |
871 | } | 269 | } |
872 | 270 | ||
873 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | 271 | std::string wnid_s = line.substr(1, 8); |
272 | int wnid = stoi(wnid_s) + 100000000; | ||
273 | if (notionByWnid_.count(wnid)) | ||
874 | { | 274 | { |
875 | db_error(ppdb, query); | 275 | // We know that this notion has a wnid and is a noun. |
876 | } | 276 | notionByWnid_.at(wnid)->incrementNumOfImages(); |
877 | |||
878 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
879 | |||
880 | sqlite3_finalize(ppstmt); | ||
881 | |||
882 | mapping.second.id = rowid; | ||
883 | |||
884 | for (auto pronunciation : pronunciations[canonical]) | ||
885 | { | ||
886 | if (!pronunciation.rhyme.empty()) | ||
887 | { | ||
888 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | ||
889 | } else { | ||
890 | query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | ||
891 | } | ||
892 | |||
893 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
894 | { | ||
895 | db_error(ppdb, query); | ||
896 | } | ||
897 | |||
898 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
899 | sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); | ||
900 | sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); | ||
901 | sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); | ||
902 | |||
903 | if (!pronunciation.rhyme.empty()) | ||
904 | { | ||
905 | sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); | ||
906 | sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); | ||
907 | } | ||
908 | |||
909 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
910 | { | ||
911 | db_error(ppdb, query); | ||
912 | } | ||
913 | |||
914 | sqlite3_finalize(ppstmt); | ||
915 | } | 277 | } |
916 | } | 278 | } |
917 | |||
918 | ppgs.update(); | ||
919 | } | 279 | } |
920 | } | 280 | |
921 | 281 | void generator::readWordNetSenseKeys() | |
922 | { | ||
923 | progress ppgs("Writing verb frames...", groups.size()); | ||
924 | for (auto& mapping : groups) | ||
925 | { | 282 | { |
926 | std::list<json> roledatal; | 283 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl")); |
927 | std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { | 284 | progress ppgs("Reading sense keys from WordNet...", lines.size()); |
928 | json role; | ||
929 | role["type"] = r.first; | ||
930 | role["selrestrs"] = export_selrestrs(r.second); | ||
931 | |||
932 | return role; | ||
933 | }); | ||
934 | |||
935 | json roledata(roledatal); | ||
936 | std::string rdm = roledata.dump(); | ||
937 | |||
938 | sqlite3_stmt* ppstmt; | ||
939 | std::string query("INSERT INTO groups (data) VALUES (?)"); | ||
940 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
941 | { | ||
942 | db_error(ppdb, query); | ||
943 | } | ||
944 | |||
945 | sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT); | ||
946 | |||
947 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
948 | { | ||
949 | db_error(ppdb, query); | ||
950 | } | ||
951 | 285 | ||
952 | sqlite3_finalize(ppstmt); | 286 | for (std::string line : lines) |
953 | |||
954 | query = "SELECT last_insert_rowid()"; | ||
955 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
956 | { | ||
957 | db_error(ppdb, query); | ||
958 | } | ||
959 | |||
960 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
961 | { | ||
962 | db_error(ppdb, query); | ||
963 | } | ||
964 | |||
965 | int gid = sqlite3_column_int(ppstmt, 0); | ||
966 | sqlite3_finalize(ppstmt); | ||
967 | |||
968 | for (auto frame : mapping.second.frames) | ||
969 | { | 287 | { |
970 | std::list<json> fdatap; | 288 | ppgs.update(); |
971 | std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) { | ||
972 | json part; | ||
973 | |||
974 | switch (fp.type) | ||
975 | { | ||
976 | case framepart_t::type_t::np: | ||
977 | { | ||
978 | part["type"] = "np"; | ||
979 | part["role"] = fp.role; | ||
980 | part["selrestrs"] = export_selrestrs(fp.selrestrs); | ||
981 | part["synrestrs"] = fp.synrestrs; | ||
982 | |||
983 | break; | ||
984 | } | ||
985 | |||
986 | case framepart_t::type_t::pp: | ||
987 | { | ||
988 | part["type"] = "pp"; | ||
989 | part["values"] = fp.choices; | ||
990 | part["preprestrs"] = fp.preprestrs; | ||
991 | |||
992 | break; | ||
993 | } | ||
994 | |||
995 | case framepart_t::type_t::v: | ||
996 | { | ||
997 | part["type"] = "v"; | ||
998 | |||
999 | break; | ||
1000 | } | ||
1001 | |||
1002 | case framepart_t::type_t::adj: | ||
1003 | { | ||
1004 | part["type"] = "adj"; | ||
1005 | |||
1006 | break; | ||
1007 | } | ||
1008 | |||
1009 | case framepart_t::type_t::adv: | ||
1010 | { | ||
1011 | part["type"] = "adv"; | ||
1012 | |||
1013 | break; | ||
1014 | } | ||
1015 | |||
1016 | case framepart_t::type_t::lex: | ||
1017 | { | ||
1018 | part["type"] = "lex"; | ||
1019 | part["value"] = fp.lexval; | ||
1020 | |||
1021 | break; | ||
1022 | } | ||
1023 | } | ||
1024 | |||
1025 | return part; | ||
1026 | }); | ||
1027 | |||
1028 | json fdata(fdatap); | ||
1029 | std::string marshall = fdata.dump(); | ||
1030 | |||
1031 | query = "INSERT INTO frames (group_id, data) VALUES (?, ?)"; | ||
1032 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1033 | { | ||
1034 | db_error(ppdb, query); | ||
1035 | } | ||
1036 | |||
1037 | sqlite3_bind_int(ppstmt, 1, gid); | ||
1038 | sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT); | ||
1039 | 289 | ||
1040 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 290 | // We only actually need to lookup verbs by sense key so we'll just |
291 | // ignore everything that isn't a verb. | ||
292 | std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$"); | ||
293 | std::smatch relation_data; | ||
294 | if (!std::regex_search(line, relation_data, relation)) | ||
1041 | { | 295 | { |
1042 | db_error(ppdb, query); | 296 | continue; |
1043 | } | 297 | } |
298 | |||
299 | int synset_id = stoi(relation_data[1]); | ||
300 | int wnum = stoi(relation_data[2]); | ||
301 | std::string sense_key = relation_data[3]; | ||
1044 | 302 | ||
1045 | sqlite3_finalize(ppstmt); | 303 | // We are treating this mapping as injective, which is not entirely |
1046 | } | 304 | // accurate. First, the WordNet table contains duplicate rows, so those |
1047 | 305 | // need to be ignored. More importantly, a small number of sense keys | |
1048 | for (auto member : mapping.second.members) | 306 | // (one for each letter of the Latin alphabet, plus 9 other words) each |
1049 | { | 307 | // map to two different words in the same synset which differ only by |
1050 | if (verbs.count(member) == 1) | 308 | // capitalization. Luckily, none of these exceptions are verbs, so we |
309 | // can pretend that the mapping is injective. | ||
310 | if (!wnSenseKeys_.count(sense_key)) | ||
1051 | { | 311 | { |
1052 | auto& v = verbs[member]; | 312 | std::pair<int, int> lookup(synset_id, wnum); |
1053 | 313 | if (wordByWnidAndWnum_.count(lookup)) | |
1054 | query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)"; | ||
1055 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1056 | { | ||
1057 | db_error(ppdb, query); | ||
1058 | } | ||
1059 | |||
1060 | sqlite3_bind_int(ppstmt, 1, v.id); | ||
1061 | sqlite3_bind_int(ppstmt, 2, gid); | ||
1062 | |||
1063 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1064 | { | 314 | { |
1065 | db_error(ppdb, query); | 315 | wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup); |
1066 | } | 316 | } |
1067 | |||
1068 | sqlite3_finalize(ppstmt); | ||
1069 | } | 317 | } |
1070 | } | 318 | } |
1071 | |||
1072 | ppgs.update(); | ||
1073 | } | 319 | } |
1074 | } | 320 | |
1075 | 321 | void generator::readVerbNet() | |
1076 | // Get nouns/adjectives/adverbs from WordNet | ||
1077 | // Useful relations: | ||
1078 | // - s: master list | ||
1079 | // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness) | ||
1080 | // - at: variation (e.g. a measurement can be standard or nonstandard) | ||
1081 | // - der: derivation (e.g. happy/happily, happily/happy) | ||
1082 | // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue) | ||
1083 | // - ins: instantiation (do we need this? let's see) | ||
1084 | // - mm: member meronymy/holonymy (e.g. family/mother, family/child) | ||
1085 | // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire) | ||
1086 | // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber) | ||
1087 | // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska) | ||
1088 | // mannernymy (e.g. something done quickly is done in a manner that is quick) | ||
1089 | // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific)) | ||
1090 | // - sim: synonymy (e.g. cheerful/happy, happy/cheerful) | ||
1091 | // - syntax: positioning flags for some adjectives | ||
1092 | std::string wnpref {argv[3]}; | ||
1093 | if (wnpref.back() != '/') | ||
1094 | { | ||
1095 | wnpref += '/'; | ||
1096 | } | ||
1097 | |||
1098 | // s table | ||
1099 | { | ||
1100 | std::ifstream wnsfile(wnpref + "wn_s.pl"); | ||
1101 | if (!wnsfile.is_open()) | ||
1102 | { | 322 | { |
1103 | std::cout << "Invalid WordNet data directory." << std::endl; | 323 | std::cout << "Reading frames from VerbNet..." << std::endl; |
1104 | print_usage(); | ||
1105 | } | ||
1106 | 324 | ||
1107 | std::list<std::string> lines; | 325 | DIR* dir; |
1108 | for (;;) | 326 | if ((dir = opendir(verbNetPath_.c_str())) == nullptr) |
1109 | { | ||
1110 | std::string line; | ||
1111 | if (!getline(wnsfile, line)) | ||
1112 | { | 327 | { |
1113 | break; | 328 | throw std::invalid_argument("Invalid VerbNet data directory"); |
1114 | } | 329 | } |
1115 | 330 | ||
1116 | if (line.back() == '\r') | 331 | struct dirent* ent; |
1117 | { | 332 | while ((ent = readdir(dir)) != nullptr) |
1118 | line.pop_back(); | ||
1119 | } | ||
1120 | |||
1121 | lines.push_back(line); | ||
1122 | } | ||
1123 | |||
1124 | progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size()); | ||
1125 | for (auto line : lines) | ||
1126 | { | ||
1127 | ppgs.update(); | ||
1128 | |||
1129 | std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$"); | ||
1130 | std::smatch relation_data; | ||
1131 | if (!std::regex_search(line, relation_data, relation)) | ||
1132 | { | 333 | { |
1133 | continue; | 334 | std::string filename(verbNetPath_); |
1134 | } | 335 | |
336 | if (filename.back() != '/') | ||
337 | { | ||
338 | filename += '/'; | ||
339 | } | ||
1135 | 340 | ||
1136 | int synset_id = stoi(relation_data[1]); | 341 | filename += ent->d_name; |
1137 | int wnum = stoi(relation_data[2]); | ||
1138 | std::string word = relation_data[3]; | ||
1139 | size_t word_it; | ||
1140 | while ((word_it = word.find("''")) != std::string::npos) | ||
1141 | { | ||
1142 | word.erase(word_it, 1); | ||
1143 | } | ||
1144 | 342 | ||
1145 | std::string query; | 343 | if (filename.rfind(".xml") != filename.size() - 4) |
1146 | switch (synset_id / 100000000) | ||
1147 | { | ||
1148 | case 1: // Noun | ||
1149 | { | 344 | { |
1150 | if (nouns.count(word) == 1) | 345 | continue; |
1151 | { | ||
1152 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)"; | ||
1153 | } else { | ||
1154 | query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)"; | ||
1155 | } | ||
1156 | |||
1157 | break; | ||
1158 | } | 346 | } |
1159 | 347 | ||
1160 | case 2: // Verb | 348 | xmlDocPtr doc = xmlParseFile(filename.c_str()); |
349 | if (doc == nullptr) | ||
1161 | { | 350 | { |
1162 | // Ignore | 351 | throw std::logic_error("Error opening " + filename); |
1163 | |||
1164 | break; | ||
1165 | } | 352 | } |
1166 | 353 | ||
1167 | case 3: // Adjective | 354 | xmlNodePtr top = xmlDocGetRootElement(doc); |
355 | if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS")))) | ||
1168 | { | 356 | { |
1169 | if (adjectives.count(word) == 1) | 357 | throw std::logic_error("Bad VerbNet file format: " + filename); |
1170 | { | ||
1171 | query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; | ||
1172 | } else { | ||
1173 | query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)"; | ||
1174 | } | ||
1175 | |||
1176 | break; | ||
1177 | } | 358 | } |
1178 | 359 | ||
1179 | case 4: // Adverb | 360 | try |
1180 | { | 361 | { |
1181 | if (adjectives.count(word) == 1) | 362 | createGroup(top); |
1182 | { | 363 | } catch (const std::exception& e) |
1183 | query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; | 364 | { |
1184 | } else { | 365 | std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename)); |
1185 | query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)"; | ||
1186 | } | ||
1187 | |||
1188 | break; | ||
1189 | } | 366 | } |
1190 | } | 367 | } |
368 | |||
369 | closedir(dir); | ||
370 | } | ||
1191 | 371 | ||
1192 | sqlite3_stmt* ppstmt; | 372 | void generator::readAgidInflections() |
1193 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | 373 | { |
374 | std::list<std::string> lines(readFile(agidPath_)); | ||
375 | progress ppgs("Reading inflections from AGID...", lines.size()); | ||
376 | |||
377 | for (std::string line : lines) | ||
1194 | { | 378 | { |
1195 | db_error(ppdb, query); | 379 | ppgs.update(); |
1196 | } | 380 | |
381 | int divider = line.find_first_of(" "); | ||
382 | std::string infinitive = line.substr(0, divider); | ||
383 | line = line.substr(divider+1); | ||
384 | char type = line[0]; | ||
1197 | 385 | ||
1198 | sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); | 386 | if (line[1] == '?') |
1199 | switch (synset_id / 100000000) | ||
1200 | { | ||
1201 | case 1: // Noun | ||
1202 | { | 387 | { |
1203 | sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { | 388 | line.erase(0, 4); |
1204 | return isupper(ch); | 389 | } else { |
1205 | }) ? 1 : 0)); | 390 | line.erase(0, 3); |
1206 | |||
1207 | sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size()); | ||
1208 | sqlite3_bind_int(ppstmt, 4, images[synset_id]); | ||
1209 | sqlite3_bind_int(ppstmt, 5, synset_id); | ||
1210 | |||
1211 | if (nouns.count(word) == 1) | ||
1212 | { | ||
1213 | sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT); | ||
1214 | } | ||
1215 | |||
1216 | break; | ||
1217 | } | 391 | } |
1218 | 392 | ||
1219 | case 3: // Adjective | 393 | if (!lemmaByBaseForm_.count(infinitive) && (type != 'V')) |
1220 | case 4: // Adverb | ||
1221 | { | 394 | { |
1222 | sqlite3_bind_int(ppstmt, 2, verbly::split<std::list<std::string>>(word, " ").size()); | 395 | continue; |
1223 | 396 | } | |
1224 | if (adjectives.count(word) == 1) | 397 | |
398 | lemma& curLemma = lookupOrCreateLemma(infinitive); | ||
399 | |||
400 | auto forms = split<std::vector<std::string>>(line, " | "); | ||
401 | for (std::string& inflForm : forms) | ||
402 | { | ||
403 | int sympos = inflForm.find_first_of(",?"); | ||
404 | if (sympos != std::string::npos) | ||
1225 | { | 405 | { |
1226 | sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); | 406 | inflForm = inflForm.substr(0, sympos); |
1227 | sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT); | ||
1228 | } | 407 | } |
1229 | |||
1230 | break; | ||
1231 | } | 408 | } |
1232 | } | ||
1233 | 409 | ||
1234 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 410 | switch (type) |
1235 | { | ||
1236 | db_error(ppdb, query); | ||
1237 | } | ||
1238 | |||
1239 | sqlite3_finalize(ppstmt); | ||
1240 | |||
1241 | query = "SELECT last_insert_rowid()"; | ||
1242 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1243 | { | ||
1244 | db_error(ppdb, query); | ||
1245 | } | ||
1246 | |||
1247 | if (sqlite3_step(ppstmt) != SQLITE_ROW) | ||
1248 | { | ||
1249 | db_error(ppdb, query); | ||
1250 | } | ||
1251 | |||
1252 | int rowid = sqlite3_column_int(ppstmt, 0); | ||
1253 | wn[synset_id][wnum] = rowid; | ||
1254 | |||
1255 | sqlite3_finalize(ppstmt); | ||
1256 | |||
1257 | std::string canonical(word); | ||
1258 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); | ||
1259 | if (pronunciations.count(canonical) == 1) | ||
1260 | { | ||
1261 | for (auto pronunciation : pronunciations[canonical]) | ||
1262 | { | 411 | { |
1263 | switch (synset_id / 100000000) | 412 | case 'V': |
1264 | { | 413 | { |
1265 | case 1: // Noun | 414 | if (forms.size() == 4) |
1266 | { | 415 | { |
1267 | if (!pronunciation.rhyme.empty()) | 416 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
1268 | { | 417 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1])); |
1269 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | 418 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2])); |
1270 | } else { | 419 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3])); |
1271 | query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | 420 | } else if (forms.size() == 3) |
1272 | } | ||
1273 | |||
1274 | break; | ||
1275 | } | ||
1276 | |||
1277 | case 3: // Adjective | ||
1278 | { | 421 | { |
1279 | if (!pronunciation.rhyme.empty()) | 422 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
1280 | { | 423 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0])); |
1281 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | 424 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1])); |
1282 | } else { | 425 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2])); |
1283 | query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | 426 | } else if (forms.size() == 8) |
1284 | } | 427 | { |
1285 | 428 | // As of AGID 2014.08.11, this is only "to be" | |
1286 | break; | 429 | curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0])); |
430 | curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2])); | ||
431 | curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3])); | ||
432 | curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4])); | ||
433 | } else { | ||
434 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
435 | // - may and shall do not conjugate the way we want them to | ||
436 | // - methinks only has a past tense and is an outlier | ||
437 | // - wit has five forms, and is archaic/obscure enough that we can ignore it for now | ||
438 | std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | ||
1287 | } | 439 | } |
1288 | 440 | ||
1289 | case 4: // Adverb | 441 | // For verbs in particular, we sometimes create a notion and a word |
442 | // from inflection data. Specifically, if there are not yet any | ||
443 | // verbs existing that have the same infinitive form. "Yet" means | ||
444 | // that this verb appears in the AGID data but not in either WordNet | ||
445 | // or VerbNet. | ||
446 | if (!wordsByBaseForm_.count(infinitive) | ||
447 | || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) { | ||
448 | return w->getNotion().getPartOfSpeech() == part_of_speech::verb; | ||
449 | })) | ||
1290 | { | 450 | { |
1291 | if (!pronunciation.rhyme.empty()) | 451 | notion& n = createNotion(part_of_speech::verb); |
1292 | { | 452 | createWord(n, curLemma); |
1293 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; | ||
1294 | } else { | ||
1295 | query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; | ||
1296 | } | ||
1297 | |||
1298 | break; | ||
1299 | } | 453 | } |
1300 | } | ||
1301 | |||
1302 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1303 | { | ||
1304 | db_error(ppdb, query); | ||
1305 | } | ||
1306 | |||
1307 | sqlite3_bind_int(ppstmt, 1, rowid); | ||
1308 | sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT); | ||
1309 | sqlite3_bind_int(ppstmt, 3, pronunciation.syllables); | ||
1310 | sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT); | ||
1311 | |||
1312 | if (!pronunciation.rhyme.empty()) | ||
1313 | { | ||
1314 | sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT); | ||
1315 | sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT); | ||
1316 | } | ||
1317 | 454 | ||
1318 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 455 | break; |
1319 | { | ||
1320 | db_error(ppdb, query); | ||
1321 | } | 456 | } |
1322 | |||
1323 | sqlite3_finalize(ppstmt); | ||
1324 | } | ||
1325 | } | ||
1326 | } | ||
1327 | } | ||
1328 | |||
1329 | // While we're working on s | ||
1330 | { | ||
1331 | progress ppgs("Writing word synonyms...", wn.size()); | ||
1332 | for (auto sense : wn) | ||
1333 | { | ||
1334 | ppgs.update(); | ||
1335 | 457 | ||
1336 | for (auto word1 : sense.second) | 458 | case 'A': |
1337 | { | ||
1338 | for (auto word2 : sense.second) | ||
1339 | { | ||
1340 | if (word1 != word2) | ||
1341 | { | 459 | { |
1342 | std::string query; | 460 | if (forms.size() == 2) |
1343 | switch (sense.first / 100000000) | ||
1344 | { | 461 | { |
1345 | case 1: // Noun | 462 | curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0])); |
1346 | { | 463 | curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1])); |
1347 | query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | 464 | } else { |
1348 | 465 | // As of AGID 2014.08.11, this is only "only", which has only the form "onliest" | |
1349 | break; | 466 | std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; |
1350 | } | 467 | } |
1351 | |||
1352 | case 2: // Verb | ||
1353 | { | ||
1354 | // Ignore | ||
1355 | |||
1356 | break; | ||
1357 | } | ||
1358 | |||
1359 | case 3: // Adjective | ||
1360 | { | ||
1361 | query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
1362 | 468 | ||
1363 | break; | 469 | break; |
1364 | } | 470 | } |
1365 | 471 | ||
1366 | case 4: // Adverb | 472 | case 'N': |
1367 | { | 473 | { |
1368 | query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | 474 | if (forms.size() == 1) |
1369 | |||
1370 | break; | ||
1371 | } | ||
1372 | } | ||
1373 | |||
1374 | sqlite3_stmt* ppstmt; | ||
1375 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1376 | { | ||
1377 | db_error(ppdb, query); | ||
1378 | } | ||
1379 | |||
1380 | sqlite3_bind_int(ppstmt, 1, word1.second); | ||
1381 | sqlite3_bind_int(ppstmt, 2, word2.second); | ||
1382 | |||
1383 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1384 | { | 475 | { |
1385 | db_error(ppdb, query); | 476 | curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0])); |
477 | } else { | ||
478 | // As of AGID 2014.08.11, this is non-existent. | ||
479 | std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; | ||
1386 | } | 480 | } |
1387 | 481 | ||
1388 | sqlite3_finalize(ppstmt); | 482 | break; |
1389 | } | 483 | } |
1390 | } | 484 | } |
1391 | } | 485 | } |
1392 | } | 486 | } |
1393 | } | ||
1394 | |||
1395 | // ant table | ||
1396 | { | ||
1397 | std::ifstream wnantfile(wnpref + "wn_ant.pl"); | ||
1398 | if (!wnantfile.is_open()) | ||
1399 | { | ||
1400 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1401 | print_usage(); | ||
1402 | } | ||
1403 | |||
1404 | std::list<std::string> lines; | ||
1405 | for (;;) | ||
1406 | { | ||
1407 | std::string line; | ||
1408 | if (!getline(wnantfile, line)) | ||
1409 | { | ||
1410 | break; | ||
1411 | } | ||
1412 | 487 | ||
1413 | if (line.back() == '\r') | 488 | void generator::readPrepositions() |
1414 | { | ||
1415 | line.pop_back(); | ||
1416 | } | ||
1417 | |||
1418 | lines.push_back(line); | ||
1419 | } | ||
1420 | |||
1421 | progress ppgs("Writing antonyms...", lines.size()); | ||
1422 | for (auto line : lines) | ||
1423 | { | 489 | { |
1424 | ppgs.update(); | 490 | std::list<std::string> lines(readFile("prepositions.txt")); |
491 | progress ppgs("Reading prepositions...", lines.size()); | ||
1425 | 492 | ||
1426 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | 493 | for (std::string line : lines) |
1427 | std::smatch relation_data; | ||
1428 | if (!std::regex_search(line, relation_data, relation)) | ||
1429 | { | ||
1430 | continue; | ||
1431 | } | ||
1432 | |||
1433 | int synset_id_1 = stoi(relation_data[1]); | ||
1434 | int wnum_1 = stoi(relation_data[2]); | ||
1435 | int synset_id_2 = stoi(relation_data[3]); | ||
1436 | int wnum_2 = stoi(relation_data[4]); | ||
1437 | |||
1438 | std::string query; | ||
1439 | switch (synset_id_1 / 100000000) | ||
1440 | { | 494 | { |
1441 | case 1: // Noun | 495 | ppgs.update(); |
1442 | { | ||
1443 | query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
1444 | 496 | ||
1445 | break; | 497 | std::regex relation("^([^:]+): (.+)"); |
1446 | } | 498 | std::smatch relation_data; |
1447 | 499 | std::regex_search(line, relation_data, relation); | |
1448 | case 2: // Verb | 500 | std::string prep = relation_data[1]; |
1449 | { | 501 | auto groups = split<std::list<std::string>>(relation_data[2], ", "); |
1450 | // Ignore | ||
1451 | 502 | ||
1452 | break; | 503 | notion& n = createNotion(part_of_speech::preposition); |
1453 | } | 504 | lemma& l = lookupOrCreateLemma(prep); |
1454 | 505 | word& w = createWord(n, l); | |
1455 | case 3: // Adjective | ||
1456 | { | ||
1457 | query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"; | ||
1458 | 506 | ||
1459 | break; | 507 | n.setPrepositionGroups(groups); |
1460 | } | ||
1461 | |||
1462 | case 4: // Adverb | ||
1463 | { | ||
1464 | query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
1465 | |||
1466 | break; | ||
1467 | } | ||
1468 | } | ||
1469 | |||
1470 | sqlite3_stmt* ppstmt; | ||
1471 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
1472 | { | ||
1473 | db_error(ppdb, query); | ||
1474 | } | ||
1475 | |||
1476 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
1477 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
1478 | |||
1479 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1480 | { | ||
1481 | db_error(ppdb, query); | ||
1482 | } | ||
1483 | |||
1484 | sqlite3_finalize(ppstmt); | ||
1485 | } | ||
1486 | } | ||
1487 | |||
1488 | // at table | ||
1489 | { | ||
1490 | std::ifstream wnatfile(wnpref + "wn_at.pl"); | ||
1491 | if (!wnatfile.is_open()) | ||
1492 | { | ||
1493 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1494 | print_usage(); | ||
1495 | } | ||
1496 | |||
1497 | std::list<std::string> lines; | ||
1498 | for (;;) | ||
1499 | { | ||
1500 | std::string line; | ||
1501 | if (!getline(wnatfile, line)) | ||
1502 | { | ||
1503 | break; | ||
1504 | } | 508 | } |
1505 | |||
1506 | if (line.back() == '\r') | ||
1507 | { | ||
1508 | line.pop_back(); | ||
1509 | } | ||
1510 | |||
1511 | lines.push_back(line); | ||
1512 | } | 509 | } |
1513 | 510 | ||
1514 | progress ppgs("Writing variations...", lines.size()); | 511 | void generator::readCmudictPronunciations() |
1515 | for (auto line : lines) | ||
1516 | { | 512 | { |
1517 | ppgs.update(); | 513 | std::list<std::string> lines(readFile(cmudictPath_)); |
514 | progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); | ||
1518 | 515 | ||
1519 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); | 516 | for (std::string line : lines) |
1520 | std::smatch relation_data; | ||
1521 | if (!std::regex_search(line, relation_data, relation)) | ||
1522 | { | 517 | { |
1523 | continue; | 518 | ppgs.update(); |
1524 | } | 519 | |
1525 | 520 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | |
1526 | int synset_id_1 = stoi(relation_data[1]); | 521 | std::smatch phoneme_data; |
1527 | int synset_id_2 = stoi(relation_data[2]); | 522 | if (std::regex_search(line, phoneme_data, phoneme)) |
1528 | std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)"); | ||
1529 | |||
1530 | for (auto mapping1 : wn[synset_id_1]) | ||
1531 | { | ||
1532 | for (auto mapping2 : wn[synset_id_2]) | ||
1533 | { | 523 | { |
1534 | sqlite3_stmt* ppstmt; | 524 | std::string canonical(phoneme_data[1]); |
1535 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 525 | std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); |
1536 | { | ||
1537 | db_error(ppdb, query); | ||
1538 | } | ||
1539 | |||
1540 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | ||
1541 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1542 | 526 | ||
1543 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 527 | if (!formByText_.count(canonical)) |
1544 | { | 528 | { |
1545 | db_error(ppdb, query); | 529 | continue; |
1546 | } | 530 | } |
1547 | 531 | ||
1548 | sqlite3_finalize(ppstmt); | 532 | std::string phonemes = phoneme_data[2]; |
533 | pronunciations_.emplace_back(phonemes); | ||
534 | pronunciation& p = pronunciations_.back(); | ||
535 | formByText_.at(canonical)->addPronunciation(p); | ||
1549 | } | 536 | } |
1550 | } | 537 | } |
1551 | } | 538 | } |
1552 | } | ||
1553 | |||
1554 | // der table | ||
1555 | { | ||
1556 | std::ifstream wnderfile(wnpref + "wn_der.pl"); | ||
1557 | if (!wnderfile.is_open()) | ||
1558 | { | ||
1559 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1560 | print_usage(); | ||
1561 | } | ||
1562 | 539 | ||
1563 | std::list<std::string> lines; | 540 | void generator::writeSchema() |
1564 | for (;;) | ||
1565 | { | 541 | { |
1566 | std::string line; | 542 | std::ifstream file("schema.sql"); |
1567 | if (!getline(wnderfile, line)) | 543 | if (!file) |
1568 | { | 544 | { |
1569 | break; | 545 | throw std::invalid_argument("Could not find database schema"); |
1570 | } | 546 | } |
1571 | 547 | ||
1572 | if (line.back() == '\r') | 548 | std::ostringstream schemaBuilder; |
549 | std::string line; | ||
550 | while (std::getline(file, line)) | ||
1573 | { | 551 | { |
1574 | line.pop_back(); | 552 | if (line.back() == '\r') |
553 | { | ||
554 | line.pop_back(); | ||
555 | } | ||
556 | |||
557 | schemaBuilder << line; | ||
1575 | } | 558 | } |
1576 | 559 | ||
1577 | lines.push_back(line); | 560 | std::string schema = schemaBuilder.str(); |
561 | auto queries = split<std::list<std::string>>(schema, ";"); | ||
562 | progress ppgs("Writing database schema...", queries.size()); | ||
563 | for (std::string query : queries) | ||
564 | { | ||
565 | if (!queries.empty()) | ||
566 | { | ||
567 | db_.runQuery(query); | ||
568 | } | ||
569 | |||
570 | ppgs.update(); | ||
571 | } | ||
1578 | } | 572 | } |
1579 | 573 | ||
1580 | progress ppgs("Writing morphological derivation...", lines.size()); | 574 | void generator::dumpObjects() |
1581 | for (auto line : lines) | ||
1582 | { | 575 | { |
1583 | ppgs.update(); | ||
1584 | |||
1585 | std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); | ||
1586 | std::smatch relation_data; | ||
1587 | if (!std::regex_search(line, relation_data, relation)) | ||
1588 | { | 576 | { |
1589 | continue; | 577 | progress ppgs("Writing notions...", notions_.size()); |
578 | |||
579 | for (notion& n : notions_) | ||
580 | { | ||
581 | db_ << n; | ||
582 | |||
583 | ppgs.update(); | ||
584 | } | ||
1590 | } | 585 | } |
1591 | 586 | ||
1592 | int synset_id_1 = stoi(relation_data[1]); | ||
1593 | int wnum_1 = stoi(relation_data[2]); | ||
1594 | int synset_id_2 = stoi(relation_data[3]); | ||
1595 | int wnum_2 = stoi(relation_data[4]); | ||
1596 | std::string query; | ||
1597 | switch (synset_id_1 / 100000000) | ||
1598 | { | 587 | { |
1599 | case 1: // Noun | 588 | progress ppgs("Writing words...", words_.size()); |
589 | |||
590 | for (word& w : words_) | ||
1600 | { | 591 | { |
1601 | switch (synset_id_2 / 100000000) | 592 | db_ << w; |
1602 | { | ||
1603 | case 1: // Noun | ||
1604 | { | ||
1605 | query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)"; | ||
1606 | break; | ||
1607 | } | ||
1608 | |||
1609 | case 3: // Adjective | ||
1610 | { | ||
1611 | query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)"; | ||
1612 | break; | ||
1613 | } | ||
1614 | |||
1615 | case 4: // Adverb | ||
1616 | { | ||
1617 | query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)"; | ||
1618 | break; | ||
1619 | } | ||
1620 | } | ||
1621 | 593 | ||
1622 | break; | 594 | ppgs.update(); |
1623 | } | 595 | } |
596 | } | ||
597 | |||
598 | { | ||
599 | progress ppgs("Writing lemmas...", lemmas_.size()); | ||
1624 | 600 | ||
1625 | case 3: // Adjective | 601 | for (lemma& l : lemmas_) |
1626 | { | 602 | { |
1627 | switch (synset_id_2 / 100000000) | 603 | db_ << l; |
1628 | { | ||
1629 | case 1: // Noun | ||
1630 | { | ||
1631 | query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)"; | ||
1632 | break; | ||
1633 | } | ||
1634 | |||
1635 | case 3: // Adjective | ||
1636 | { | ||
1637 | query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)"; | ||
1638 | break; | ||
1639 | } | ||
1640 | |||
1641 | case 4: // Adverb | ||
1642 | { | ||
1643 | query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)"; | ||
1644 | break; | ||
1645 | } | ||
1646 | } | ||
1647 | 604 | ||
1648 | break; | 605 | ppgs.update(); |
1649 | } | 606 | } |
607 | } | ||
608 | |||
609 | { | ||
610 | progress ppgs("Writing forms...", forms_.size()); | ||
1650 | 611 | ||
1651 | case 4: // Adverb | 612 | for (form& f : forms_) |
1652 | { | 613 | { |
1653 | switch (synset_id_2 / 100000000) | 614 | db_ << f; |
1654 | { | ||
1655 | case 1: // Noun | ||
1656 | { | ||
1657 | query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)"; | ||
1658 | break; | ||
1659 | } | ||
1660 | |||
1661 | case 3: // Adjective | ||
1662 | { | ||
1663 | query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)"; | ||
1664 | break; | ||
1665 | } | ||
1666 | |||
1667 | case 4: // Adverb | ||
1668 | { | ||
1669 | query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)"; | ||
1670 | break; | ||
1671 | } | ||
1672 | } | ||
1673 | 615 | ||
1674 | break; | 616 | ppgs.update(); |
1675 | } | 617 | } |
1676 | } | 618 | } |
1677 | 619 | ||
1678 | sqlite3_stmt* ppstmt; | ||
1679 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
1680 | { | 620 | { |
1681 | db_error(ppdb, query); | 621 | progress ppgs("Writing pronunciations...", pronunciations_.size()); |
622 | |||
623 | for (pronunciation& p : pronunciations_) | ||
624 | { | ||
625 | db_ << p; | ||
626 | |||
627 | ppgs.update(); | ||
628 | } | ||
1682 | } | 629 | } |
1683 | 630 | ||
1684 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
1685 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
1686 | |||
1687 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1688 | { | 631 | { |
1689 | db_error(ppdb, query); | 632 | progress ppgs("Writing verb groups...", groups_.size()); |
633 | |||
634 | for (group& g : groups_) | ||
635 | { | ||
636 | db_ << g; | ||
637 | |||
638 | ppgs.update(); | ||
639 | } | ||
1690 | } | 640 | } |
1691 | 641 | ||
1692 | sqlite3_finalize(ppstmt); | ||
1693 | } | ||
1694 | } | ||
1695 | |||
1696 | // hyp table | ||
1697 | { | ||
1698 | std::ifstream wnhypfile(wnpref + "wn_hyp.pl"); | ||
1699 | if (!wnhypfile.is_open()) | ||
1700 | { | ||
1701 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1702 | print_usage(); | ||
1703 | } | ||
1704 | |||
1705 | std::list<std::string> lines; | ||
1706 | for (;;) | ||
1707 | { | ||
1708 | std::string line; | ||
1709 | if (!getline(wnhypfile, line)) | ||
1710 | { | ||
1711 | break; | ||
1712 | } | ||
1713 | |||
1714 | if (line.back() == '\r') | ||
1715 | { | 642 | { |
1716 | line.pop_back(); | 643 | progress ppgs("Writing verb frames...", frames_.size()); |
644 | |||
645 | for (frame& f : frames_) | ||
646 | { | ||
647 | db_ << f; | ||
648 | |||
649 | ppgs.update(); | ||
650 | } | ||
1717 | } | 651 | } |
1718 | |||
1719 | lines.push_back(line); | ||
1720 | } | 652 | } |
1721 | 653 | ||
1722 | progress ppgs("Writing hypernyms...", lines.size()); | 654 | void generator::readWordNetAntonymy() |
1723 | for (auto line : lines) | ||
1724 | { | 655 | { |
1725 | ppgs.update(); | 656 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); |
1726 | 657 | progress ppgs("Writing antonyms...", lines.size()); | |
1727 | std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); | 658 | for (auto line : lines) |
1728 | std::smatch relation_data; | ||
1729 | if (!std::regex_search(line, relation_data, relation)) | ||
1730 | { | 659 | { |
1731 | continue; | 660 | ppgs.update(); |
1732 | } | ||
1733 | |||
1734 | int synset_id_1 = stoi(relation_data[1]); | ||
1735 | int synset_id_2 = stoi(relation_data[2]); | ||
1736 | std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)"); | ||
1737 | 661 | ||
1738 | for (auto mapping1 : wn[synset_id_1]) | 662 | std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); |
1739 | { | 663 | std::smatch relation_data; |
1740 | for (auto mapping2 : wn[synset_id_2]) | 664 | if (!std::regex_search(line, relation_data, relation)) |
1741 | { | 665 | { |
1742 | sqlite3_stmt* ppstmt; | 666 | continue; |
1743 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 667 | } |
1744 | { | 668 | |
1745 | db_error(ppdb, query); | 669 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); |
1746 | } | 670 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); |
1747 | 671 | ||
1748 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 672 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) |
1749 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 673 | { |
674 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
675 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
1750 | 676 | ||
1751 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 677 | std::list<field> fields; |
1752 | { | 678 | fields.emplace_back("antonym_1_id", word1.getId()); |
1753 | db_error(ppdb, query); | 679 | fields.emplace_back("antonym_2_id", word2.getId()); |
1754 | } | ||
1755 | 680 | ||
1756 | sqlite3_finalize(ppstmt); | 681 | db_.insertIntoTable("antonymy", std::move(fields)); |
1757 | } | 682 | } |
1758 | } | 683 | } |
1759 | } | 684 | } |
1760 | } | ||
1761 | |||
1762 | // ins table | ||
1763 | { | ||
1764 | std::ifstream wninsfile(wnpref + "wn_ins.pl"); | ||
1765 | if (!wninsfile.is_open()) | ||
1766 | { | ||
1767 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1768 | print_usage(); | ||
1769 | } | ||
1770 | |||
1771 | std::list<std::string> lines; | ||
1772 | for (;;) | ||
1773 | { | ||
1774 | std::string line; | ||
1775 | if (!getline(wninsfile, line)) | ||
1776 | { | ||
1777 | break; | ||
1778 | } | ||
1779 | 685 | ||
1780 | if (line.back() == '\r') | 686 | void generator::readWordNetVariation() |
687 | { | ||
688 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl")); | ||
689 | progress ppgs("Writing variation...", lines.size()); | ||
690 | for (auto line : lines) | ||
1781 | { | 691 | { |
1782 | line.pop_back(); | 692 | ppgs.update(); |
1783 | } | ||
1784 | 693 | ||
1785 | lines.push_back(line); | 694 | std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); |
695 | std::smatch relation_data; | ||
696 | if (!std::regex_search(line, relation_data, relation)) | ||
697 | { | ||
698 | continue; | ||
699 | } | ||
700 | |||
701 | int lookup1 = std::stoi(relation_data[1]); | ||
702 | int lookup2 = std::stoi(relation_data[2]); | ||
703 | |||
704 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
705 | { | ||
706 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
707 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
708 | |||
709 | std::list<field> fields; | ||
710 | fields.emplace_back("noun_id", notion1.getId()); | ||
711 | fields.emplace_back("adjective_id", notion2.getId()); | ||
712 | |||
713 | db_.insertIntoTable("variation", std::move(fields)); | ||
714 | } | ||
715 | } | ||
1786 | } | 716 | } |
1787 | 717 | ||
1788 | progress ppgs("Writing instantiations...", lines.size()); | 718 | void generator::readWordNetClasses() |
1789 | for (auto line : lines) | ||
1790 | { | 719 | { |
1791 | ppgs.update(); | 720 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); |
1792 | 721 | progress ppgs("Writing usage, topicality, and regionality...", lines.size()); | |
1793 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); | 722 | for (auto line : lines) |
1794 | std::smatch relation_data; | ||
1795 | if (!std::regex_search(line, relation_data, relation)) | ||
1796 | { | 723 | { |
1797 | continue; | 724 | ppgs.update(); |
1798 | } | ||
1799 | |||
1800 | int synset_id_1 = stoi(relation_data[1]); | ||
1801 | int synset_id_2 = stoi(relation_data[2]); | ||
1802 | std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)"); | ||
1803 | 725 | ||
1804 | for (auto mapping1 : wn[synset_id_1]) | 726 | std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\."); |
1805 | { | 727 | std::smatch relation_data; |
1806 | for (auto mapping2 : wn[synset_id_2]) | 728 | if (!std::regex_search(line, relation_data, relation)) |
729 | { | ||
730 | continue; | ||
731 | } | ||
732 | |||
733 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | ||
734 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | ||
735 | std::string class_type = relation_data[5]; | ||
736 | |||
737 | std::string table_name; | ||
738 | if (class_type == "t") | ||
739 | { | ||
740 | table_name += "topicality"; | ||
741 | } else if (class_type == "u") | ||
742 | { | ||
743 | table_name += "usage"; | ||
744 | } else if (class_type == "r") | ||
745 | { | ||
746 | table_name += "regionality"; | ||
747 | } | ||
748 | |||
749 | std::list<int> leftJoin; | ||
750 | std::list<int> rightJoin; | ||
751 | |||
752 | if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first))) | ||
1807 | { | 753 | { |
1808 | sqlite3_stmt* ppstmt; | 754 | std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) { |
1809 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 755 | return w->getId(); |
756 | }); | ||
757 | } else if (wordByWnidAndWnum_.count(lookup1)) { | ||
758 | leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId()); | ||
759 | } | ||
760 | |||
761 | if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first))) | ||
762 | { | ||
763 | std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) { | ||
764 | return w->getId(); | ||
765 | }); | ||
766 | } else if (wordByWnidAndWnum_.count(lookup2)) { | ||
767 | rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId()); | ||
768 | } | ||
769 | |||
770 | for (int word1 : leftJoin) | ||
771 | { | ||
772 | for (int word2 : rightJoin) | ||
1810 | { | 773 | { |
1811 | db_error(ppdb, query); | 774 | std::list<field> fields; |
1812 | } | 775 | fields.emplace_back("term_id", word1); |
776 | fields.emplace_back("domain_id", word2); | ||
1813 | 777 | ||
1814 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 778 | db_.insertIntoTable(table_name, std::move(fields)); |
1815 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | ||
1816 | |||
1817 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
1818 | { | ||
1819 | db_error(ppdb, query); | ||
1820 | } | 779 | } |
1821 | |||
1822 | sqlite3_finalize(ppstmt); | ||
1823 | } | 780 | } |
1824 | } | 781 | } |
1825 | } | 782 | } |
1826 | } | ||
1827 | |||
1828 | // mm table | ||
1829 | { | ||
1830 | std::ifstream wnmmfile(wnpref + "wn_mm.pl"); | ||
1831 | if (!wnmmfile.is_open()) | ||
1832 | { | ||
1833 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1834 | print_usage(); | ||
1835 | } | ||
1836 | |||
1837 | std::list<std::string> lines; | ||
1838 | for (;;) | ||
1839 | { | ||
1840 | std::string line; | ||
1841 | if (!getline(wnmmfile, line)) | ||
1842 | { | ||
1843 | break; | ||
1844 | } | ||
1845 | 783 | ||
1846 | if (line.back() == '\r') | 784 | void generator::readWordNetCausality() |
785 | { | ||
786 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl")); | ||
787 | progress ppgs("Writing causality...", lines.size()); | ||
788 | for (auto line : lines) | ||
1847 | { | 789 | { |
1848 | line.pop_back(); | 790 | ppgs.update(); |
1849 | } | ||
1850 | 791 | ||
1851 | lines.push_back(line); | 792 | std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\."); |
793 | std::smatch relation_data; | ||
794 | if (!std::regex_search(line, relation_data, relation)) | ||
795 | { | ||
796 | continue; | ||
797 | } | ||
798 | |||
799 | int lookup1 = std::stoi(relation_data[1]); | ||
800 | int lookup2 = std::stoi(relation_data[2]); | ||
801 | |||
802 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
803 | { | ||
804 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
805 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
806 | |||
807 | std::list<field> fields; | ||
808 | fields.emplace_back("effect_id", notion1.getId()); | ||
809 | fields.emplace_back("cause_id", notion2.getId()); | ||
810 | |||
811 | db_.insertIntoTable("causality", std::move(fields)); | ||
812 | } | ||
813 | } | ||
1852 | } | 814 | } |
1853 | 815 | ||
1854 | progress ppgs("Writing member meronyms...", lines.size()); | 816 | void generator::readWordNetEntailment() |
1855 | for (auto line : lines) | ||
1856 | { | 817 | { |
1857 | ppgs.update(); | 818 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl")); |
1858 | 819 | progress ppgs("Writing entailment...", lines.size()); | |
1859 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); | 820 | for (auto line : lines) |
1860 | std::smatch relation_data; | ||
1861 | if (!std::regex_search(line, relation_data, relation)) | ||
1862 | { | 821 | { |
1863 | continue; | 822 | ppgs.update(); |
1864 | } | ||
1865 | 823 | ||
1866 | int synset_id_1 = stoi(relation_data[1]); | 824 | std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\."); |
1867 | int synset_id_2 = stoi(relation_data[2]); | 825 | std::smatch relation_data; |
1868 | std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | 826 | if (!std::regex_search(line, relation_data, relation)) |
1869 | |||
1870 | for (auto mapping1 : wn[synset_id_1]) | ||
1871 | { | ||
1872 | for (auto mapping2 : wn[synset_id_2]) | ||
1873 | { | 827 | { |
1874 | sqlite3_stmt* ppstmt; | 828 | continue; |
1875 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 829 | } |
1876 | { | 830 | |
1877 | db_error(ppdb, query); | 831 | int lookup1 = std::stoi(relation_data[1]); |
1878 | } | 832 | int lookup2 = std::stoi(relation_data[2]); |
1879 | 833 | ||
1880 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 834 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) |
1881 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 835 | { |
836 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
837 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
1882 | 838 | ||
1883 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 839 | std::list<field> fields; |
1884 | { | 840 | fields.emplace_back("given_id", notion1.getId()); |
1885 | db_error(ppdb, query); | 841 | fields.emplace_back("entailment_id", notion2.getId()); |
1886 | } | ||
1887 | 842 | ||
1888 | sqlite3_finalize(ppstmt); | 843 | db_.insertIntoTable("entailment", std::move(fields)); |
1889 | } | 844 | } |
1890 | } | 845 | } |
1891 | } | 846 | } |
1892 | } | 847 | |
1893 | 848 | void generator::readWordNetHypernymy() | |
1894 | // ms table | ||
1895 | { | ||
1896 | std::ifstream wnmsfile(wnpref + "wn_ms.pl"); | ||
1897 | if (!wnmsfile.is_open()) | ||
1898 | { | ||
1899 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1900 | print_usage(); | ||
1901 | } | ||
1902 | |||
1903 | std::list<std::string> lines; | ||
1904 | for (;;) | ||
1905 | { | 849 | { |
1906 | std::string line; | 850 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl")); |
1907 | if (!getline(wnmsfile, line)) | 851 | progress ppgs("Writing hypernymy...", lines.size()); |
852 | for (auto line : lines) | ||
1908 | { | 853 | { |
1909 | break; | 854 | ppgs.update(); |
855 | |||
856 | std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\."); | ||
857 | std::smatch relation_data; | ||
858 | if (!std::regex_search(line, relation_data, relation)) | ||
859 | { | ||
860 | continue; | ||
861 | } | ||
862 | |||
863 | int lookup1 = std::stoi(relation_data[1]); | ||
864 | int lookup2 = std::stoi(relation_data[2]); | ||
865 | |||
866 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
867 | { | ||
868 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
869 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
870 | |||
871 | std::list<field> fields; | ||
872 | fields.emplace_back("hyponym_id", notion1.getId()); | ||
873 | fields.emplace_back("hypernym_id", notion2.getId()); | ||
874 | |||
875 | db_.insertIntoTable("hypernymy", std::move(fields)); | ||
876 | } | ||
1910 | } | 877 | } |
878 | } | ||
1911 | 879 | ||
1912 | if (line.back() == '\r') | 880 | void generator::readWordNetInstantiation() |
881 | { | ||
882 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl")); | ||
883 | progress ppgs("Writing instantiation...", lines.size()); | ||
884 | for (auto line : lines) | ||
1913 | { | 885 | { |
1914 | line.pop_back(); | 886 | ppgs.update(); |
1915 | } | ||
1916 | 887 | ||
1917 | lines.push_back(line); | 888 | std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); |
889 | std::smatch relation_data; | ||
890 | if (!std::regex_search(line, relation_data, relation)) | ||
891 | { | ||
892 | continue; | ||
893 | } | ||
894 | |||
895 | int lookup1 = std::stoi(relation_data[1]); | ||
896 | int lookup2 = std::stoi(relation_data[2]); | ||
897 | |||
898 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
899 | { | ||
900 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
901 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
902 | |||
903 | std::list<field> fields; | ||
904 | fields.emplace_back("instance_id", notion1.getId()); | ||
905 | fields.emplace_back("class_id", notion2.getId()); | ||
906 | |||
907 | db_.insertIntoTable("instantiation", std::move(fields)); | ||
908 | } | ||
909 | } | ||
1918 | } | 910 | } |
1919 | 911 | ||
1920 | progress ppgs("Writing substance meronyms...", lines.size()); | 912 | void generator::readWordNetMemberMeronymy() |
1921 | for (auto line : lines) | ||
1922 | { | 913 | { |
1923 | ppgs.update(); | 914 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl")); |
1924 | 915 | progress ppgs("Writing member meronymy...", lines.size()); | |
1925 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); | 916 | for (auto line : lines) |
1926 | std::smatch relation_data; | ||
1927 | if (!std::regex_search(line, relation_data, relation)) | ||
1928 | { | 917 | { |
1929 | continue; | 918 | ppgs.update(); |
1930 | } | ||
1931 | |||
1932 | int synset_id_1 = stoi(relation_data[1]); | ||
1933 | int synset_id_2 = stoi(relation_data[2]); | ||
1934 | std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
1935 | 919 | ||
1936 | for (auto mapping1 : wn[synset_id_1]) | 920 | std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); |
1937 | { | 921 | std::smatch relation_data; |
1938 | for (auto mapping2 : wn[synset_id_2]) | 922 | if (!std::regex_search(line, relation_data, relation)) |
1939 | { | 923 | { |
1940 | sqlite3_stmt* ppstmt; | 924 | continue; |
1941 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 925 | } |
1942 | { | 926 | |
1943 | db_error(ppdb, query); | 927 | int lookup1 = std::stoi(relation_data[1]); |
1944 | } | 928 | int lookup2 = std::stoi(relation_data[2]); |
929 | |||
930 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
931 | { | ||
932 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
933 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
1945 | 934 | ||
1946 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 935 | std::list<field> fields; |
1947 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 936 | fields.emplace_back("holonym_id", notion1.getId()); |
937 | fields.emplace_back("meronym_id", notion2.getId()); | ||
1948 | 938 | ||
1949 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 939 | db_.insertIntoTable("member_meronymy", std::move(fields)); |
1950 | { | ||
1951 | db_error(ppdb, query); | ||
1952 | } | ||
1953 | |||
1954 | sqlite3_finalize(ppstmt); | ||
1955 | } | 940 | } |
1956 | } | 941 | } |
1957 | } | 942 | } |
1958 | } | 943 | |
1959 | 944 | void generator::readWordNetPartMeronymy() | |
1960 | // mm table | ||
1961 | { | ||
1962 | std::ifstream wnmpfile(wnpref + "wn_mp.pl"); | ||
1963 | if (!wnmpfile.is_open()) | ||
1964 | { | ||
1965 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
1966 | print_usage(); | ||
1967 | } | ||
1968 | |||
1969 | std::list<std::string> lines; | ||
1970 | for (;;) | ||
1971 | { | 945 | { |
1972 | std::string line; | 946 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl")); |
1973 | if (!getline(wnmpfile, line)) | 947 | progress ppgs("Writing part meronymy...", lines.size()); |
948 | for (auto line : lines) | ||
1974 | { | 949 | { |
1975 | break; | 950 | ppgs.update(); |
951 | |||
952 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | ||
953 | std::smatch relation_data; | ||
954 | if (!std::regex_search(line, relation_data, relation)) | ||
955 | { | ||
956 | continue; | ||
957 | } | ||
958 | |||
959 | int lookup1 = std::stoi(relation_data[1]); | ||
960 | int lookup2 = std::stoi(relation_data[2]); | ||
961 | |||
962 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
963 | { | ||
964 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
965 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
966 | |||
967 | std::list<field> fields; | ||
968 | fields.emplace_back("holonym_id", notion1.getId()); | ||
969 | fields.emplace_back("meronym_id", notion2.getId()); | ||
970 | |||
971 | db_.insertIntoTable("part_meronymy", std::move(fields)); | ||
972 | } | ||
1976 | } | 973 | } |
974 | } | ||
1977 | 975 | ||
1978 | if (line.back() == '\r') | 976 | void generator::readWordNetSubstanceMeronymy() |
977 | { | ||
978 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl")); | ||
979 | progress ppgs("Writing substance meronymy...", lines.size()); | ||
980 | for (auto line : lines) | ||
1979 | { | 981 | { |
1980 | line.pop_back(); | 982 | ppgs.update(); |
1981 | } | ||
1982 | 983 | ||
1983 | lines.push_back(line); | 984 | std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); |
985 | std::smatch relation_data; | ||
986 | if (!std::regex_search(line, relation_data, relation)) | ||
987 | { | ||
988 | continue; | ||
989 | } | ||
990 | |||
991 | int lookup1 = std::stoi(relation_data[1]); | ||
992 | int lookup2 = std::stoi(relation_data[2]); | ||
993 | |||
994 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) | ||
995 | { | ||
996 | notion& notion1 = *notionByWnid_.at(lookup1); | ||
997 | notion& notion2 = *notionByWnid_.at(lookup2); | ||
998 | |||
999 | std::list<field> fields; | ||
1000 | fields.emplace_back("holonym_id", notion1.getId()); | ||
1001 | fields.emplace_back("meronym_id", notion2.getId()); | ||
1002 | |||
1003 | db_.insertIntoTable("substance_meronymy", std::move(fields)); | ||
1004 | } | ||
1005 | } | ||
1984 | } | 1006 | } |
1985 | 1007 | ||
1986 | progress ppgs("Writing part meronyms...", lines.size()); | 1008 | void generator::readWordNetPertainymy() |
1987 | for (auto line : lines) | ||
1988 | { | 1009 | { |
1989 | ppgs.update(); | 1010 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); |
1990 | 1011 | progress ppgs("Writing pertainymy and mannernymy...", lines.size()); | |
1991 | std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); | 1012 | for (auto line : lines) |
1992 | std::smatch relation_data; | ||
1993 | if (!std::regex_search(line, relation_data, relation)) | ||
1994 | { | 1013 | { |
1995 | continue; | 1014 | ppgs.update(); |
1996 | } | ||
1997 | |||
1998 | int synset_id_1 = stoi(relation_data[1]); | ||
1999 | int synset_id_2 = stoi(relation_data[2]); | ||
2000 | std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); | ||
2001 | 1015 | ||
2002 | for (auto mapping1 : wn[synset_id_1]) | 1016 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); |
2003 | { | 1017 | std::smatch relation_data; |
2004 | for (auto mapping2 : wn[synset_id_2]) | 1018 | if (!std::regex_search(line, relation_data, relation)) |
2005 | { | 1019 | { |
2006 | sqlite3_stmt* ppstmt; | 1020 | continue; |
2007 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | 1021 | } |
2008 | { | 1022 | |
2009 | db_error(ppdb, query); | 1023 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); |
2010 | } | 1024 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); |
1025 | |||
1026 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | ||
1027 | { | ||
1028 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
1029 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
2011 | 1030 | ||
2012 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 1031 | if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective) |
2013 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 1032 | { |
1033 | std::list<field> fields; | ||
1034 | fields.emplace_back("pertainym_id", word1.getId()); | ||
1035 | fields.emplace_back("noun_id", word2.getId()); | ||
2014 | 1036 | ||
2015 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1037 | db_.insertIntoTable("pertainymy", std::move(fields)); |
1038 | } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb) | ||
2016 | { | 1039 | { |
2017 | db_error(ppdb, query); | 1040 | std::list<field> fields; |
2018 | } | 1041 | fields.emplace_back("mannernym_id", word1.getId()); |
1042 | fields.emplace_back("adjective_id", word2.getId()); | ||
2019 | 1043 | ||
2020 | sqlite3_finalize(ppstmt); | 1044 | db_.insertIntoTable("mannernymy", std::move(fields)); |
1045 | } | ||
2021 | } | 1046 | } |
2022 | } | 1047 | } |
2023 | } | 1048 | } |
2024 | } | ||
2025 | |||
2026 | // per table | ||
2027 | { | ||
2028 | std::ifstream wnperfile(wnpref + "wn_per.pl"); | ||
2029 | if (!wnperfile.is_open()) | ||
2030 | { | ||
2031 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2032 | print_usage(); | ||
2033 | } | ||
2034 | |||
2035 | std::list<std::string> lines; | ||
2036 | for (;;) | ||
2037 | { | ||
2038 | std::string line; | ||
2039 | if (!getline(wnperfile, line)) | ||
2040 | { | ||
2041 | break; | ||
2042 | } | ||
2043 | 1049 | ||
2044 | if (line.back() == '\r') | 1050 | void generator::readWordNetSpecification() |
1051 | { | ||
1052 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl")); | ||
1053 | progress ppgs("Writing specifications...", lines.size()); | ||
1054 | for (auto line : lines) | ||
2045 | { | 1055 | { |
2046 | line.pop_back(); | 1056 | ppgs.update(); |
1057 | |||
1058 | std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\."); | ||
1059 | std::smatch relation_data; | ||
1060 | if (!std::regex_search(line, relation_data, relation)) | ||
1061 | { | ||
1062 | continue; | ||
1063 | } | ||
1064 | |||
1065 | std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); | ||
1066 | std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); | ||
1067 | |||
1068 | if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2)) | ||
1069 | { | ||
1070 | word& word1 = *wordByWnidAndWnum_.at(lookup1); | ||
1071 | word& word2 = *wordByWnidAndWnum_.at(lookup2); | ||
1072 | |||
1073 | std::list<field> fields; | ||
1074 | fields.emplace_back("general_id", word1.getId()); | ||
1075 | fields.emplace_back("specific_id", word2.getId()); | ||
1076 | |||
1077 | db_.insertIntoTable("specification", std::move(fields)); | ||
1078 | } | ||
2047 | } | 1079 | } |
2048 | |||
2049 | lines.push_back(line); | ||
2050 | } | 1080 | } |
2051 | 1081 | ||
2052 | progress ppgs("Writing pertainyms and mannernyms...", lines.size()); | 1082 | void generator::readWordNetSimilarity() |
2053 | for (auto line : lines) | ||
2054 | { | 1083 | { |
2055 | ppgs.update(); | 1084 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl")); |
2056 | 1085 | progress ppgs("Writing adjective similarity...", lines.size()); | |
2057 | std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); | 1086 | for (auto line : lines) |
2058 | std::smatch relation_data; | ||
2059 | if (!std::regex_search(line, relation_data, relation)) | ||
2060 | { | 1087 | { |
2061 | continue; | 1088 | ppgs.update(); |
2062 | } | ||
2063 | 1089 | ||
2064 | int synset_id_1 = stoi(relation_data[1]); | 1090 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); |
2065 | int wnum_1 = stoi(relation_data[2]); | 1091 | std::smatch relation_data; |
2066 | int synset_id_2 = stoi(relation_data[3]); | 1092 | if (!std::regex_search(line, relation_data, relation)) |
2067 | int wnum_2 = stoi(relation_data[4]); | ||
2068 | std::string query; | ||
2069 | switch (synset_id_1 / 100000000) | ||
2070 | { | ||
2071 | case 3: // Adjective | ||
2072 | { | 1093 | { |
2073 | // This is a pertainym, the second word should be a noun | 1094 | continue; |
2074 | // Technically it can be an adjective but we're ignoring that | ||
2075 | if (synset_id_2 / 100000000 != 1) | ||
2076 | { | ||
2077 | continue; | ||
2078 | } | ||
2079 | |||
2080 | query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)"; | ||
2081 | |||
2082 | break; | ||
2083 | } | 1095 | } |
1096 | |||
1097 | int lookup1 = std::stoi(relation_data[1]); | ||
1098 | int lookup2 = std::stoi(relation_data[2]); | ||
2084 | 1099 | ||
2085 | case 4: // Adverb | 1100 | if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2)) |
2086 | { | 1101 | { |
2087 | // This is a mannernym, the second word should be an adjective | 1102 | notion& notion1 = *notionByWnid_.at(lookup1); |
2088 | if (synset_id_2 / 100000000 != 3) | 1103 | notion& notion2 = *notionByWnid_.at(lookup2); |
2089 | { | ||
2090 | continue; | ||
2091 | } | ||
2092 | 1104 | ||
2093 | query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; | 1105 | std::list<field> fields; |
1106 | fields.emplace_back("adjective_1_id", notion1.getId()); | ||
1107 | fields.emplace_back("adjective_2_id", notion2.getId()); | ||
2094 | 1108 | ||
2095 | break; | 1109 | db_.insertIntoTable("similarity", std::move(fields)); |
2096 | } | 1110 | } |
2097 | } | 1111 | } |
2098 | 1112 | } | |
2099 | sqlite3_stmt* ppstmt; | ||
2100 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
2101 | { | ||
2102 | db_error(ppdb, query); | ||
2103 | } | ||
2104 | |||
2105 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | ||
2106 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | ||
2107 | 1113 | ||
2108 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1114 | std::list<std::string> generator::readFile(std::string path) |
1115 | { | ||
1116 | std::ifstream file(path); | ||
1117 | if (!file) | ||
2109 | { | 1118 | { |
2110 | db_error(ppdb, query); | 1119 | throw std::invalid_argument("Could not find file " + path); |
2111 | } | 1120 | } |
2112 | |||
2113 | sqlite3_finalize(ppstmt); | ||
2114 | } | ||
2115 | } | ||
2116 | 1121 | ||
2117 | // sa table | 1122 | std::list<std::string> lines; |
2118 | { | ||
2119 | std::ifstream wnsafile(wnpref + "wn_sa.pl"); | ||
2120 | if (!wnsafile.is_open()) | ||
2121 | { | ||
2122 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2123 | print_usage(); | ||
2124 | } | ||
2125 | |||
2126 | std::list<std::string> lines; | ||
2127 | for (;;) | ||
2128 | { | ||
2129 | std::string line; | 1123 | std::string line; |
2130 | if (!getline(wnsafile, line)) | 1124 | while (std::getline(file, line)) |
2131 | { | ||
2132 | break; | ||
2133 | } | ||
2134 | |||
2135 | if (line.back() == '\r') | ||
2136 | { | 1125 | { |
2137 | line.pop_back(); | 1126 | if (line.back() == '\r') |
1127 | { | ||
1128 | line.pop_back(); | ||
1129 | } | ||
1130 | |||
1131 | lines.push_back(line); | ||
2138 | } | 1132 | } |
2139 | 1133 | ||
2140 | lines.push_back(line); | 1134 | return lines; |
2141 | } | 1135 | } |
2142 | 1136 | ||
2143 | progress ppgs("Writing specifications...", lines.size()); | 1137 | part_of_speech generator::partOfSpeechByWnid(int wnid) |
2144 | for (auto line : lines) | ||
2145 | { | 1138 | { |
2146 | ppgs.update(); | 1139 | switch (wnid / 100000000) |
2147 | |||
2148 | std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\."); | ||
2149 | std::smatch relation_data; | ||
2150 | if (!std::regex_search(line, relation_data, relation)) | ||
2151 | { | ||
2152 | continue; | ||
2153 | } | ||
2154 | |||
2155 | int synset_id_1 = stoi(relation_data[1]); | ||
2156 | int wnum_1 = stoi(relation_data[2]); | ||
2157 | int synset_id_2 = stoi(relation_data[3]); | ||
2158 | int wnum_2 = stoi(relation_data[4]); | ||
2159 | std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)"); | ||
2160 | |||
2161 | sqlite3_stmt* ppstmt; | ||
2162 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) | ||
2163 | { | 1140 | { |
2164 | db_error(ppdb, query); | 1141 | case 1: return part_of_speech::noun; |
1142 | case 2: return part_of_speech::verb; | ||
1143 | case 3: return part_of_speech::adjective; | ||
1144 | case 4: return part_of_speech::adverb; | ||
1145 | default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid)); | ||
2165 | } | 1146 | } |
1147 | } | ||
2166 | 1148 | ||
2167 | sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); | 1149 | notion& generator::createNotion(part_of_speech partOfSpeech) |
2168 | sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); | 1150 | { |
1151 | notions_.emplace_back(partOfSpeech); | ||
1152 | |||
1153 | return notions_.back(); | ||
1154 | } | ||
2169 | 1155 | ||
2170 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1156 | notion& generator::lookupOrCreateNotion(int wnid) |
1157 | { | ||
1158 | if (!notionByWnid_.count(wnid)) | ||
2171 | { | 1159 | { |
2172 | db_error(ppdb, query); | 1160 | notions_.emplace_back(partOfSpeechByWnid(wnid), wnid); |
1161 | notionByWnid_[wnid] = ¬ions_.back(); | ||
2173 | } | 1162 | } |
2174 | 1163 | ||
2175 | sqlite3_finalize(ppstmt); | 1164 | return *notionByWnid_.at(wnid); |
2176 | } | ||
2177 | } | ||
2178 | |||
2179 | // sim table | ||
2180 | { | ||
2181 | std::ifstream wnsimfile(wnpref + "wn_sim.pl"); | ||
2182 | if (!wnsimfile.is_open()) | ||
2183 | { | ||
2184 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2185 | print_usage(); | ||
2186 | } | 1165 | } |
2187 | 1166 | ||
2188 | std::list<std::string> lines; | 1167 | lemma& generator::lookupOrCreateLemma(std::string base_form) |
2189 | for (;;) | ||
2190 | { | 1168 | { |
2191 | std::string line; | 1169 | if (!lemmaByBaseForm_.count(base_form)) |
2192 | if (!getline(wnsimfile, line)) | ||
2193 | { | 1170 | { |
2194 | break; | 1171 | lemmas_.emplace_back(lookupOrCreateForm(base_form)); |
1172 | lemmaByBaseForm_[base_form] = &lemmas_.back(); | ||
2195 | } | 1173 | } |
1174 | |||
1175 | return *lemmaByBaseForm_.at(base_form); | ||
1176 | } | ||
2196 | 1177 | ||
2197 | if (line.back() == '\r') | 1178 | form& generator::lookupOrCreateForm(std::string text) |
1179 | { | ||
1180 | if (!formByText_.count(text)) | ||
2198 | { | 1181 | { |
2199 | line.pop_back(); | 1182 | forms_.emplace_back(text); |
1183 | formByText_[text] = &forms_.back(); | ||
2200 | } | 1184 | } |
2201 | 1185 | ||
2202 | lines.push_back(line); | 1186 | return *formByText_[text]; |
2203 | } | 1187 | } |
2204 | 1188 | ||
2205 | progress ppgs("Writing sense synonyms...", lines.size()); | 1189 | template <typename... Args> word& generator::createWord(Args&&... args) |
2206 | for (auto line : lines) | ||
2207 | { | 1190 | { |
2208 | ppgs.update(); | 1191 | words_.emplace_back(std::forward<Args>(args)...); |
1192 | word& w = words_.back(); | ||
2209 | 1193 | ||
2210 | std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); | 1194 | wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w); |
2211 | std::smatch relation_data; | 1195 | |
2212 | if (!std::regex_search(line, relation_data, relation)) | 1196 | if (w.getNotion().hasWnid()) |
2213 | { | 1197 | { |
2214 | continue; | 1198 | wordsByWnid_[w.getNotion().getWnid()].insert(&w); |
2215 | } | 1199 | } |
2216 | 1200 | ||
2217 | int synset_id_1 = stoi(relation_data[1]); | 1201 | return w; |
2218 | int synset_id_2 = stoi(relation_data[2]); | 1202 | } |
2219 | std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); | 1203 | |
1204 | group& generator::createGroup(xmlNodePtr top) | ||
1205 | { | ||
1206 | groups_.emplace_back(); | ||
1207 | group& grp = groups_.back(); | ||
2220 | 1208 | ||
2221 | for (auto mapping1 : wn[synset_id_1]) | 1209 | xmlChar* key; |
1210 | |||
1211 | for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next) | ||
2222 | { | 1212 | { |
2223 | for (auto mapping2 : wn[synset_id_2]) | 1213 | if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES"))) |
2224 | { | 1214 | { |
2225 | sqlite3_stmt* ppstmt; | 1215 | for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next) |
2226 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
2227 | { | 1216 | { |
2228 | db_error(ppdb, query); | 1217 | if (!xmlStrcmp(subclass->name, reinterpret_cast<const xmlChar*>("VNSUBCLASS"))) |
1218 | { | ||
1219 | try | ||
1220 | { | ||
1221 | group& subgrp = createGroup(subclass); | ||
1222 | subgrp.setParent(grp); | ||
1223 | } catch (const std::exception& e) | ||
1224 | { | ||
1225 | key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID")); | ||
1226 | |||
1227 | if (key == nullptr) | ||
1228 | { | ||
1229 | std::throw_with_nested(std::logic_error("Error parsing IDless subgroup")); | ||
1230 | } else { | ||
1231 | std::string subgroupId(reinterpret_cast<const char*>(key)); | ||
1232 | xmlFree(key); | ||
1233 | |||
1234 | std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId)); | ||
1235 | } | ||
1236 | } | ||
1237 | } | ||
2229 | } | 1238 | } |
2230 | 1239 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("MEMBERS"))) | |
2231 | sqlite3_bind_int(ppstmt, 1, mapping1.second); | 1240 | { |
2232 | sqlite3_bind_int(ppstmt, 2, mapping2.second); | 1241 | for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) |
2233 | |||
2234 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
2235 | { | 1242 | { |
2236 | db_error(ppdb, query); | 1243 | if (!xmlStrcmp(member->name, reinterpret_cast<const xmlChar*>("MEMBER"))) |
1244 | { | ||
1245 | key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn")); | ||
1246 | std::string wnSenses(reinterpret_cast<const char*>(key)); | ||
1247 | xmlFree(key); | ||
1248 | |||
1249 | auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " "); | ||
1250 | if (!wnSenseKeys.empty()) | ||
1251 | { | ||
1252 | std::list<std::string> tempKeys; | ||
1253 | |||
1254 | std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) { | ||
1255 | return sense + "::"; | ||
1256 | }); | ||
1257 | |||
1258 | std::list<std::string> filteredKeys; | ||
1259 | |||
1260 | std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) { | ||
1261 | return !wnSenseKeys_.count(sense); | ||
1262 | }); | ||
1263 | |||
1264 | wnSenseKeys = std::move(filteredKeys); | ||
1265 | } | ||
1266 | |||
1267 | if (!wnSenseKeys.empty()) | ||
1268 | { | ||
1269 | for (std::string sense : wnSenseKeys) | ||
1270 | { | ||
1271 | word& wordSense = *wnSenseKeys_[sense]; | ||
1272 | wordSense.setVerbGroup(grp); | ||
1273 | } | ||
1274 | } else { | ||
1275 | key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name")); | ||
1276 | std::string memberName(reinterpret_cast<const char*>(key)); | ||
1277 | xmlFree(key); | ||
1278 | |||
1279 | notion& n = createNotion(part_of_speech::verb); | ||
1280 | lemma& l = lookupOrCreateLemma(memberName); | ||
1281 | word& w = createWord(n, l); | ||
1282 | |||
1283 | w.setVerbGroup(grp); | ||
1284 | } | ||
1285 | } | ||
2237 | } | 1286 | } |
2238 | 1287 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("THEMROLES"))) | |
2239 | sqlite3_reset(ppstmt); | 1288 | { |
2240 | sqlite3_clear_bindings(ppstmt); | 1289 | for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next) |
2241 | |||
2242 | sqlite3_bind_int(ppstmt, 1, mapping2.second); | ||
2243 | sqlite3_bind_int(ppstmt, 2, mapping1.second); | ||
2244 | |||
2245 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | ||
2246 | { | 1290 | { |
2247 | db_error(ppdb, query); | 1291 | if (!xmlStrcmp(roletopnode->name, reinterpret_cast<const xmlChar*>("THEMROLE"))) |
1292 | { | ||
1293 | role r; | ||
1294 | |||
1295 | key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type")); | ||
1296 | std::string roleName = reinterpret_cast<const char*>(key); | ||
1297 | xmlFree(key); | ||
1298 | |||
1299 | for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next) | ||
1300 | { | ||
1301 | if (!xmlStrcmp(rolenode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1302 | { | ||
1303 | r.setSelrestrs(parseSelrestr(rolenode)); | ||
1304 | } | ||
1305 | } | ||
1306 | |||
1307 | grp.addRole(roleName, std::move(r)); | ||
1308 | } | ||
2248 | } | 1309 | } |
1310 | } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("FRAMES"))) | ||
1311 | { | ||
1312 | for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next) | ||
1313 | { | ||
1314 | if (!xmlStrcmp(frametopnode->name, reinterpret_cast<const xmlChar*>("FRAME"))) | ||
1315 | { | ||
1316 | frames_.emplace_back(); | ||
1317 | frame& fr = frames_.back(); | ||
2249 | 1318 | ||
2250 | sqlite3_finalize(ppstmt); | 1319 | for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) |
1320 | { | ||
1321 | if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX"))) | ||
1322 | { | ||
1323 | for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next) | ||
1324 | { | ||
1325 | if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP"))) | ||
1326 | { | ||
1327 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1328 | std::string partRole = reinterpret_cast<const char*>(key); | ||
1329 | xmlFree(key); | ||
1330 | |||
1331 | selrestr partSelrestrs; | ||
1332 | std::set<std::string> partSynrestrs; | ||
1333 | |||
1334 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
1335 | { | ||
1336 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SYNRESTRS"))) | ||
1337 | { | ||
1338 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
1339 | { | ||
1340 | if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SYNRESTR"))) | ||
1341 | { | ||
1342 | key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type")); | ||
1343 | partSynrestrs.insert(reinterpret_cast<const char*>(key)); | ||
1344 | xmlFree(key); | ||
1345 | } | ||
1346 | } | ||
1347 | } | ||
1348 | |||
1349 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1350 | { | ||
1351 | partSelrestrs = parseSelrestr(npnode); | ||
1352 | } | ||
1353 | } | ||
1354 | |||
1355 | fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs))); | ||
1356 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB"))) | ||
1357 | { | ||
1358 | fr.push_back(part::createVerb()); | ||
1359 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("PREP"))) | ||
1360 | { | ||
1361 | std::set<std::string> partChoices; | ||
1362 | bool partLiteral; | ||
1363 | |||
1364 | if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"))) | ||
1365 | { | ||
1366 | partLiteral = true; | ||
1367 | |||
1368 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1369 | std::string choicesStr = reinterpret_cast<const char*>(key); | ||
1370 | xmlFree(key); | ||
1371 | |||
1372 | split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices))); | ||
1373 | } else { | ||
1374 | partLiteral = false; | ||
1375 | |||
1376 | for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next) | ||
1377 | { | ||
1378 | if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) | ||
1379 | { | ||
1380 | for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next) | ||
1381 | { | ||
1382 | if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
1383 | { | ||
1384 | key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type")); | ||
1385 | partChoices.insert(reinterpret_cast<const char*>(key)); | ||
1386 | xmlFree(key); | ||
1387 | } | ||
1388 | } | ||
1389 | } | ||
1390 | } | ||
1391 | } | ||
1392 | |||
1393 | fr.push_back(part::createPreposition(std::move(partChoices), partLiteral)); | ||
1394 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ"))) | ||
1395 | { | ||
1396 | fr.push_back(part::createAdjective()); | ||
1397 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADV"))) | ||
1398 | { | ||
1399 | fr.push_back(part::createAdverb()); | ||
1400 | } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("LEX"))) | ||
1401 | { | ||
1402 | key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")); | ||
1403 | std::string literalValue = reinterpret_cast<const char*>(key); | ||
1404 | xmlFree(key); | ||
1405 | |||
1406 | fr.push_back(part::createLiteral(literalValue)); | ||
1407 | } else { | ||
1408 | continue; | ||
1409 | } | ||
1410 | } | ||
1411 | |||
1412 | grp.addFrame(fr); | ||
1413 | } | ||
1414 | } | ||
1415 | } | ||
1416 | } | ||
2251 | } | 1417 | } |
2252 | } | 1418 | } |
2253 | } | ||
2254 | } | ||
2255 | |||
2256 | // syntax table | ||
2257 | { | ||
2258 | std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl"); | ||
2259 | if (!wnsyntaxfile.is_open()) | ||
2260 | { | ||
2261 | std::cout << "Invalid WordNet data directory." << std::endl; | ||
2262 | print_usage(); | ||
2263 | } | ||
2264 | 1419 | ||
2265 | std::list<std::string> lines; | 1420 | return grp; |
2266 | for (;;) | ||
2267 | { | ||
2268 | std::string line; | ||
2269 | if (!getline(wnsyntaxfile, line)) | ||
2270 | { | ||
2271 | break; | ||
2272 | } | ||
2273 | |||
2274 | if (line.back() == '\r') | ||
2275 | { | ||
2276 | line.pop_back(); | ||
2277 | } | ||
2278 | |||
2279 | lines.push_back(line); | ||
2280 | } | 1421 | } |
2281 | 1422 | ||
2282 | progress ppgs("Writing adjective syntax markers...", lines.size()); | 1423 | selrestr generator::parseSelrestr(xmlNodePtr top) |
2283 | for (auto line : lines) | ||
2284 | { | 1424 | { |
2285 | ppgs.update(); | 1425 | xmlChar* key; |
2286 | 1426 | ||
2287 | std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); | 1427 | if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))) |
2288 | std::smatch relation_data; | ||
2289 | if (!std::regex_search(line, relation_data, relation)) | ||
2290 | { | ||
2291 | continue; | ||
2292 | } | ||
2293 | |||
2294 | int synset_id = stoi(relation_data[1]); | ||
2295 | int wnum = stoi(relation_data[2]); | ||
2296 | std::string syn = relation_data[3]; | ||
2297 | std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?"); | ||
2298 | |||
2299 | sqlite3_stmt* ppstmt; | ||
2300 | if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) | ||
2301 | { | 1428 | { |
2302 | db_error(ppdb, query); | 1429 | if (xmlChildElementCount(top) == 0) |
2303 | } | 1430 | { |
2304 | 1431 | return {}; | |
2305 | sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); | 1432 | } else if (xmlChildElementCount(top) == 1) |
2306 | sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); | 1433 | { |
2307 | 1434 | return parseSelrestr(xmlFirstElementChild(top)); | |
2308 | if (sqlite3_step(ppstmt) != SQLITE_DONE) | 1435 | } else { |
1436 | bool orlogic = false; | ||
1437 | if (xmlHasProp(top, reinterpret_cast<const xmlChar*>("logic"))) | ||
1438 | { | ||
1439 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("logic")); | ||
1440 | if (!xmlStrcmp(key, reinterpret_cast<const xmlChar*>("or"))) | ||
1441 | { | ||
1442 | orlogic = true; | ||
1443 | } | ||
1444 | |||
1445 | xmlFree(key); | ||
1446 | } | ||
1447 | |||
1448 | std::list<selrestr> children; | ||
1449 | for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) | ||
1450 | { | ||
1451 | if (!xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTRS")) | ||
1452 | || !xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
1453 | { | ||
1454 | children.push_back(parseSelrestr(selrestr)); | ||
1455 | } | ||
1456 | } | ||
1457 | |||
1458 | return selrestr(children, orlogic); | ||
1459 | } | ||
1460 | } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR"))) | ||
2309 | { | 1461 | { |
2310 | db_error(ppdb, query); | 1462 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("Value")); |
1463 | bool selPos = (std::string(reinterpret_cast<const char*>(key)) == "+"); | ||
1464 | xmlFree(key); | ||
1465 | |||
1466 | key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type")); | ||
1467 | std::string selRestriction = reinterpret_cast<const char*>(key); | ||
1468 | xmlFree(key); | ||
1469 | |||
1470 | return selrestr(selRestriction, selPos); | ||
1471 | } else { | ||
1472 | throw std::logic_error("Badly formatted selrestr"); | ||
2311 | } | 1473 | } |
2312 | |||
2313 | sqlite3_finalize(ppstmt); | ||
2314 | } | 1474 | } |
2315 | } | 1475 | |
2316 | 1476 | }; | |
2317 | sqlite3_close_v2(ppdb); | 1477 | }; |
2318 | |||
2319 | std::cout << "Done." << std::endl; | ||
2320 | } | ||
diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..e2a7404 --- /dev/null +++ b/generator/generator.h | |||
@@ -0,0 +1,151 @@ | |||
1 | #ifndef GENERATOR_H_5B61CBC5 | ||
2 | #define GENERATOR_H_5B61CBC5 | ||
3 | |||
4 | #include <string> | ||
5 | #include <map> | ||
6 | #include <list> | ||
7 | #include <set> | ||
8 | #include <libxml/parser.h> | ||
9 | #include "database.h" | ||
10 | #include "notion.h" | ||
11 | #include "word.h" | ||
12 | #include "lemma.h" | ||
13 | #include "form.h" | ||
14 | #include "pronunciation.h" | ||
15 | #include "group.h" | ||
16 | #include "frame.h" | ||
17 | |||
18 | namespace verbly { | ||
19 | namespace generator { | ||
20 | |||
21 | enum class part_of_speech; | ||
22 | class selrestr; | ||
23 | |||
24 | class generator { | ||
25 | public: | ||
26 | |||
27 | // Constructor | ||
28 | |||
29 | generator( | ||
30 | std::string verbNetPath, | ||
31 | std::string agidPath, | ||
32 | std::string wordNetPath, | ||
33 | std::string cmudictPath, | ||
34 | std::string imageNetPath, | ||
35 | std::string outputPath); | ||
36 | |||
37 | // Action | ||
38 | |||
39 | void run(); | ||
40 | |||
41 | private: | ||
42 | |||
43 | // Subroutines | ||
44 | |||
45 | void readWordNetSynsets(); | ||
46 | |||
47 | void readAdjectivePositioning(); | ||
48 | |||
49 | void readImageNetUrls(); | ||
50 | |||
51 | void readWordNetSenseKeys(); | ||
52 | |||
53 | void readVerbNet(); | ||
54 | |||
55 | void readAgidInflections(); | ||
56 | |||
57 | void readPrepositions(); | ||
58 | |||
59 | void readCmudictPronunciations(); | ||
60 | |||
61 | void writeSchema(); | ||
62 | |||
63 | void dumpObjects(); | ||
64 | |||
65 | void readWordNetAntonymy(); | ||
66 | |||
67 | void readWordNetVariation(); | ||
68 | |||
69 | void readWordNetClasses(); | ||
70 | |||
71 | void readWordNetCausality(); | ||
72 | |||
73 | void readWordNetEntailment(); | ||
74 | |||
75 | void readWordNetHypernymy(); | ||
76 | |||
77 | void readWordNetInstantiation(); | ||
78 | |||
79 | void readWordNetMemberMeronymy(); | ||
80 | |||
81 | void readWordNetPartMeronymy(); | ||
82 | |||
83 | void readWordNetSubstanceMeronymy(); | ||
84 | |||
85 | void readWordNetPertainymy(); | ||
86 | |||
87 | void readWordNetSpecification(); | ||
88 | |||
89 | void readWordNetSimilarity(); | ||
90 | |||
91 | // Helpers | ||
92 | |||
93 | std::list<std::string> readFile(std::string path); | ||
94 | |||
95 | inline part_of_speech partOfSpeechByWnid(int wnid); | ||
96 | |||
97 | notion& createNotion(part_of_speech partOfSpeech); | ||
98 | |||
99 | notion& lookupOrCreateNotion(int wnid); | ||
100 | |||
101 | lemma& lookupOrCreateLemma(std::string base_form); | ||
102 | |||
103 | form& lookupOrCreateForm(std::string text); | ||
104 | |||
105 | template <typename... Args> word& createWord(Args&&... args); | ||
106 | |||
107 | group& createGroup(xmlNodePtr top); | ||
108 | |||
109 | selrestr parseSelrestr(xmlNodePtr top); | ||
110 | |||
111 | // Input | ||
112 | |||
113 | std::string verbNetPath_; | ||
114 | std::string agidPath_; | ||
115 | std::string wordNetPath_; | ||
116 | std::string cmudictPath_; | ||
117 | std::string imageNetPath_; | ||
118 | |||
119 | // Output | ||
120 | |||
121 | database db_; | ||
122 | |||
123 | // Data | ||
124 | |||
125 | std::list<notion> notions_; | ||
126 | std::list<word> words_; | ||
127 | std::list<lemma> lemmas_; | ||
128 | std::list<form> forms_; | ||
129 | std::list<pronunciation> pronunciations_; | ||
130 | std::list<frame> frames_; | ||
131 | std::list<group> groups_; | ||
132 | |||
133 | // Indexes | ||
134 | |||
135 | std::map<int, notion*> notionByWnid_; | ||
136 | std::map<int, std::set<word*>> wordsByWnid_; | ||
137 | std::map<std::pair<int, int>, word*> wordByWnidAndWnum_; | ||
138 | std::map<std::string, std::set<word*>> wordsByBaseForm_; | ||
139 | std::map<std::string, lemma*> lemmaByBaseForm_; | ||
140 | std::map<std::string, form*> formByText_; | ||
141 | |||
142 | // Caches | ||
143 | |||
144 | std::map<std::string, word*> wnSenseKeys_; | ||
145 | |||
146 | }; | ||
147 | |||
148 | }; | ||
149 | }; | ||
150 | |||
151 | #endif /* end of include guard: GENERATOR_H_5B61CBC5 */ | ||
diff --git a/generator/group.cpp b/generator/group.cpp new file mode 100644 index 0000000..7cbd4c8 --- /dev/null +++ b/generator/group.cpp | |||
@@ -0,0 +1,119 @@ | |||
1 | #include "group.h" | ||
2 | #include <stdexcept> | ||
3 | #include <list> | ||
4 | #include <json.hpp> | ||
5 | #include "database.h" | ||
6 | #include "field.h" | ||
7 | #include "frame.h" | ||
8 | |||
9 | namespace verbly { | ||
10 | namespace generator { | ||
11 | |||
12 | int group::nextId_ = 0; | ||
13 | |||
14 | group::group() : id_(nextId_++) | ||
15 | { | ||
16 | } | ||
17 | |||
18 | void group::setParent(const group& parent) | ||
19 | { | ||
20 | // Adding a group to itself is nonsensical. | ||
21 | assert(&parent != this); | ||
22 | |||
23 | parent_ = &parent; | ||
24 | } | ||
25 | |||
26 | void group::addRole(std::string name, role r) | ||
27 | { | ||
28 | roleNames_.insert(name); | ||
29 | roles_[name] = std::move(r); | ||
30 | } | ||
31 | |||
32 | void group::addFrame(const frame& f) | ||
33 | { | ||
34 | frames_.insert(&f); | ||
35 | } | ||
36 | |||
37 | std::set<std::string> group::getRoles() const | ||
38 | { | ||
39 | std::set<std::string> fullRoles = roleNames_; | ||
40 | |||
41 | if (hasParent()) | ||
42 | { | ||
43 | for (std::string name : getParent().getRoles()) | ||
44 | { | ||
45 | fullRoles.insert(name); | ||
46 | } | ||
47 | } | ||
48 | |||
49 | return fullRoles; | ||
50 | } | ||
51 | |||
52 | const role& group::getRole(std::string name) const | ||
53 | { | ||
54 | if (roles_.count(name)) | ||
55 | { | ||
56 | return roles_.at(name); | ||
57 | } else if (hasParent()) | ||
58 | { | ||
59 | return getParent().getRole(name); | ||
60 | } else { | ||
61 | throw std::invalid_argument("Specified role not found in verb group"); | ||
62 | } | ||
63 | } | ||
64 | |||
65 | std::set<const frame*> group::getFrames() const | ||
66 | { | ||
67 | std::set<const frame*> fullFrames = frames_; | ||
68 | |||
69 | if (hasParent()) | ||
70 | { | ||
71 | for (const frame* f : getParent().getFrames()) | ||
72 | { | ||
73 | fullFrames.insert(f); | ||
74 | } | ||
75 | } | ||
76 | |||
77 | return fullFrames; | ||
78 | } | ||
79 | |||
80 | database& operator<<(database& db, const group& arg) | ||
81 | { | ||
82 | // Serialize the group first | ||
83 | { | ||
84 | std::list<field> fields; | ||
85 | fields.emplace_back("group_id", arg.getId()); | ||
86 | |||
87 | nlohmann::json jsonRoles; | ||
88 | for (std::string name : arg.getRoles()) | ||
89 | { | ||
90 | const role& r = arg.getRole(name); | ||
91 | |||
92 | nlohmann::json jsonRole; | ||
93 | jsonRole["type"] = name; | ||
94 | jsonRole["selrestrs"] = r.getSelrestrs().toJson(); | ||
95 | |||
96 | jsonRoles.emplace_back(std::move(jsonRole)); | ||
97 | } | ||
98 | |||
99 | fields.emplace_back("data", jsonRoles.dump()); | ||
100 | |||
101 | db.insertIntoTable("groups", std::move(fields)); | ||
102 | } | ||
103 | |||
104 | // Then, serialize the group/frame relationship | ||
105 | for (const frame* f : arg.getFrames()) | ||
106 | { | ||
107 | std::list<field> fields; | ||
108 | |||
109 | fields.emplace_back("group_id", arg.getId()); | ||
110 | fields.emplace_back("frame_id", f->getId()); | ||
111 | |||
112 | db.insertIntoTable("groups_frames", std::move(fields)); | ||
113 | } | ||
114 | |||
115 | return db; | ||
116 | } | ||
117 | |||
118 | }; | ||
119 | }; | ||
diff --git a/generator/group.h b/generator/group.h new file mode 100644 index 0000000..efb8c5d --- /dev/null +++ b/generator/group.h | |||
@@ -0,0 +1,80 @@ | |||
1 | #ifndef GROUP_H_EDAFB5DC | ||
2 | #define GROUP_H_EDAFB5DC | ||
3 | |||
4 | #include <map> | ||
5 | #include <set> | ||
6 | #include <string> | ||
7 | #include <cassert> | ||
8 | #include "role.h" | ||
9 | |||
10 | namespace verbly { | ||
11 | namespace generator { | ||
12 | |||
13 | class frame; | ||
14 | class database; | ||
15 | |||
16 | class group { | ||
17 | public: | ||
18 | |||
19 | // Constructor | ||
20 | |||
21 | group(); | ||
22 | |||
23 | // Mutators | ||
24 | |||
25 | void setParent(const group& parent); | ||
26 | |||
27 | void addRole(std::string name, role r); | ||
28 | |||
29 | void addFrame(const frame& f); | ||
30 | |||
31 | // Accessors | ||
32 | |||
33 | int getId() const | ||
34 | { | ||
35 | return id_; | ||
36 | } | ||
37 | |||
38 | bool hasParent() const | ||
39 | { | ||
40 | return (parent_ != nullptr); | ||
41 | } | ||
42 | |||
43 | const group& getParent() const | ||
44 | { | ||
45 | // Calling code should always call hasParent first | ||
46 | assert(parent_ != nullptr); | ||
47 | |||
48 | return *parent_; | ||
49 | } | ||
50 | |||
51 | std::set<std::string> getRoles() const; | ||
52 | |||
53 | const role& getRole(std::string name) const; | ||
54 | |||
55 | std::set<const frame*> getFrames() const; | ||
56 | |||
57 | private: | ||
58 | |||
59 | static int nextId_; | ||
60 | |||
61 | const int id_; | ||
62 | |||
63 | const group* parent_ = nullptr; | ||
64 | std::map<std::string, role> roles_; | ||
65 | std::set<const frame*> frames_; | ||
66 | |||
67 | // Caches | ||
68 | |||
69 | std::set<std::string> roleNames_; | ||
70 | |||
71 | }; | ||
72 | |||
73 | // Serializer | ||
74 | |||
75 | database& operator<<(database& db, const group& arg); | ||
76 | |||
77 | }; | ||
78 | }; | ||
79 | |||
80 | #endif /* end of include guard: GROUP_H_EDAFB5DC */ | ||
diff --git a/generator/lemma.cpp b/generator/lemma.cpp new file mode 100644 index 0000000..e66b153 --- /dev/null +++ b/generator/lemma.cpp | |||
@@ -0,0 +1,65 @@ | |||
1 | #include "lemma.h" | ||
2 | #include <list> | ||
3 | #include <cassert> | ||
4 | #include "field.h" | ||
5 | #include "database.h" | ||
6 | #include "form.h" | ||
7 | |||
8 | namespace verbly { | ||
9 | namespace generator { | ||
10 | |||
11 | int lemma::nextId_ = 0; | ||
12 | |||
13 | lemma::lemma(const form& baseForm) : | ||
14 | id_(nextId_++), | ||
15 | baseForm_(baseForm) | ||
16 | { | ||
17 | inflections_[inflection::base] = {&baseForm}; | ||
18 | } | ||
19 | |||
20 | void lemma::addInflection(inflection type, const form& f) | ||
21 | { | ||
22 | // There can only be one base form. | ||
23 | assert(type != inflection::base); | ||
24 | |||
25 | inflections_[type].insert(&f); | ||
26 | } | ||
27 | |||
28 | std::set<const form*> lemma::getInflections(inflection type) const | ||
29 | { | ||
30 | if (inflections_.count(type)) | ||
31 | { | ||
32 | return inflections_.at(type); | ||
33 | } else { | ||
34 | return {}; | ||
35 | } | ||
36 | } | ||
37 | |||
38 | database& operator<<(database& db, const lemma& arg) | ||
39 | { | ||
40 | for (inflection type : { | ||
41 | inflection::base, | ||
42 | inflection::plural, | ||
43 | inflection::comparative, | ||
44 | inflection::superlative, | ||
45 | inflection::past_tense, | ||
46 | inflection::past_participle, | ||
47 | inflection::ing_form, | ||
48 | inflection::s_form}) | ||
49 | { | ||
50 | for (const form* f : arg.getInflections(type)) | ||
51 | { | ||
52 | std::list<field> fields; | ||
53 | fields.emplace_back("lemma_id", arg.getId()); | ||
54 | fields.emplace_back("form_id", f->getId()); | ||
55 | fields.emplace_back("category", static_cast<int>(type)); | ||
56 | |||
57 | db.insertIntoTable("lemmas_forms", std::move(fields)); | ||
58 | } | ||
59 | } | ||
60 | |||
61 | return db; | ||
62 | } | ||
63 | |||
64 | }; | ||
65 | }; | ||
diff --git a/generator/lemma.h b/generator/lemma.h new file mode 100644 index 0000000..6452e08 --- /dev/null +++ b/generator/lemma.h | |||
@@ -0,0 +1,58 @@ | |||
1 | #ifndef LEMMA_H_D73105A7 | ||
2 | #define LEMMA_H_D73105A7 | ||
3 | |||
4 | #include <string> | ||
5 | #include <map> | ||
6 | #include <set> | ||
7 | #include "enums.h" | ||
8 | |||
9 | namespace verbly { | ||
10 | namespace generator { | ||
11 | |||
12 | class database; | ||
13 | class form; | ||
14 | |||
15 | class lemma { | ||
16 | public: | ||
17 | |||
18 | // Constructors | ||
19 | |||
20 | explicit lemma(const form& baseForm); | ||
21 | |||
22 | // Mutators | ||
23 | |||
24 | void addInflection(inflection type, const form& f); | ||
25 | |||
26 | // Accessors | ||
27 | |||
28 | int getId() const | ||
29 | { | ||
30 | return id_; | ||
31 | } | ||
32 | |||
33 | const form& getBaseForm() const | ||
34 | { | ||
35 | return baseForm_; | ||
36 | } | ||
37 | |||
38 | std::set<const form*> getInflections(inflection type) const; | ||
39 | |||
40 | private: | ||
41 | |||
42 | static int nextId_; | ||
43 | |||
44 | const int id_; | ||
45 | const form& baseForm_; | ||
46 | |||
47 | std::map<inflection, std::set<const form*>> inflections_; | ||
48 | |||
49 | }; | ||
50 | |||
51 | // Serializer | ||
52 | |||
53 | database& operator<<(database& db, const lemma& arg); | ||
54 | |||
55 | }; | ||
56 | }; | ||
57 | |||
58 | #endif /* end of include guard: LEMMA_H_D73105A7 */ | ||
diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..827c963 --- /dev/null +++ b/generator/main.cpp | |||
@@ -0,0 +1,40 @@ | |||
1 | #include <iostream> | ||
2 | #include <exception> | ||
3 | #include "generator.h" | ||
4 | |||
5 | void printUsage() | ||
6 | { | ||
7 | std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output" << std::endl; | ||
8 | std::cout << "verbnet :: path to a VerbNet data directory" << std::endl; | ||
9 | std::cout << "agid :: path to an AGID infl.txt file" << std::endl; | ||
10 | std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; | ||
11 | std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; | ||
12 | std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl; | ||
13 | std::cout << "output :: datafile output path" << std::endl; | ||
14 | } | ||
15 | |||
16 | int main(int argc, char** argv) | ||
17 | { | ||
18 | if (argc == 7) | ||
19 | { | ||
20 | try | ||
21 | { | ||
22 | verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); | ||
23 | |||
24 | try | ||
25 | { | ||
26 | app.run(); | ||
27 | } catch (const std::exception& e) | ||
28 | { | ||
29 | std::cout << e.what() << std::endl; | ||
30 | } | ||
31 | } catch (const std::exception& e) | ||
32 | { | ||
33 | std::cout << e.what() << std::endl; | ||
34 | printUsage(); | ||
35 | } | ||
36 | } else { | ||
37 | std::cout << "verbly datafile generator" << std::endl; | ||
38 | printUsage(); | ||
39 | } | ||
40 | } | ||
diff --git a/generator/notion.cpp b/generator/notion.cpp new file mode 100644 index 0000000..290d982 --- /dev/null +++ b/generator/notion.cpp | |||
@@ -0,0 +1,85 @@ | |||
1 | #include "notion.h" | ||
2 | #include <string> | ||
3 | #include <list> | ||
4 | #include "database.h" | ||
5 | #include "field.h" | ||
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
10 | int notion::nextId_ = 0; | ||
11 | |||
12 | notion::notion( | ||
13 | part_of_speech partOfSpeech) : | ||
14 | id_(nextId_++), | ||
15 | partOfSpeech_(partOfSpeech) | ||
16 | { | ||
17 | } | ||
18 | |||
19 | notion::notion( | ||
20 | part_of_speech partOfSpeech, | ||
21 | int wnid) : | ||
22 | id_(nextId_++), | ||
23 | partOfSpeech_(partOfSpeech), | ||
24 | wnid_(wnid), | ||
25 | hasWnid_(true) | ||
26 | { | ||
27 | } | ||
28 | |||
29 | void notion::incrementNumOfImages() | ||
30 | { | ||
31 | // Calling code should always call hasWnid and check that the notion is a noun first. | ||
32 | assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun)); | ||
33 | |||
34 | numOfImages_++; | ||
35 | } | ||
36 | |||
37 | void notion::setPrepositionGroups(std::list<std::string> groups) | ||
38 | { | ||
39 | // Calling code should always check that the notion is a preposition first. | ||
40 | assert(partOfSpeech_ == part_of_speech::preposition); | ||
41 | |||
42 | prepositionGroups_ = groups; | ||
43 | } | ||
44 | |||
45 | database& operator<<(database& db, const notion& arg) | ||
46 | { | ||
47 | // First, serialize the notion | ||
48 | { | ||
49 | std::list<field> fields; | ||
50 | |||
51 | fields.emplace_back("notion_id", arg.getId()); | ||
52 | fields.emplace_back("part_of_speech", static_cast<int>(arg.getPartOfSpeech())); | ||
53 | |||
54 | if (arg.hasWnid()) | ||
55 | { | ||
56 | fields.emplace_back("wnid", arg.getWnid()); | ||
57 | |||
58 | if (arg.getPartOfSpeech() == part_of_speech::noun) | ||
59 | { | ||
60 | fields.emplace_back("images", arg.getNumOfImages()); | ||
61 | } | ||
62 | } | ||
63 | |||
64 | db.insertIntoTable("notions", std::move(fields)); | ||
65 | } | ||
66 | |||
67 | // Next, serialize the is_a relationship if this is a preposition | ||
68 | if (arg.getPartOfSpeech() == part_of_speech::preposition) | ||
69 | { | ||
70 | for (std::string group : arg.getPrepositionGroups()) | ||
71 | { | ||
72 | std::list<field> fields; | ||
73 | |||
74 | fields.emplace_back("notion_id", arg.getId()); | ||
75 | fields.emplace_back("groupname", group); | ||
76 | |||
77 | db.insertIntoTable("is_a", std::move(fields)); | ||
78 | } | ||
79 | } | ||
80 | |||
81 | return db; | ||
82 | } | ||
83 | |||
84 | }; | ||
85 | }; | ||
diff --git a/generator/notion.h b/generator/notion.h new file mode 100644 index 0000000..76210de --- /dev/null +++ b/generator/notion.h | |||
@@ -0,0 +1,91 @@ | |||
1 | #ifndef NOTION_H_221DE2BC | ||
2 | #define NOTION_H_221DE2BC | ||
3 | |||
4 | #include <cassert> | ||
5 | #include <list> | ||
6 | #include <string> | ||
7 | #include "enums.h" | ||
8 | |||
9 | namespace verbly { | ||
10 | namespace generator { | ||
11 | |||
12 | class database; | ||
13 | |||
14 | class notion { | ||
15 | public: | ||
16 | |||
17 | // Constructors | ||
18 | |||
19 | explicit notion(part_of_speech partOfSpeech); | ||
20 | |||
21 | notion(part_of_speech partOfSpeech, int wnid); | ||
22 | |||
23 | // Mutators | ||
24 | |||
25 | void incrementNumOfImages(); | ||
26 | |||
27 | void setPrepositionGroups(std::list<std::string> groups); | ||
28 | |||
29 | // Accessors | ||
30 | |||
31 | int getId() const | ||
32 | { | ||
33 | return id_; | ||
34 | } | ||
35 | |||
36 | part_of_speech getPartOfSpeech() const | ||
37 | { | ||
38 | return partOfSpeech_; | ||
39 | } | ||
40 | |||
41 | bool hasWnid() const | ||
42 | { | ||
43 | return hasWnid_; | ||
44 | } | ||
45 | |||
46 | int getWnid() const | ||
47 | { | ||
48 | // Calling code should always call hasWnid first. | ||
49 | assert(hasWnid_); | ||
50 | |||
51 | return wnid_; | ||
52 | } | ||
53 | |||
54 | int getNumOfImages() const | ||
55 | { | ||
56 | // Calling code should always call hasWnid and check that the notion is a noun first. | ||
57 | assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun)); | ||
58 | |||
59 | return numOfImages_; | ||
60 | } | ||
61 | |||
62 | std::list<std::string> getPrepositionGroups() const | ||
63 | { | ||
64 | // Calling code should always check that the notion is a preposition first. | ||
65 | assert(partOfSpeech_ == part_of_speech::preposition); | ||
66 | |||
67 | return prepositionGroups_; | ||
68 | } | ||
69 | |||
70 | private: | ||
71 | |||
72 | static int nextId_; | ||
73 | |||
74 | const int id_; | ||
75 | const part_of_speech partOfSpeech_; | ||
76 | const int wnid_ = 0; | ||
77 | const bool hasWnid_ = false; | ||
78 | |||
79 | int numOfImages_ = 0; | ||
80 | std::list<std::string> prepositionGroups_; | ||
81 | |||
82 | }; | ||
83 | |||
84 | // Serializer | ||
85 | |||
86 | database& operator<<(database& db, const notion& arg); | ||
87 | |||
88 | }; | ||
89 | }; | ||
90 | |||
91 | #endif /* end of include guard: NOTION_H_221DE2BC */ | ||
diff --git a/generator/part.cpp b/generator/part.cpp new file mode 100644 index 0000000..dbd4e11 --- /dev/null +++ b/generator/part.cpp | |||
@@ -0,0 +1,336 @@ | |||
1 | #include "part.h" | ||
2 | #include <stdexcept> | ||
3 | #include "selrestr.h" | ||
4 | |||
5 | namespace verbly { | ||
6 | namespace generator { | ||
7 | |||
8 | part part::createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs) | ||
9 | { | ||
10 | part p(type::noun_phrase); | ||
11 | |||
12 | new(&p.noun_phrase_.role) std::string(std::move(role)); | ||
13 | new(&p.noun_phrase_.selrestrs) selrestr(std::move(selrestrs)); | ||
14 | new(&p.noun_phrase_.synrestrs) std::set<std::string>(std::move(synrestrs)); | ||
15 | |||
16 | return p; | ||
17 | } | ||
18 | |||
19 | part part::createVerb() | ||
20 | { | ||
21 | return part(type::verb); | ||
22 | } | ||
23 | |||
24 | part part::createPreposition(std::set<std::string> choices, bool literal) | ||
25 | { | ||
26 | part p(type::preposition); | ||
27 | |||
28 | new(&p.preposition_.choices) std::set<std::string>(std::move(choices)); | ||
29 | p.preposition_.literal = literal; | ||
30 | |||
31 | return p; | ||
32 | } | ||
33 | |||
34 | part part::createAdjective() | ||
35 | { | ||
36 | return part(type::adjective); | ||
37 | } | ||
38 | |||
39 | part part::createAdverb() | ||
40 | { | ||
41 | return part(type::adverb); | ||
42 | } | ||
43 | |||
44 | part part::createLiteral(std::string value) | ||
45 | { | ||
46 | part p(type::literal); | ||
47 | |||
48 | new(&p.literal_) std::string(std::move(value)); | ||
49 | |||
50 | return p; | ||
51 | } | ||
52 | |||
53 | part::part(const part& other) | ||
54 | { | ||
55 | type_ = other.type_; | ||
56 | |||
57 | switch (type_) | ||
58 | { | ||
59 | case type::noun_phrase: | ||
60 | { | ||
61 | new(&noun_phrase_.role) std::string(other.noun_phrase_.role); | ||
62 | new(&noun_phrase_.selrestrs) selrestr(other.noun_phrase_.selrestrs); | ||
63 | new(&noun_phrase_.synrestrs) std::set<std::string>(other.noun_phrase_.synrestrs); | ||
64 | |||
65 | break; | ||
66 | } | ||
67 | |||
68 | case type::preposition: | ||
69 | { | ||
70 | new(&preposition_.choices) std::set<std::string>(other.preposition_.choices); | ||
71 | preposition_.literal = other.preposition_.literal; | ||
72 | |||
73 | break; | ||
74 | } | ||
75 | |||
76 | case type::literal: | ||
77 | { | ||
78 | new(&literal_) std::string(other.literal_); | ||
79 | |||
80 | break; | ||
81 | } | ||
82 | |||
83 | case type::verb: | ||
84 | case type::adjective: | ||
85 | case type::adverb: | ||
86 | case type::invalid: | ||
87 | { | ||
88 | break; | ||
89 | } | ||
90 | } | ||
91 | } | ||
92 | |||
93 | part::part(part&& other) : part() | ||
94 | { | ||
95 | swap(*this, other); | ||
96 | } | ||
97 | |||
98 | part& part::operator=(part other) | ||
99 | { | ||
100 | swap(*this, other); | ||
101 | |||
102 | return *this; | ||
103 | } | ||
104 | |||
105 | void swap(part& first, part& second) | ||
106 | { | ||
107 | using type = part::type; | ||
108 | |||
109 | type tempType = first.type_; | ||
110 | std::string tempRole; | ||
111 | selrestr tempSelrestrs; | ||
112 | std::set<std::string> tempSynrestrs; | ||
113 | std::set<std::string> tempChoices; | ||
114 | bool tempPrepLiteral; | ||
115 | std::string tempLiteralValue; | ||
116 | |||
117 | switch (tempType) | ||
118 | { | ||
119 | case type::noun_phrase: | ||
120 | { | ||
121 | tempRole = std::move(first.noun_phrase_.role); | ||
122 | tempSelrestrs = std::move(first.noun_phrase_.selrestrs); | ||
123 | tempSynrestrs = std::move(first.noun_phrase_.synrestrs); | ||
124 | |||
125 | break; | ||
126 | } | ||
127 | |||
128 | case type::preposition: | ||
129 | { | ||
130 | tempChoices = std::move(first.preposition_.choices); | ||
131 | tempPrepLiteral = first.preposition_.literal; | ||
132 | |||
133 | break; | ||
134 | } | ||
135 | |||
136 | case type::literal: | ||
137 | { | ||
138 | tempLiteralValue = std::move(first.literal_); | ||
139 | |||
140 | break; | ||
141 | } | ||
142 | |||
143 | case type::verb: | ||
144 | case type::adjective: | ||
145 | case type::adverb: | ||
146 | case type::invalid: | ||
147 | { | ||
148 | break; | ||
149 | } | ||
150 | } | ||
151 | |||
152 | first.~part(); | ||
153 | |||
154 | first.type_ = second.type_; | ||
155 | |||
156 | switch (first.type_) | ||
157 | { | ||
158 | case type::noun_phrase: | ||
159 | { | ||
160 | new(&first.noun_phrase_.role) std::string(std::move(second.noun_phrase_.role)); | ||
161 | new(&first.noun_phrase_.selrestrs) selrestr(std::move(second.noun_phrase_.selrestrs)); | ||
162 | new(&first.noun_phrase_.synrestrs) std::set<std::string>(std::move(second.noun_phrase_.synrestrs)); | ||
163 | |||
164 | break; | ||
165 | } | ||
166 | |||
167 | case type::preposition: | ||
168 | { | ||
169 | new(&first.preposition_.choices) std::set<std::string>(std::move(second.preposition_.choices)); | ||
170 | first.preposition_.literal = second.preposition_.literal; | ||
171 | |||
172 | break; | ||
173 | } | ||
174 | |||
175 | case type::literal: | ||
176 | { | ||
177 | new(&first.literal_) std::string(std::move(second.literal_)); | ||
178 | |||
179 | break; | ||
180 | } | ||
181 | |||
182 | case type::verb: | ||
183 | case type::adjective: | ||
184 | case type::adverb: | ||
185 | case type::invalid: | ||
186 | { | ||
187 | break; | ||
188 | } | ||
189 | } | ||
190 | |||
191 | second.~part(); | ||
192 | |||
193 | second.type_ = tempType; | ||
194 | |||
195 | switch (second.type_) | ||
196 | { | ||
197 | case type::noun_phrase: | ||
198 | { | ||
199 | new(&second.noun_phrase_.role) std::string(std::move(tempRole)); | ||
200 | new(&second.noun_phrase_.selrestrs) selrestr(std::move(tempSelrestrs)); | ||
201 | new(&second.noun_phrase_.synrestrs) std::set<std::string>(std::move(tempSynrestrs)); | ||
202 | |||
203 | break; | ||
204 | } | ||
205 | |||
206 | case type::preposition: | ||
207 | { | ||
208 | new(&second.preposition_.choices) std::set<std::string>(std::move(tempChoices)); | ||
209 | second.preposition_.literal = tempPrepLiteral; | ||
210 | |||
211 | break; | ||
212 | } | ||
213 | |||
214 | case type::literal: | ||
215 | { | ||
216 | new(&second.literal_) std::string(std::move(tempLiteralValue)); | ||
217 | |||
218 | break; | ||
219 | } | ||
220 | |||
221 | case type::verb: | ||
222 | case type::adjective: | ||
223 | case type::adverb: | ||
224 | case type::invalid: | ||
225 | { | ||
226 | break; | ||
227 | } | ||
228 | } | ||
229 | } | ||
230 | |||
231 | part::~part() | ||
232 | { | ||
233 | switch (type_) | ||
234 | { | ||
235 | case type::noun_phrase: | ||
236 | { | ||
237 | using string_type = std::string; | ||
238 | using set_type = std::set<std::string>; | ||
239 | |||
240 | noun_phrase_.role.~string_type(); | ||
241 | noun_phrase_.selrestrs.~selrestr(); | ||
242 | noun_phrase_.synrestrs.~set_type(); | ||
243 | |||
244 | break; | ||
245 | } | ||
246 | |||
247 | case type::preposition: | ||
248 | { | ||
249 | using set_type = std::set<std::string>; | ||
250 | |||
251 | preposition_.choices.~set_type(); | ||
252 | |||
253 | break; | ||
254 | } | ||
255 | |||
256 | case type::literal: | ||
257 | { | ||
258 | using string_type = std::string; | ||
259 | |||
260 | literal_.~string_type(); | ||
261 | |||
262 | break; | ||
263 | } | ||
264 | |||
265 | case type::verb: | ||
266 | case type::adjective: | ||
267 | case type::adverb: | ||
268 | case type::invalid: | ||
269 | { | ||
270 | break; | ||
271 | } | ||
272 | } | ||
273 | } | ||
274 | |||
275 | std::string part::getNounRole() const | ||
276 | { | ||
277 | if (type_ == type::noun_phrase) | ||
278 | { | ||
279 | return noun_phrase_.role; | ||
280 | } else { | ||
281 | throw std::domain_error("part::getNounRole is only valid for noun phrase parts"); | ||
282 | } | ||
283 | } | ||
284 | |||
285 | selrestr part::getNounSelrestrs() const | ||
286 | { | ||
287 | if (type_ == type::noun_phrase) | ||
288 | { | ||
289 | return noun_phrase_.selrestrs; | ||
290 | } else { | ||
291 | throw std::domain_error("part::getNounSelrestrs is only valid for noun phrase parts"); | ||
292 | } | ||
293 | } | ||
294 | |||
295 | std::set<std::string> part::getNounSynrestrs() const | ||
296 | { | ||
297 | if (type_ == type::noun_phrase) | ||
298 | { | ||
299 | return noun_phrase_.synrestrs; | ||
300 | } else { | ||
301 | throw std::domain_error("part::getNounSynrestrs is only valid for noun phrase parts"); | ||
302 | } | ||
303 | } | ||
304 | |||
305 | std::set<std::string> part::getPrepositionChoices() const | ||
306 | { | ||
307 | if (type_ == type::preposition) | ||
308 | { | ||
309 | return preposition_.choices; | ||
310 | } else { | ||
311 | throw std::domain_error("part::getPrepositionChoices is only valid for preposition parts"); | ||
312 | } | ||
313 | } | ||
314 | |||
315 | bool part::isPrepositionLiteral() const | ||
316 | { | ||
317 | if (type_ == type::preposition) | ||
318 | { | ||
319 | return preposition_.literal; | ||
320 | } else { | ||
321 | throw std::domain_error("part::isPrepositionLiteral is only valid for preposition parts"); | ||
322 | } | ||
323 | } | ||
324 | |||
325 | std::string part::getLiteralValue() const | ||
326 | { | ||
327 | if (type_ == type::literal) | ||
328 | { | ||
329 | return literal_; | ||
330 | } else { | ||
331 | throw std::domain_error("part::getLiteralValue is only valid for literal parts"); | ||
332 | } | ||
333 | } | ||
334 | |||
335 | }; | ||
336 | }; | ||
diff --git a/generator/part.h b/generator/part.h new file mode 100644 index 0000000..d044630 --- /dev/null +++ b/generator/part.h | |||
@@ -0,0 +1,114 @@ | |||
1 | #ifndef PART_H_FB54F361 | ||
2 | #define PART_H_FB54F361 | ||
3 | |||
4 | #include <string> | ||
5 | #include <set> | ||
6 | #include "selrestr.h" | ||
7 | |||
8 | namespace verbly { | ||
9 | namespace generator { | ||
10 | |||
11 | class part { | ||
12 | public: | ||
13 | enum class type { | ||
14 | invalid = -1, | ||
15 | noun_phrase = 0, | ||
16 | verb = 1, | ||
17 | preposition = 2, | ||
18 | adjective = 3, | ||
19 | adverb = 4, | ||
20 | literal = 5 | ||
21 | }; | ||
22 | |||
23 | // Static factories | ||
24 | |||
25 | static part createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs); | ||
26 | |||
27 | static part createVerb(); | ||
28 | |||
29 | static part createPreposition(std::set<std::string> choices, bool literal); | ||
30 | |||
31 | static part createAdjective(); | ||
32 | |||
33 | static part createAdverb(); | ||
34 | |||
35 | static part createLiteral(std::string value); | ||
36 | |||
37 | // Copy and move constructors | ||
38 | |||
39 | part(const part& other); | ||
40 | |||
41 | part(part&& other); | ||
42 | |||
43 | // Assignment | ||
44 | |||
45 | part& operator=(part other); | ||
46 | |||
47 | // Swap | ||
48 | |||
49 | friend void swap(part& first, part& second); | ||
50 | |||
51 | // Destructor | ||
52 | |||
53 | ~part(); | ||
54 | |||
55 | // General accessors | ||
56 | |||
57 | type getType() const | ||
58 | { | ||
59 | return type_; | ||
60 | } | ||
61 | |||
62 | // Noun phrase accessors | ||
63 | |||
64 | std::string getNounRole() const; | ||
65 | |||
66 | selrestr getNounSelrestrs() const; | ||
67 | |||
68 | std::set<std::string> getNounSynrestrs() const; | ||
69 | |||
70 | // Preposition accessors | ||
71 | |||
72 | std::set<std::string> getPrepositionChoices() const; | ||
73 | |||
74 | bool isPrepositionLiteral() const; | ||
75 | |||
76 | // Literal accessors | ||
77 | |||
78 | std::string getLiteralValue() const; | ||
79 | |||
80 | private: | ||
81 | |||
82 | // Private constructors | ||
83 | |||
84 | part() | ||
85 | { | ||
86 | } | ||
87 | |||
88 | part(type t) : type_(t) | ||
89 | { | ||
90 | } | ||
91 | |||
92 | // Data | ||
93 | |||
94 | union { | ||
95 | struct { | ||
96 | std::string role; | ||
97 | selrestr selrestrs; | ||
98 | std::set<std::string> synrestrs; | ||
99 | } noun_phrase_; | ||
100 | struct { | ||
101 | std::set<std::string> choices; | ||
102 | bool literal; | ||
103 | } preposition_; | ||
104 | std::string literal_; | ||
105 | }; | ||
106 | |||
107 | type type_ = type::invalid; | ||
108 | |||
109 | }; | ||
110 | |||
111 | }; | ||
112 | }; | ||
113 | |||
114 | #endif /* end of include guard: PART_H_FB54F361 */ | ||
diff --git a/generator/progress.h b/generator/progress.h index 81f07a3..fcb680d 100644 --- a/generator/progress.h +++ b/generator/progress.h | |||
@@ -3,48 +3,54 @@ | |||
3 | 3 | ||
4 | #include <string> | 4 | #include <string> |
5 | 5 | ||
6 | class progress { | 6 | namespace verbly { |
7 | private: | 7 | namespace generator { |
8 | std::string message; | ||
9 | int total; | ||
10 | int cur = 0; | ||
11 | int lprint = 0; | ||
12 | 8 | ||
13 | public: | 9 | class progress { |
14 | progress(std::string message, int total) : message(message), total(total) | 10 | private: |
15 | { | 11 | std::string message; |
16 | std::cout << message << " 0%" << std::flush; | 12 | int total; |
17 | } | 13 | int cur = 0; |
14 | int lprint = 0; | ||
18 | 15 | ||
19 | void update(int val) | 16 | public: |
20 | { | 17 | progress(std::string message, int total) : message(message), total(total) |
21 | if (val <= total) | 18 | { |
22 | { | 19 | std::cout << message << " 0%" << std::flush; |
23 | cur = val; | 20 | } |
24 | } else { | 21 | |
25 | cur = total; | 22 | void update(int val) |
26 | } | 23 | { |
24 | if (val <= total) | ||
25 | { | ||
26 | cur = val; | ||
27 | } else { | ||
28 | cur = total; | ||
29 | } | ||
27 | 30 | ||
28 | int pp = cur * 100 / total; | 31 | int pp = cur * 100 / total; |
29 | if (pp != lprint) | 32 | if (pp != lprint) |
30 | { | 33 | { |
31 | lprint = pp; | 34 | lprint = pp; |
32 | 35 | ||
33 | std::cout << "\b\b\b\b" << std::right; | 36 | std::cout << "\b\b\b\b" << std::right; |
34 | std::cout.width(3); | 37 | std::cout.width(3); |
35 | std::cout << pp << "%" << std::flush; | 38 | std::cout << pp << "%" << std::flush; |
36 | } | 39 | } |
37 | } | 40 | } |
41 | |||
42 | void update() | ||
43 | { | ||
44 | update(cur+1); | ||
45 | } | ||
38 | 46 | ||
39 | void update() | 47 | ~progress() |
40 | { | 48 | { |
41 | update(cur+1); | 49 | std::cout << "\b\b\b\b100%" << std::endl; |
42 | } | 50 | } |
51 | }; | ||
43 | 52 | ||
44 | ~progress() | 53 | }; |
45 | { | ||
46 | std::cout << "\b\b\b\b100%" << std::endl; | ||
47 | } | ||
48 | }; | 54 | }; |
49 | 55 | ||
50 | #endif /* end of include guard: PROGRESS_H_A34EF856 */ | 56 | #endif /* end of include guard: PROGRESS_H_A34EF856 */ |
diff --git a/generator/pronunciation.cpp b/generator/pronunciation.cpp new file mode 100644 index 0000000..eb07607 --- /dev/null +++ b/generator/pronunciation.cpp | |||
@@ -0,0 +1,87 @@ | |||
1 | #include "pronunciation.h" | ||
2 | #include <list> | ||
3 | #include <algorithm> | ||
4 | #include <cctype> | ||
5 | #include <iterator> | ||
6 | #include "database.h" | ||
7 | #include "field.h" | ||
8 | #include "../lib/util.h" | ||
9 | |||
10 | namespace verbly { | ||
11 | namespace generator { | ||
12 | |||
13 | int pronunciation::nextId_ = 0; | ||
14 | |||
15 | pronunciation::pronunciation(std::string phonemes) : | ||
16 | id_(nextId_++), | ||
17 | phonemes_(phonemes) | ||
18 | { | ||
19 | auto phonemeList = split<std::list<std::string>>(phonemes, " "); | ||
20 | |||
21 | auto rhymeStart = std::find_if(std::begin(phonemeList), std::end(phonemeList), [] (std::string phoneme) { | ||
22 | return phoneme.find("1") != std::string::npos; | ||
23 | }); | ||
24 | |||
25 | // Rhyme detection | ||
26 | if (rhymeStart != std::end(phonemeList)) | ||
27 | { | ||
28 | std::list<std::string> rhymePhonemes; | ||
29 | |||
30 | std::transform(rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), [] (std::string phoneme) { | ||
31 | std::string naked; | ||
32 | |||
33 | std::remove_copy_if(std::begin(phoneme), std::end(phoneme), std::back_inserter(naked), [] (char ch) { | ||
34 | return std::isdigit(ch); | ||
35 | }); | ||
36 | |||
37 | return naked; | ||
38 | }); | ||
39 | |||
40 | rhyme_ = implode(std::begin(rhymePhonemes), std::end(rhymePhonemes), " "); | ||
41 | |||
42 | if (rhymeStart != std::begin(phonemeList)) | ||
43 | { | ||
44 | prerhyme_ = *std::prev(rhymeStart); | ||
45 | } | ||
46 | } | ||
47 | |||
48 | // Syllable/stress | ||
49 | for (std::string phoneme : phonemeList) | ||
50 | { | ||
51 | if (std::isdigit(phoneme.back())) | ||
52 | { | ||
53 | // It's a vowel! | ||
54 | syllables_++; | ||
55 | |||
56 | if (phoneme.back() == '1') | ||
57 | { | ||
58 | stress_.push_back('1'); | ||
59 | } else { | ||
60 | stress_.push_back('0'); | ||
61 | } | ||
62 | } | ||
63 | } | ||
64 | } | ||
65 | |||
66 | database& operator<<(database& db, const pronunciation& arg) | ||
67 | { | ||
68 | std::list<field> fields; | ||
69 | |||
70 | fields.emplace_back("pronunciation_id", arg.getId()); | ||
71 | fields.emplace_back("phonemes", arg.getPhonemes()); | ||
72 | fields.emplace_back("syllables", arg.getSyllables()); | ||
73 | fields.emplace_back("stress", arg.getStress()); | ||
74 | |||
75 | if (arg.hasRhyme()) | ||
76 | { | ||
77 | fields.emplace_back("rhyme", arg.getRhymePhonemes()); | ||
78 | fields.emplace_back("prerhyme", arg.getPrerhyme()); | ||
79 | } | ||
80 | |||
81 | db.insertIntoTable("pronunciations", std::move(fields)); | ||
82 | |||
83 | return db; | ||
84 | } | ||
85 | |||
86 | }; | ||
87 | }; | ||
diff --git a/generator/pronunciation.h b/generator/pronunciation.h new file mode 100644 index 0000000..81be6c4 --- /dev/null +++ b/generator/pronunciation.h | |||
@@ -0,0 +1,82 @@ | |||
1 | #ifndef PRONUNCIATION_H_584A08DD | ||
2 | #define PRONUNCIATION_H_584A08DD | ||
3 | |||
4 | #include <string> | ||
5 | #include <cassert> | ||
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
10 | class database; | ||
11 | |||
12 | class pronunciation { | ||
13 | public: | ||
14 | |||
15 | // Constructor | ||
16 | |||
17 | explicit pronunciation(std::string phonemes); | ||
18 | |||
19 | // Accessors | ||
20 | |||
21 | int getId() const | ||
22 | { | ||
23 | return id_; | ||
24 | } | ||
25 | |||
26 | std::string getPhonemes() const | ||
27 | { | ||
28 | return phonemes_; | ||
29 | } | ||
30 | |||
31 | bool hasRhyme() const | ||
32 | { | ||
33 | return !rhyme_.empty(); | ||
34 | } | ||
35 | |||
36 | std::string getRhymePhonemes() const | ||
37 | { | ||
38 | // Calling code should always call hasRhyme first. | ||
39 | assert(!rhyme_.empty()); | ||
40 | |||
41 | return rhyme_; | ||
42 | } | ||
43 | |||
44 | std::string getPrerhyme() const | ||
45 | { | ||
46 | // Calling code should always call hasRhyme first. | ||
47 | assert(!rhyme_.empty()); | ||
48 | |||
49 | return prerhyme_; | ||
50 | } | ||
51 | |||
52 | int getSyllables() const | ||
53 | { | ||
54 | return syllables_; | ||
55 | } | ||
56 | |||
57 | std::string getStress() const | ||
58 | { | ||
59 | return stress_; | ||
60 | } | ||
61 | |||
62 | private: | ||
63 | |||
64 | static int nextId_; | ||
65 | |||
66 | const int id_; | ||
67 | const std::string phonemes_; | ||
68 | std::string rhyme_; | ||
69 | std::string prerhyme_; | ||
70 | int syllables_ = 0; | ||
71 | std::string stress_; | ||
72 | |||
73 | }; | ||
74 | |||
75 | // Serializer | ||
76 | |||
77 | database& operator<<(database& db, const pronunciation& arg); | ||
78 | |||
79 | }; | ||
80 | }; | ||
81 | |||
82 | #endif /* end of include guard: PRONUNCIATION_H_584A08DD */ | ||
diff --git a/generator/role.h b/generator/role.h new file mode 100644 index 0000000..5fa68b8 --- /dev/null +++ b/generator/role.h | |||
@@ -0,0 +1,35 @@ | |||
1 | #ifndef ROLE_H_249F9A9C | ||
2 | #define ROLE_H_249F9A9C | ||
3 | |||
4 | #include "selrestr.h" | ||
5 | |||
6 | namespace verbly { | ||
7 | namespace generator { | ||
8 | |||
9 | class role { | ||
10 | public: | ||
11 | |||
12 | // Mutators | ||
13 | |||
14 | void setSelrestrs(selrestr selrestrs) | ||
15 | { | ||
16 | selrestrs_ = selrestrs; | ||
17 | } | ||
18 | |||
19 | // Accessors | ||
20 | |||
21 | const selrestr& getSelrestrs() const | ||
22 | { | ||
23 | return selrestrs_; | ||
24 | } | ||
25 | |||
26 | private: | ||
27 | |||
28 | selrestr selrestrs_; | ||
29 | |||
30 | }; | ||
31 | |||
32 | }; | ||
33 | }; | ||
34 | |||
35 | #endif /* end of include guard: ROLE_H_249F9A9C */ | ||
diff --git a/generator/schema.sql b/generator/schema.sql index 410b536..c3e54d8 100644 --- a/generator/schema.sql +++ b/generator/schema.sql | |||
@@ -1,286 +1,204 @@ | |||
1 | DROP TABLE IF EXISTS `verbs`; | 1 | CREATE TABLE `notions` ( |
2 | CREATE TABLE `verbs` ( | 2 | `notion_id` INTEGER PRIMARY KEY, |
3 | `verb_id` INTEGER PRIMARY KEY, | 3 | `part_of_speech` SMALLINT NOT NULL, |
4 | `infinitive` VARCHAR(32) NOT NULL, | 4 | `wnid` INTEGER, |
5 | `past_tense` VARCHAR(32) NOT NULL, | 5 | `images` INTEGER |
6 | `past_participle` VARCHAR(32) NOT NULL, | ||
7 | `ing_form` VARCHAR(32) NOT NULL, | ||
8 | `s_form` VARCHAR(32) NOT NULL | ||
9 | ); | 6 | ); |
10 | 7 | ||
11 | DROP TABLE IF EXISTS `groups`; | 8 | CREATE UNIQUE INDEX `notion_by_wnid` ON `notions`(`wnid`); |
12 | CREATE TABLE `groups` ( | ||
13 | `group_id` INTEGER PRIMARY KEY, | ||
14 | `data` BLOB NOT NULL | ||
15 | ); | ||
16 | |||
17 | DROP TABLE IF EXISTS `frames`; | ||
18 | CREATE TABLE `frames` ( | ||
19 | `frame_id` INTEGER PRIMARY KEY, | ||
20 | `group_id` INTEGER NOT NULL, | ||
21 | `data` BLOB NOT NULL, | ||
22 | FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`) | ||
23 | ); | ||
24 | 9 | ||
25 | DROP TABLE IF EXISTS `verb_groups`; | ||
26 | CREATE TABLE `verb_groups` ( | ||
27 | `verb_id` INTEGER NOT NULL, | ||
28 | `group_id` INTEGER NOT NULL, | ||
29 | FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`), | ||
30 | FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`) | ||
31 | ); | ||
32 | |||
33 | DROP TABLE IF EXISTS `adjectives`; | ||
34 | CREATE TABLE `adjectives` ( | ||
35 | `adjective_id` INTEGER PRIMARY KEY, | ||
36 | `base_form` VARCHAR(32) NOT NULL, | ||
37 | `comparative` VARCHAR(32), | ||
38 | `superlative` VARCHAR(32), | ||
39 | `position` CHAR(1), | ||
40 | `complexity` INTEGER NOT NULL | ||
41 | ); | ||
42 | |||
43 | DROP TABLE IF EXISTS `adverbs`; | ||
44 | CREATE TABLE `adverbs` ( | ||
45 | `adverb_id` INTEGER PRIMARY KEY, | ||
46 | `base_form` VARCHAR(32) NOT NULL, | ||
47 | `comparative` VARCHAR(32), | ||
48 | `superlative` VARCHAR(32), | ||
49 | `complexity` INTEGER NOT NULL | ||
50 | ); | ||
51 | |||
52 | DROP TABLE IF EXISTS `nouns`; | ||
53 | CREATE TABLE `nouns` ( | ||
54 | `noun_id` INTEGER PRIMARY KEY, | ||
55 | `singular` VARCHAR(32) NOT NULL, | ||
56 | `plural` VARCHAR(32), | ||
57 | `proper` INTEGER(1) NOT NULL, | ||
58 | `complexity` INTEGER NOT NULL, | ||
59 | `images` INTEGER NOT NULL, | ||
60 | `wnid` INTEGER NOT NULL | ||
61 | ); | ||
62 | |||
63 | DROP TABLE IF EXISTS `hypernymy`; | ||
64 | CREATE TABLE `hypernymy` ( | 10 | CREATE TABLE `hypernymy` ( |
65 | `hypernym_id` INTEGER NOT NULL, | 11 | `hypernym_id` INTEGER NOT NULL, |
66 | `hyponym_id` INTEGER NOT NULL, | 12 | `hyponym_id` INTEGER NOT NULL |
67 | FOREIGN KEY (`hypernym_id`) REFERENCES `nouns`(`noun_id`), | ||
68 | FOREIGN KEY (`hyponym_id`) REFERENCES `nouns`(`noun_id`) | ||
69 | ); | 13 | ); |
70 | 14 | ||
71 | DROP TABLE IF EXISTS `instantiation`; | 15 | CREATE INDEX `hyponym_of` ON `hypernymy`(`hypernym_id`); |
16 | CREATE INDEX `hypernym_of` ON `hypernymy`(`hyponym_id`); | ||
17 | |||
72 | CREATE TABLE `instantiation` ( | 18 | CREATE TABLE `instantiation` ( |
73 | `class_id` INTEGER NOT NULL, | 19 | `class_id` INTEGER NOT NULL, |
74 | `instance_id` INTEGER NOT NULL, | 20 | `instance_id` INTEGER NOT NULL |
75 | FOREIGN KEY (`class_id`) REFERENCES `nouns`(`noun_id`), | ||
76 | FOREIGN KEY (`instance_id`) REFERENCES `nouns`(`noun_id`) | ||
77 | ); | 21 | ); |
78 | 22 | ||
79 | DROP TABLE IF EXISTS `member_meronymy`; | 23 | CREATE INDEX `instance_of` ON `instantiation`(`class_id`); |
24 | CREATE INDEX `class_of` ON `instantiation`(`instance_id`); | ||
25 | |||
80 | CREATE TABLE `member_meronymy` ( | 26 | CREATE TABLE `member_meronymy` ( |
81 | `meronym_id` INTEGER NOT NULL, | 27 | `meronym_id` INTEGER NOT NULL, |
82 | `holonym_id` INTEGER NOT NULL, | 28 | `holonym_id` INTEGER NOT NULL |
83 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
84 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
85 | ); | 29 | ); |
86 | 30 | ||
87 | DROP TABLE IF EXISTS `part_meronymy`; | 31 | CREATE INDEX `member_holonym_of` ON `member_meronymy`(`meronym_id`); |
32 | CREATE INDEX `member_meronym_of` ON `member_meronymy`(`holonym_id`); | ||
33 | |||
88 | CREATE TABLE `part_meronymy` ( | 34 | CREATE TABLE `part_meronymy` ( |
89 | `meronym_id` INTEGER NOT NULL, | 35 | `meronym_id` INTEGER NOT NULL, |
90 | `holonym_id` INTEGER NOT NULL, | 36 | `holonym_id` INTEGER NOT NULL |
91 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
92 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
93 | ); | 37 | ); |
94 | 38 | ||
95 | DROP TABLE IF EXISTS `substance_meronymy`; | 39 | CREATE INDEX `part_holonym_of` ON `part_meronymy`(`meronym_id`); |
40 | CREATE INDEX `part_meronym_of` ON `part_meronymy`(`holonym_id`); | ||
41 | |||
96 | CREATE TABLE `substance_meronymy` ( | 42 | CREATE TABLE `substance_meronymy` ( |
97 | `meronym_id` INTEGER NOT NULL, | 43 | `meronym_id` INTEGER NOT NULL, |
98 | `holonym_id` INTEGER NOT NULL, | 44 | `holonym_id` INTEGER NOT NULL |
99 | FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`), | ||
100 | FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`) | ||
101 | ); | 45 | ); |
102 | 46 | ||
103 | DROP TABLE IF EXISTS `variation`; | 47 | CREATE INDEX `substance_holonym_of` ON `substance_meronymy`(`meronym_id`); |
48 | CREATE INDEX `substance_meronym_of` ON `substance_meronymy`(`holonym_id`); | ||
49 | |||
104 | CREATE TABLE `variation` ( | 50 | CREATE TABLE `variation` ( |
105 | `noun_id` INTEGER NOT NULL, | 51 | `noun_id` INTEGER NOT NULL, |
106 | `adjective_id` INTEGER NOT NULL, | 52 | `adjective_id` INTEGER NOT NULL |
107 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
108 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
109 | ); | 53 | ); |
110 | 54 | ||
111 | DROP TABLE IF EXISTS `noun_antonymy`; | 55 | CREATE INDEX `variant_of` ON `variation`(`noun_id`); |
112 | CREATE TABLE `noun_antonymy` ( | 56 | CREATE INDEX `attribute_of` ON `variation`(`adjective_id`); |
113 | `noun_1_id` INTEGER NOT NULL, | ||
114 | `noun_2_id` INTEGER NOT NULL, | ||
115 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), | ||
116 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`) | ||
117 | ); | ||
118 | 57 | ||
119 | DROP TABLE IF EXISTS `adjective_antonymy`; | 58 | CREATE TABLE `similarity` ( |
120 | CREATE TABLE `adjective_antonymy` ( | ||
121 | `adjective_1_id` INTEGER NOT NULL, | 59 | `adjective_1_id` INTEGER NOT NULL, |
122 | `adjective_2_id` INTEGER NOT NULL, | 60 | `adjective_2_id` INTEGER NOT NULL |
123 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | 61 | ); |
124 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | 62 | |
63 | CREATE INDEX `similar_to` ON `similarity`(`adjective_1_id`); | ||
64 | |||
65 | CREATE TABLE `is_a` ( | ||
66 | `notion_id` INTEGER NOT NULL, | ||
67 | `groupname` VARCHAR(32) NOT NULL | ||
125 | ); | 68 | ); |
126 | 69 | ||
127 | DROP TABLE IF EXISTS `adverb_antonymy`; | 70 | CREATE TABLE `entailment` ( |
128 | CREATE TABLE `adverb_antonymy` ( | 71 | `given_id` INTEGER NOT NULL, |
129 | `adverb_1_id` INTEGER NOT NULL, | 72 | `entailment_id` INTEGER NOT NULL |
130 | `adverb_2_id` INTEGER NOT NULL, | 73 | ); |
131 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | 74 | |
132 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | 75 | CREATE INDEX `entailment_of` ON `entailment`(`given_id`); |
76 | CREATE INDEX `entailed_by` ON `entailment`(`entailment_id`); | ||
77 | |||
78 | CREATE TABLE `causality` ( | ||
79 | `cause_id` INTEGER NOT NULL, | ||
80 | `effect_id` INTEGER NOT NULL | ||
81 | ); | ||
82 | |||
83 | CREATE INDEX `effect_of` ON `causality`(`cause_id`); | ||
84 | CREATE INDEX `cause_of` ON `causality`(`effect_id`); | ||
85 | |||
86 | CREATE TABLE `words` ( | ||
87 | `word_id` INTEGER PRIMARY KEY, | ||
88 | `notion_id` INTEGER NOT NULL, | ||
89 | `lemma_id` INTEGER NOT NULL, | ||
90 | `tag_count` INTEGER, | ||
91 | `position` SMALLINT, | ||
92 | `group_id` INTEGER | ||
93 | ); | ||
94 | |||
95 | CREATE INDEX `notion_words` ON `words`(`notion_id`); | ||
96 | CREATE INDEX `lemma_words` ON `words`(`lemma_id`); | ||
97 | CREATE INDEX `group_words` ON `words`(`group_id`); | ||
98 | |||
99 | CREATE TABLE `antonymy` ( | ||
100 | `antonym_1_id` INTEGER NOT NULL, | ||
101 | `antonym_2_id` INTEGER NOT NULL | ||
133 | ); | 102 | ); |
134 | 103 | ||
135 | DROP TABLE IF EXISTS `specification`; | 104 | CREATE INDEX `antonym_of` ON `antonymy`(`antonym_1_id`); |
105 | |||
136 | CREATE TABLE `specification` ( | 106 | CREATE TABLE `specification` ( |
137 | `general_id` INTEGER NOT NULL, | 107 | `general_id` INTEGER NOT NULL, |
138 | `specific_id` INTEGER NOT NULL, | 108 | `specific_id` INTEGER NOT NULL |
139 | FOREIGN KEY (`general_id`) REFERENCES `adjectives`(`adjective_id`), | ||
140 | FOREIGN KEY (`specific_id`) REFERENCES `adjectives`(`adjective_id`) | ||
141 | ); | 109 | ); |
142 | 110 | ||
143 | DROP TABLE IF EXISTS `pertainymy`; | 111 | CREATE INDEX `specification_of` ON `specification`(`general_id`); |
112 | CREATE INDEX `generalization_of` ON `specification`(`specific_id`); | ||
113 | |||
144 | CREATE TABLE `pertainymy` ( | 114 | CREATE TABLE `pertainymy` ( |
145 | `noun_id` INTEGER NOT NULL, | 115 | `noun_id` INTEGER NOT NULL, |
146 | `pertainym_id` INTEGER NOT NULL, | 116 | `pertainym_id` INTEGER NOT NULL |
147 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
148 | FOREIGN KEY (`pertainym_id`) REFERENCES `adjectives`(`adjective_id`) | ||
149 | ); | 117 | ); |
150 | 118 | ||
151 | DROP TABLE IF EXISTS `mannernymy`; | 119 | CREATE INDEX `pertainym_of` ON `pertainymy`(`noun_id`); |
120 | CREATE INDEX `anti_pertainym_of` ON `pertainymy`(`pertainym_id`); | ||
121 | |||
152 | CREATE TABLE `mannernymy` ( | 122 | CREATE TABLE `mannernymy` ( |
153 | `adjective_id` INTEGER NOT NULL, | 123 | `adjective_id` INTEGER NOT NULL, |
154 | `mannernym_id` INTEGER NOT NULL, | 124 | `mannernym_id` INTEGER NOT NULL |
155 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`), | ||
156 | FOREIGN KEY (`mannernym_id`) REFERENCES `adverbs`(`adverb_id`) | ||
157 | ); | 125 | ); |
158 | 126 | ||
159 | DROP TABLE IF EXISTS `noun_synonymy`; | 127 | CREATE INDEX `mannernym_of` ON `mannernymy`(`adjective_id`); |
160 | CREATE TABLE `noun_synonymy` ( | 128 | CREATE INDEX `anti_mannernym_of` ON `mannernymy`(`mannernym_id`); |
161 | `noun_1_id` INTEGER NOT NULL, | ||
162 | `noun_2_id` INTEGER NOT NULL, | ||
163 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`nouns_id`), | ||
164 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`nouns_id`) | ||
165 | ); | ||
166 | 129 | ||
167 | DROP TABLE IF EXISTS `adjective_synonymy`; | 130 | CREATE TABLE `usage` ( |
168 | CREATE TABLE `adjective_synonymy` ( | 131 | `domain_id` INTEGER NOT NULL, |
169 | `adjective_1_id` INTEGER NOT NULL, | 132 | `term_id` INTEGER NOT NULL |
170 | `adjective_2_id` INTEGER NOT NULL, | ||
171 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | ||
172 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | ||
173 | ); | 133 | ); |
174 | 134 | ||
175 | DROP TABLE IF EXISTS `adverb_synonymy`; | 135 | CREATE INDEX `usage_term_of` ON `usage`(`domain_id`); |
176 | CREATE TABLE `adverb_synonymy` ( | 136 | CREATE INDEX `usage_domain_of` ON `usage`(`term_id`); |
177 | `adverb_1_id` INTEGER NOT NULL, | ||
178 | `adverb_2_id` INTEGER NOT NULL, | ||
179 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | ||
180 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | ||
181 | ); | ||
182 | 137 | ||
183 | DROP TABLE IF EXISTS `noun_pronunciations`; | 138 | CREATE TABLE `topicality` ( |
184 | CREATE TABLE `noun_pronunciations` ( | 139 | `domain_id` INTEGER NOT NULL, |
185 | `noun_id` INTEGER NOT NULL, | 140 | `term_id` INTEGER NOT NULL |
186 | `pronunciation` VARCHAR(64) NOT NULL, | ||
187 | `prerhyme` VARCHAR(8), | ||
188 | `rhyme` VARCHAR(64), | ||
189 | `syllables` INT NOT NULL, | ||
190 | `stress` VARCHAR(64) NOT NULL, | ||
191 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`) | ||
192 | ); | 141 | ); |
193 | 142 | ||
194 | DROP TABLE IF EXISTS `verb_pronunciations`; | 143 | CREATE INDEX `topical_term_of` ON `topicality`(`domain_id`); |
195 | CREATE TABLE `verb_pronunciations` ( | 144 | CREATE INDEX `topical_domain_of` ON `topicality`(`term_id`); |
196 | `verb_id` INTEGER NOT NULL, | ||
197 | `pronunciation` VARCHAR(64) NOT NULL, | ||
198 | `prerhyme` VARCHAR(8), | ||
199 | `rhyme` VARCHAR(64), | ||
200 | `syllables` INT NOT NULL, | ||
201 | `stress` VARCHAR(64) NOT NULL, | ||
202 | FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`) | ||
203 | ); | ||
204 | 145 | ||
205 | DROP TABLE IF EXISTS `adjective_pronunciations`; | 146 | CREATE TABLE `regionality` ( |
206 | CREATE TABLE `adjective_pronunciations` ( | 147 | `domain_id` INTEGER NOT NULL, |
207 | `adjective_id` INTEGER NOT NULL, | 148 | `term_id` INTEGER NOT NULL |
208 | `pronunciation` VARCHAR(64) NOT NULL, | ||
209 | `prerhyme` VARCHAR(8), | ||
210 | `rhyme` VARCHAR(64), | ||
211 | `syllables` INT NOT NULL, | ||
212 | `stress` VARCHAR(64) NOT NULL, | ||
213 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
214 | ); | 149 | ); |
215 | 150 | ||
216 | DROP TABLE IF EXISTS `adverb_pronunciations`; | 151 | CREATE INDEX `regional_term_of` ON `regionality`(`domain_id`); |
217 | CREATE TABLE `adverb_pronunciations` ( | 152 | CREATE INDEX `regional_domain_of` ON `regionality`(`term_id`); |
218 | `adverb_id` INTEGER NOT NULL, | ||
219 | `pronunciation` VARCHAR(64) NOT NULL, | ||
220 | `prerhyme` VARCHAR(8), | ||
221 | `rhyme` VARCHAR(64), | ||
222 | `syllables` INT NOT NULL, | ||
223 | `stress` VARCHAR(64) NOT NULL, | ||
224 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`) | ||
225 | ); | ||
226 | 153 | ||
227 | DROP TABLE IF EXISTS `noun_noun_derivation`; | 154 | CREATE TABLE `forms` ( |
228 | CREATE TABLE `noun_noun_derivation` ( | 155 | `form_id` INTEGER PRIMARY KEY, |
229 | `noun_1_id` INTEGER NOT NULL, | 156 | `form` VARCHAR(32) NOT NULL, |
230 | `noun_2_id` INTEGER NOT NULL, | 157 | `complexity` SMALLINT NOT NULL, |
231 | FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), | 158 | `proper` SMALLINT NOT NULL |
232 | FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`) | ||
233 | ); | 159 | ); |
234 | 160 | ||
235 | DROP TABLE IF EXISTS `noun_adjective_derivation`; | 161 | CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); |
236 | CREATE TABLE `noun_adjective_derivation` ( | ||
237 | `noun_id` INTEGER NOT NULL, | ||
238 | `adjective_id` INTEGER NOT NULL, | ||
239 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
240 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`) | ||
241 | ); | ||
242 | 162 | ||
243 | DROP TABLE IF EXISTS `noun_adverb_derivation`; | 163 | CREATE TABLE `lemmas_forms` ( |
244 | CREATE TABLE `noun_adverb_derivation` ( | 164 | `lemma_id` INTEGER NOT NULL, |
245 | `noun_id` INTEGER NOT NULL, | 165 | `form_id` INTEGER NOT NULL, |
246 | `adverb_id` INTEGER NOT NULL, | 166 | `category` SMALLINT NOT NULL |
247 | FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`), | ||
248 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`) | ||
249 | ); | 167 | ); |
250 | 168 | ||
251 | DROP TABLE IF EXISTS `adjective_adjective_derivation`; | 169 | CREATE INDEX `form_of` ON `lemmas_forms`(`lemma_id`); |
252 | CREATE TABLE `adjective_adjective_derivation` ( | 170 | CREATE INDEX `lemma_of` ON `lemmas_forms`(`form_id`); |
253 | `adjective_1_id` INTEGER NOT NULL, | 171 | |
254 | `adjective_2_id` INTEGER NOT NULL, | 172 | CREATE TABLE `pronunciations` ( |
255 | FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), | 173 | `pronunciation_id` INTEGER PRIMARY KEY, |
256 | FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) | 174 | `phonemes` VARCHAR(64) NOT NULL, |
175 | `prerhyme` VARCHAR(8), | ||
176 | `rhyme` VARCHAR(64), | ||
177 | `syllables` INTEGER NOT NULL, | ||
178 | `stress` VARCHAR(64) NOT NULL | ||
257 | ); | 179 | ); |
258 | 180 | ||
259 | DROP TABLE IF EXISTS `adjective_adverb_derivation`; | 181 | CREATE TABLE `forms_pronunciations` ( |
260 | CREATE TABLE `adjective_adverb_derivation` ( | 182 | `form_id` INTEGER NOT NULL, |
261 | `adjective_id` INTEGER NOT NULL, | 183 | `pronunciation_id` INTEGER NOT NULL |
262 | `adverb_id` INTEGER NOT NULL, | ||
263 | FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`), | ||
264 | FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adjective_id`) | ||
265 | ); | 184 | ); |
266 | 185 | ||
267 | DROP TABLE IF EXISTS `adverb_adverb_derivation`; | 186 | CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`); |
268 | CREATE TABLE `adverb_adverb_derivation` ( | 187 | CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`); |
269 | `adverb_1_id` INTEGER NOT NULL, | 188 | |
270 | `adverb_2_id` INTEGER NOT NULL, | 189 | CREATE TABLE `groups` ( |
271 | FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), | 190 | `group_id` INTEGER PRIMARY KEY, |
272 | FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) | 191 | `data` BLOB NOT NULL |
273 | ); | 192 | ); |
274 | 193 | ||
275 | DROP TABLE IF EXISTS `prepositions`; | 194 | CREATE TABLE `frames` ( |
276 | CREATE TABLE `prepositions` ( | 195 | `frame_id` INTEGER PRIMARY KEY, |
277 | `preposition_id` INTEGER PRIMARY KEY, | 196 | `data` BLOB NOT NULL |
278 | `form` VARCHAR(32) NOT NULL | ||
279 | ); | 197 | ); |
280 | 198 | ||
281 | DROP TABLE IF EXISTS `preposition_groups`; | 199 | CREATE TABLE `groups_frames` ( |
282 | CREATE TABLE `preposition_groups` ( | 200 | `group_id` INTEGER NOT NULL, |
283 | `preposition_id` INTEGER NOT NULL, | 201 | `frame_id` INTEGER NOT NULL |
284 | `groupname` VARCHAR(32) NOT NULL, | ||
285 | FOREIGN KEY (`preposition_id`) REFERENCES `prepositions`(`preposition_id`) | ||
286 | ); | 202 | ); |
203 | |||
204 | CREATE INDEX `frames_in` ON `groups_frames`(`group_id`); | ||
diff --git a/generator/selrestr.cpp b/generator/selrestr.cpp new file mode 100644 index 0000000..8bdd3f6 --- /dev/null +++ b/generator/selrestr.cpp | |||
@@ -0,0 +1,288 @@ | |||
1 | #include "selrestr.h" | ||
2 | |||
3 | namespace verbly { | ||
4 | namespace generator { | ||
5 | |||
    // Copy constructor. The payload lives in an untagged union, so the member
    // selected by the discriminant must be constructed explicitly — placement
    // new is required for the non-trivially-constructible members.
    selrestr::selrestr(const selrestr& other)
    {
      type_ = other.type_;

      switch (type_)
      {
        case type::singleton:
        {
          singleton_.pos = other.singleton_.pos;
          new(&singleton_.restriction) std::string(other.singleton_.restriction);

          break;
        }

        case type::group:
        {
          new(&group_.children) std::list<selrestr>(other.group_.children);
          group_.orlogic = other.group_.orlogic;

          break;
        }

        case type::empty:
        {
          // No payload; nothing to copy.
          break;
        }
      }
    }
34 | |||
    // Move constructor: delegate to the default constructor to establish a
    // valid empty state, then swap with the source, leaving it empty.
    selrestr::selrestr(selrestr&& other) : selrestr()
    {
      swap(*this, other);
    }
39 | |||
    // Unified copy/move assignment via the copy-and-swap idiom: the parameter
    // is copied or moved at the call site, then swapped into *this; the old
    // state is destroyed when the temporary goes out of scope.
    selrestr& selrestr::operator=(selrestr other)
    {
      swap(*this, other);

      return *this;
    }
46 | |||
47 | void swap(selrestr& first, selrestr& second) | ||
48 | { | ||
49 | using type = selrestr::type; | ||
50 | |||
51 | type tempType = first.type_; | ||
52 | int tempPos; | ||
53 | std::string tempRestriction; | ||
54 | std::list<selrestr> tempChildren; | ||
55 | bool tempOrlogic; | ||
56 | |||
57 | switch (tempType) | ||
58 | { | ||
59 | case type::singleton: | ||
60 | { | ||
61 | tempPos = first.singleton_.pos; | ||
62 | tempRestriction = std::move(first.singleton_.restriction); | ||
63 | |||
64 | break; | ||
65 | } | ||
66 | |||
67 | case type::group: | ||
68 | { | ||
69 | tempChildren = std::move(first.group_.children); | ||
70 | tempOrlogic = first.group_.orlogic; | ||
71 | |||
72 | break; | ||
73 | } | ||
74 | |||
75 | case type::empty: | ||
76 | { | ||
77 | break; | ||
78 | } | ||
79 | } | ||
80 | |||
81 | first.~selrestr(); | ||
82 | |||
83 | first.type_ = second.type_; | ||
84 | |||
85 | switch (first.type_) | ||
86 | { | ||
87 | case type::singleton: | ||
88 | { | ||
89 | first.singleton_.pos = second.singleton_.pos; | ||
90 | new(&first.singleton_.restriction) std::string(std::move(second.singleton_.restriction)); | ||
91 | |||
92 | break; | ||
93 | } | ||
94 | |||
95 | case type::group: | ||
96 | { | ||
97 | new(&first.group_.children) std::list<selrestr>(std::move(second.group_.children)); | ||
98 | first.group_.orlogic = second.group_.orlogic; | ||
99 | |||
100 | break; | ||
101 | } | ||
102 | |||
103 | case type::empty: | ||
104 | { | ||
105 | break; | ||
106 | } | ||
107 | } | ||
108 | |||
109 | second.~selrestr(); | ||
110 | |||
111 | second.type_ = tempType; | ||
112 | |||
113 | switch (second.type_) | ||
114 | { | ||
115 | case type::singleton: | ||
116 | { | ||
117 | second.singleton_.pos = tempPos; | ||
118 | new(&second.singleton_.restriction) std::string(std::move(tempRestriction)); | ||
119 | |||
120 | break; | ||
121 | } | ||
122 | |||
123 | case type::group: | ||
124 | { | ||
125 | new(&second.group_.children) std::list<selrestr>(std::move(tempChildren)); | ||
126 | second.group_.orlogic = tempOrlogic; | ||
127 | |||
128 | break; | ||
129 | } | ||
130 | |||
131 | case type::empty: | ||
132 | { | ||
133 | break; | ||
134 | } | ||
135 | } | ||
136 | } | ||
137 | |||
    // Destroys the active union member, if any. Non-trivial members must be
    // destroyed explicitly; the local using-declarations exist because the
    // pseudo-destructor-call syntax requires a simple type name.
    selrestr::~selrestr()
    {
      switch (type_)
      {
        case type::singleton:
        {
          using string_type = std::string;
          singleton_.restriction.~string_type();

          break;
        }

        case type::group:
        {
          using list_type = std::list<selrestr>;
          group_.children.~list_type();

          break;
        }

        case type::empty:
        {
          // No active member; nothing to destroy.
          break;
        }
      }
    }
164 | |||
    // Constructs an empty selrestr, representing no restriction at all.
    selrestr::selrestr() : type_(type::empty)
    {
    }
168 | |||
    // Constructs a singleton selrestr: a single named restriction together
    // with a positivity flag. The string member of the union must be created
    // with placement new.
    selrestr::selrestr(
      std::string restriction,
      bool pos) :
      type_(type::singleton)
    {
      new(&singleton_.restriction) std::string(std::move(restriction));
      singleton_.pos = pos;
    }
177 | |||
178 | std::string selrestr::getRestriction() const | ||
179 | { | ||
180 | if (type_ == type::singleton) | ||
181 | { | ||
182 | return singleton_.restriction; | ||
183 | } else { | ||
184 | throw std::domain_error("Only singleton selrestrs have restrictions"); | ||
185 | } | ||
186 | } | ||
187 | |||
188 | bool selrestr::getPos() const | ||
189 | { | ||
190 | if (type_ == type::singleton) | ||
191 | { | ||
192 | return singleton_.pos; | ||
193 | } else { | ||
194 | throw std::domain_error("Only singleton selrestrs have positivity flags"); | ||
195 | } | ||
196 | } | ||
197 | |||
    // Constructs a group selrestr from a list of child restrictions; orlogic
    // selects how the children combine (true = OR, false = AND, matching the
    // "or"/"and" strings emitted by toJson). The list member of the union
    // must be created with placement new.
    selrestr::selrestr(
      std::list<selrestr> children,
      bool orlogic) :
      type_(type::group)
    {
      new(&group_.children) std::list<selrestr>(std::move(children));
      group_.orlogic = orlogic;
    }
206 | |||
207 | std::list<selrestr> selrestr::getChildren() const | ||
208 | { | ||
209 | if (type_ == type::group) | ||
210 | { | ||
211 | return group_.children; | ||
212 | } else { | ||
213 | throw std::domain_error("Only group selrestrs have children"); | ||
214 | } | ||
215 | } | ||
216 | |||
217 | std::list<selrestr>::const_iterator selrestr::begin() const | ||
218 | { | ||
219 | if (type_ == type::group) | ||
220 | { | ||
221 | return std::begin(group_.children); | ||
222 | } else { | ||
223 | throw std::domain_error("Only group selrestrs have children"); | ||
224 | } | ||
225 | } | ||
226 | |||
227 | std::list<selrestr>::const_iterator selrestr::end() const | ||
228 | { | ||
229 | if (type_ == type::group) | ||
230 | { | ||
231 | return std::end(group_.children); | ||
232 | } else { | ||
233 | throw std::domain_error("Only group selrestrs have children"); | ||
234 | } | ||
235 | } | ||
236 | |||
237 | bool selrestr::getOrlogic() const | ||
238 | { | ||
239 | if (type_ == type::group) | ||
240 | { | ||
241 | return group_.orlogic; | ||
242 | } else { | ||
243 | throw std::domain_error("Only group selrestrs have logic"); | ||
244 | } | ||
245 | } | ||
246 | |||
247 | nlohmann::json selrestr::toJson() const | ||
248 | { | ||
249 | switch (type_) | ||
250 | { | ||
251 | case type::empty: | ||
252 | { | ||
253 | return {}; | ||
254 | } | ||
255 | |||
256 | case type::singleton: | ||
257 | { | ||
258 | return { | ||
259 | {"type", singleton_.restriction}, | ||
260 | {"pos", singleton_.pos} | ||
261 | }; | ||
262 | } | ||
263 | |||
264 | case type::group: | ||
265 | { | ||
266 | std::string logic; | ||
267 | if (group_.orlogic) | ||
268 | { | ||
269 | logic = "or"; | ||
270 | } else { | ||
271 | logic = "and"; | ||
272 | } | ||
273 | |||
274 | std::list<nlohmann::json> children; | ||
275 | std::transform(std::begin(group_.children), std::end(group_.children), std::back_inserter(children), [] (const selrestr& child) { | ||
276 | return child.toJson(); | ||
277 | }); | ||
278 | |||
279 | return { | ||
280 | {"logic", logic}, | ||
281 | {"children", children} | ||
282 | }; | ||
283 | } | ||
284 | } | ||
285 | } | ||
286 | |||
287 | }; | ||
288 | }; | ||
diff --git a/generator/selrestr.h b/generator/selrestr.h new file mode 100644 index 0000000..5000970 --- /dev/null +++ b/generator/selrestr.h | |||
@@ -0,0 +1,88 @@ | |||
1 | #ifndef SELRESTR_H_50652FB7 | ||
2 | #define SELRESTR_H_50652FB7 | ||
3 | |||
4 | #include <list> | ||
5 | #include <string> | ||
6 | #include <json.hpp> | ||
7 | |||
8 | namespace verbly { | ||
9 | namespace generator { | ||
10 | |||
    // A selectional restriction, as attached to verb frame roles (see role.h).
    // Implemented as a hand-rolled tagged union with three states:
    //   - empty:     no restriction at all;
    //   - singleton: a single named restriction plus a positivity flag;
    //   - group:     a list of child selrestrs combined with AND or OR logic.
    class selrestr {
    public:
      // Discriminant identifying which union member below is active.
      enum class type {
        empty,
        singleton,
        group
      };

      // Copy and move constructors

      selrestr(const selrestr& other);
      selrestr(selrestr&& other);

      // Assignment

      // Unified copy/move assignment (copy-and-swap; parameter by value).
      selrestr& operator=(selrestr other);

      // Swap

      friend void swap(selrestr& first, selrestr& second);

      // Destructor

      // Destroys the active union member, if any.
      ~selrestr();

      // Generic accessors

      // Returns which of the three states this selrestr is in.
      type getType() const
      {
        return type_;
      }

      // Empty

      // Constructs an empty selrestr (no restriction).
      selrestr();

      // Singleton

      // Constructs a single named restriction with a positivity flag.
      selrestr(std::string restriction, bool pos);

      // Singleton accessors; throw std::domain_error unless getType() is
      // type::singleton.
      std::string getRestriction() const;

      bool getPos() const;

      // Group

      // Constructs a group of child restrictions; orlogic selects OR (true)
      // or AND (false) combination.
      selrestr(std::list<selrestr> children, bool orlogic);

      // Group accessors; throw std::domain_error unless getType() is
      // type::group.
      std::list<selrestr> getChildren() const;

      std::list<selrestr>::const_iterator begin() const;

      std::list<selrestr>::const_iterator end() const;

      bool getOrlogic() const;

      // Helpers

      // Serializes to JSON: null for empty, {"type","pos"} for a singleton,
      // {"logic","children"} for a group.
      nlohmann::json toJson() const;

    private:
      // Untagged union holding the state-specific payload. Only the member
      // selected by type_ is alive at any time; lifetimes are managed
      // manually by the constructors, destructor, and swap.
      union {
        struct {
          bool pos;
          std::string restriction;
        } singleton_;
        struct {
          std::list<selrestr> children;
          bool orlogic;
        } group_;
      };
      type type_;
    };
84 | |||
85 | }; | ||
86 | }; | ||
87 | |||
88 | #endif /* end of include guard: SELRESTR_H_50652FB7 */ | ||
diff --git a/generator/word.cpp b/generator/word.cpp new file mode 100644 index 0000000..8ba3ce2 --- /dev/null +++ b/generator/word.cpp | |||
@@ -0,0 +1,77 @@ | |||
1 | #include "word.h" | ||
2 | #include <list> | ||
3 | #include <string> | ||
4 | #include "database.h" | ||
5 | #include "notion.h" | ||
6 | #include "lemma.h" | ||
7 | #include "field.h" | ||
8 | #include "group.h" | ||
9 | |||
10 | namespace verbly { | ||
11 | namespace generator { | ||
12 | |||
13 | int word::nextId_ = 0; | ||
14 | |||
    // Constructs a word linking a notion (concept) to a lemma, with no tag
    // count. Assigns the next sequential id.
    word::word(
      notion& n,
      lemma& l) :
      id_(nextId_++),
      notion_(n),
      lemma_(l)
    {
    }
23 | |||
    // Constructs a word linking a notion to a lemma with a tag count; the
    // count is recorded and hasTagCount() will subsequently return true.
    word::word(
      notion& n,
      lemma& l,
      int tagCount) :
      id_(nextId_++),
      notion_(n),
      lemma_(l),
      tagCount_(tagCount),
      hasTagCount_(true)
    {
    }
35 | |||
    // Sets the syntactic position of this adjective. Only serialized for
    // words whose notion is an adjective (see operator<< below).
    void word::setAdjectivePosition(positioning adjectivePosition)
    {
      adjectivePosition_ = adjectivePosition;
    }
40 | |||
    // Associates this word with a verb group. Stores a non-owning pointer;
    // the group must outlive this word.
    void word::setVerbGroup(const group& verbGroup)
    {
      verbGroup_ = &verbGroup;
    }
45 | |||
46 | database& operator<<(database& db, const word& arg) | ||
47 | { | ||
48 | std::list<field> fields; | ||
49 | |||
50 | fields.emplace_back("word_id", arg.getId()); | ||
51 | fields.emplace_back("notion_id", arg.getNotion().getId()); | ||
52 | fields.emplace_back("lemma_id", arg.getLemma().getId()); | ||
53 | |||
54 | if (arg.hasTagCount()) | ||
55 | { | ||
56 | fields.emplace_back("tag_count", arg.getTagCount()); | ||
57 | } | ||
58 | |||
59 | if ((arg.getNotion().getPartOfSpeech() == part_of_speech::adjective) | ||
60 | && (arg.getAdjectivePosition() != positioning::undefined)) | ||
61 | { | ||
62 | fields.emplace_back("position", static_cast<int>(arg.getAdjectivePosition())); | ||
63 | } | ||
64 | |||
65 | if ((arg.getNotion().getPartOfSpeech() == part_of_speech::verb) | ||
66 | && (arg.hasVerbGroup())) | ||
67 | { | ||
68 | fields.emplace_back("group_id", arg.getVerbGroup().getId()); | ||
69 | } | ||
70 | |||
71 | db.insertIntoTable("words", std::move(fields)); | ||
72 | |||
73 | return db; | ||
74 | } | ||
75 | |||
76 | }; | ||
77 | }; | ||
diff --git a/generator/word.h b/generator/word.h new file mode 100644 index 0000000..bfed586 --- /dev/null +++ b/generator/word.h | |||
@@ -0,0 +1,110 @@ | |||
1 | #ifndef WORD_H_91F99D46 | ||
2 | #define WORD_H_91F99D46 | ||
3 | |||
4 | #include <cassert> | ||
5 | #include "enums.h" | ||
6 | |||
7 | namespace verbly { | ||
8 | namespace generator { | ||
9 | |||
10 | class notion; | ||
11 | class lemma; | ||
12 | class database; | ||
13 | class group; | ||
14 | |||
15 | class word { | ||
16 | public: | ||
17 | |||
18 | // Constructors | ||
19 | |||
20 | word(notion& n, lemma& l); | ||
21 | |||
22 | word(notion& n, lemma& l, int tagCount); | ||
23 | |||
24 | // Mutators | ||
25 | |||
26 | void setAdjectivePosition(positioning adjectivePosition); | ||
27 | |||
28 | void setVerbGroup(const group& verbGroup); | ||
29 | |||
30 | // Accessors | ||
31 | |||
32 | int getId() const | ||
33 | { | ||
34 | return id_; | ||
35 | } | ||
36 | |||
37 | notion& getNotion() | ||
38 | { | ||
39 | return notion_; | ||
40 | } | ||
41 | |||
42 | const notion& getNotion() const | ||
43 | { | ||
44 | return notion_; | ||
45 | } | ||
46 | |||
47 | lemma& getLemma() | ||
48 | { | ||
49 | return lemma_; | ||
50 | } | ||
51 | |||
52 | const lemma& getLemma() const | ||
53 | { | ||
54 | return lemma_; | ||
55 | } | ||
56 | |||
57 | bool hasTagCount() const | ||
58 | { | ||
59 | return hasTagCount_; | ||
60 | } | ||
61 | |||
62 | int getTagCount() const | ||
63 | { | ||
64 | // Calling code should always call hasTagCount first. | ||
65 | assert(hasTagCount_); | ||
66 | |||
67 | return tagCount_; | ||
68 | } | ||
69 | |||
70 | positioning getAdjectivePosition() const | ||
71 | { | ||
72 | return adjectivePosition_; | ||
73 | } | ||
74 | |||
75 | bool hasVerbGroup() const | ||
76 | { | ||
77 | return (verbGroup_ != nullptr); | ||
78 | } | ||
79 | |||
80 | const group& getVerbGroup() const | ||
81 | { | ||
82 | // Calling code should always call hasVerbGroup first. | ||
83 | assert(verbGroup_ != nullptr); | ||
84 | |||
85 | return *verbGroup_; | ||
86 | } | ||
87 | |||
88 | private: | ||
89 | |||
90 | static int nextId_; | ||
91 | |||
92 | const int id_; | ||
93 | notion& notion_; | ||
94 | lemma& lemma_; | ||
95 | const int tagCount_ = 0; | ||
96 | const bool hasTagCount_ = false; | ||
97 | |||
98 | positioning adjectivePosition_ = positioning::undefined; | ||
99 | const group* verbGroup_ = nullptr; | ||
100 | |||
101 | }; | ||
102 | |||
103 | // Serializer | ||
104 | |||
105 | database& operator<<(database& db, const word& arg); | ||
106 | |||
107 | }; | ||
108 | }; | ||
109 | |||
110 | #endif /* end of include guard: WORD_H_91F99D46 */ | ||