summary refs log tree commit diff stats
path: root/generator
diff options
context:
space:
mode:
authorKelly Rauchenberger <fefferburbia@gmail.com>2017-01-16 18:02:50 -0500
committerKelly Rauchenberger <fefferburbia@gmail.com>2017-01-16 18:02:50 -0500
commit6746da6edd7d9d50efe374eabbb79a3cac882d81 (patch)
treeff20917e08b08d36b9541c1371106596e7bec442 /generator
parent4af7e55733098ca42f75a4ffaca1b0f6bab4dd36 (diff)
downloadverbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.tar.gz
verbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.tar.bz2
verbly-6746da6edd7d9d50efe374eabbb79a3cac882d81.zip
Started structural rewrite
The new object structure was designed to build on the existing WordNet
structure, while also adding in all of the data that we get from other sources.
More information about this can be found on the project wiki.

The generator has already been completely rewritten to generate a
datafile that uses the new structure. In addition, a number of indexes
are created, which does double the size of the datafile, but also allows
for much faster lookups. Finally, the new generator is written modularly
and is a lot more readable than the old one.

The verbly interface to the new object structure has mostly been
completed, but has not been tested fully. There is a completely new
search API which utilizes a lot of operator overloading; documentation
on how to use it should go up at some point.

Token processing and verb frames are currently unimplemented. The source for
these has been left in the repository for now.
Diffstat (limited to 'generator')
-rw-r--r--generator/CMakeLists.txt6
-rw-r--r--generator/database.cpp173
-rw-r--r--generator/database.h73
-rw-r--r--generator/field.cpp193
-rw-r--r--generator/field.h76
-rw-r--r--generator/form.cpp53
-rw-r--r--generator/form.h71
-rw-r--r--generator/frame.cpp83
-rw-r--r--generator/frame.h59
-rw-r--r--generator/generator.cpp3145
-rw-r--r--generator/generator.h151
-rw-r--r--generator/group.cpp119
-rw-r--r--generator/group.h80
-rw-r--r--generator/lemma.cpp65
-rw-r--r--generator/lemma.h58
-rw-r--r--generator/main.cpp40
-rw-r--r--generator/notion.cpp85
-rw-r--r--generator/notion.h91
-rw-r--r--generator/part.cpp336
-rw-r--r--generator/part.h114
-rw-r--r--generator/progress.h78
-rw-r--r--generator/pronunciation.cpp87
-rw-r--r--generator/pronunciation.h82
-rw-r--r--generator/role.h35
-rw-r--r--generator/schema.sql352
-rw-r--r--generator/selrestr.cpp288
-rw-r--r--generator/selrestr.h88
-rw-r--r--generator/word.cpp77
-rw-r--r--generator/word.h110
29 files changed, 4018 insertions, 2250 deletions
diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt index 552526d..4f78eb8 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt
@@ -1,12 +1,12 @@
1cmake_minimum_required (VERSION 2.6) 1cmake_minimum_required (VERSION 3.1)
2project (generator) 2project (generator)
3 3
4find_package(PkgConfig) 4find_package(PkgConfig)
5pkg_check_modules(sqlite3 sqlite3 REQUIRED) 5pkg_check_modules(sqlite3 sqlite3 REQUIRED)
6find_package(libxml2 REQUIRED) 6find_package(libxml2 REQUIRED)
7 7
8include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json/src) 8include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json)
9add_executable(generator generator.cpp) 9add_executable(generator notion.cpp word.cpp lemma.cpp form.cpp pronunciation.cpp group.cpp frame.cpp part.cpp selrestr.cpp database.cpp field.cpp generator.cpp main.cpp)
10set_property(TARGET generator PROPERTY CXX_STANDARD 11) 10set_property(TARGET generator PROPERTY CXX_STANDARD 11)
11set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) 11set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON)
12target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) 12target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES})
diff --git a/generator/database.cpp b/generator/database.cpp new file mode 100644 index 0000000..c7e4cfa --- /dev/null +++ b/generator/database.cpp
@@ -0,0 +1,173 @@
1#include "database.h"
2#include <sqlite3.h>
3#include <cassert>
4#include <fstream>
5#include <stdexcept>
6#include <cstdio>
7#include <sstream>
8#include "field.h"
9#include "../lib/util.h"
10
namespace verbly {
  namespace generator {

    // Builds the exception text by combining a description of the failed
    // operation with the error string reported by SQLite.
    sqlite3_error::sqlite3_error(
      const std::string& what,
      const std::string& db_err) :
        what_(what + " (" + db_err + ")"),
        db_err_(db_err)
    {
    }

    // Combined "operation (sqlite message)" text.
    const char* sqlite3_error::what() const noexcept
    {
      return what_.c_str();
    }

    // Just the SQLite-reported error string.
    const char* sqlite3_error::db_err() const noexcept
    {
      return db_err_.c_str();
    }

    // Creates a fresh datafile at the given path. Any existing file is
    // deleted first, because SQLITE_OPEN_CREATE would otherwise open and
    // reuse the old database rather than start from scratch.
    database::database(std::string path)
    {
      // If there is already a file at this path, overwrite it.
      if (std::ifstream(path))
      {
        if (std::remove(path.c_str()))
        {
          throw std::logic_error("Could not overwrite file at path");
        }
      }

      if (sqlite3_open_v2(path.c_str(), &ppdb_, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK)
      {
        // We still have to free the resources allocated. In the event that
        // allocation failed, ppdb will be null and sqlite3_close_v2 will just
        // ignore it.
        std::string errmsg(sqlite3_errmsg(ppdb_));
        sqlite3_close_v2(ppdb_);

        throw sqlite3_error("Could not create output datafile", errmsg);
      }
    }

    // Move construction: start with a null handle, then trade states with
    // the source.
    database::database(database&& other) : database()
    {
      swap(*this, other);
    }

    // Move assignment via swap; the moved-from object receives this
    // object's old handle and closes it when it is destroyed.
    database& database::operator=(database&& other)
    {
      swap(*this, other);

      return *this;
    }

    // Exchanges the underlying sqlite3 handles.
    void swap(database& first, database& second)
    {
      std::swap(first.ppdb_, second.ppdb_);
    }

    // Closing a null handle is a no-op, so this is safe on moved-from
    // objects.
    database::~database()
    {
      sqlite3_close_v2(ppdb_);
    }

    // Executes a single self-contained SQL statement (no parameters, no
    // result rows expected). Throws sqlite3_error on failure.
    void database::runQuery(std::string query)
    {
      // This can only happen when doing bad things with move semantics.
      assert(ppdb_ != nullptr);

      sqlite3_stmt* ppstmt;

      if (sqlite3_prepare_v2(ppdb_, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
      {
        throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_));
      }

      // Finalize before checking the result so the statement is released on
      // the error path as well.
      int result = sqlite3_step(ppstmt);
      sqlite3_finalize(ppstmt);

      if (result != SQLITE_DONE)
      {
        throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_));
      }
    }

    // Inserts one row into the named table, binding each field's value as a
    // positional statement parameter. Throws sqlite3_error on failure.
    void database::insertIntoTable(std::string table, std::list<field> fields)
    {
      // This can only happen when doing bad things with move semantics.
      assert(ppdb_ != nullptr);

      // This shouldn't happen.
      assert(!fields.empty());

      // Build "INSERT INTO t (a, b) VALUES (?, ?)" with one placeholder per
      // field.
      std::list<std::string> fieldNames;
      std::list<std::string> qs;
      for (field& f : fields)
      {
        fieldNames.push_back(f.getName());
        qs.push_back("?");
      }

      std::ostringstream query;
      query << "INSERT INTO ";
      query << table;
      query << " (";
      query << implode(std::begin(fieldNames), std::end(fieldNames), ", ");
      query << ") VALUES (";
      query << implode(std::begin(qs), std::end(qs), ", ");
      query << ")";

      std::string query_str = query.str();

      sqlite3_stmt* ppstmt;

      if (sqlite3_prepare_v2(ppdb_, query_str.c_str(), query_str.length(), &ppstmt, NULL) != SQLITE_OK)
      {
        throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_));
      }

      // Bind each field to its (1-based) parameter slot.
      // NOTE(review): the sqlite3_bind_* return codes are ignored here; a
      // bind failure would only surface later via sqlite3_step.
      int i = 1;
      for (field& f : fields)
      {
        switch (f.getType())
        {
          case field::type::integer:
          {
            sqlite3_bind_int(ppstmt, i, f.getInteger());

            break;
          }

          case field::type::string:
          {
            // SQLITE_TRANSIENT makes SQLite copy the string, since the
            // temporary returned by getString() dies before the step.
            sqlite3_bind_text(ppstmt, i, f.getString().c_str(), f.getString().length(), SQLITE_TRANSIENT);

            break;
          }

          case field::type::invalid:
          {
            // Fields can only be invalid when doing bad things with move semantics.
            assert(false);

            break;
          }
        }

        i++;
      }

      int result = sqlite3_step(ppstmt);
      sqlite3_finalize(ppstmt);

      if (result != SQLITE_DONE)
      {
        throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_));
      }
    }

  };
};
diff --git a/generator/database.h b/generator/database.h new file mode 100644 index 0000000..15cdff5 --- /dev/null +++ b/generator/database.h
@@ -0,0 +1,73 @@
1#ifndef DATABASE_H_0B0A47D2
2#define DATABASE_H_0B0A47D2
3
4#include <string>
5#include <exception>
6#include <list>
7
8struct sqlite3;
9
namespace verbly {
  namespace generator {

    class field;

    // Exception thrown when an SQLite operation fails; carries both a
    // description of the failed operation and SQLite's own error string.
    class sqlite3_error : public std::exception {
    public:

      sqlite3_error(const std::string& what, const std::string& db_err);

      // Combined "operation (sqlite message)" text.
      const char* what() const noexcept override;

      // Just the SQLite-reported error string.
      const char* db_err() const noexcept;

    private:
      std::string what_;
      std::string db_err_;

    };

    // RAII wrapper around the output SQLite datafile. Move-only: copying is
    // disabled because the handle has unique ownership.
    class database {
    public:

      // Constructor

      // Creates a fresh datafile at path, deleting any existing file first;
      // throws sqlite3_error if the database cannot be opened.
      explicit database(std::string path);

      // Disable copying

      database(const database& other) = delete;
      database& operator=(const database& other) = delete;

      // Move constructor and move assignment

      database(database&& other);
      database& operator=(database&& other);

      // Swap

      friend void swap(database& first, database& second);

      // Destructor

      ~database();

      // Actions

      // Executes a single parameterless SQL statement; throws sqlite3_error
      // on failure.
      void runQuery(std::string query);

      // Inserts one row into the named table, binding each field as a
      // statement parameter; throws sqlite3_error on failure.
      void insertIntoTable(std::string table, std::list<field> fields);

    private:

      // Constructs an empty (null-handle) object; used by the move
      // operations.
      database()
      {
      }

      sqlite3* ppdb_ = nullptr;

    };

  };
};
72
73#endif /* end of include guard: DATABASE_H_0B0A47D2 */
diff --git a/generator/field.cpp b/generator/field.cpp new file mode 100644 index 0000000..84b2f91 --- /dev/null +++ b/generator/field.cpp
@@ -0,0 +1,193 @@
1#include "field.h"
2#include <stdexcept>
3#include <utility>
4
5namespace verbly {
6 namespace generator {
7
8 field::field(const field& other)
9 {
10 type_ = other.type_;
11 name_ = other.name_;
12
13 switch (type_)
14 {
15 case type::integer:
16 {
17 integer_ = other.integer_;
18
19 break;
20 }
21
22 case type::string:
23 {
24 new(&string_) std::string(other.string_);
25
26 break;
27 }
28
29 case type::invalid:
30 {
31 break;
32 }
33 }
34 }
35
36 field::field(field&& other) : field()
37 {
38 swap(*this, other);
39 }
40
41 field& field::operator=(field other)
42 {
43 swap(*this, other);
44
45 return *this;
46 }
47
48 void swap(field& first, field& second)
49 {
50 using type = field::type;
51
52 type tempType = first.type_;
53 std::string tempName = std::move(first.name_);
54 int tempInteger;
55 std::string tempString;
56
57 switch (first.type_)
58 {
59 case type::integer:
60 {
61 tempInteger = first.integer_;
62
63 break;
64 }
65
66 case type::string:
67 {
68 tempString = std::move(tempString);
69
70 break;
71 }
72
73 case type::invalid:
74 {
75 break;
76 }
77 }
78
79 first.~field();
80
81 first.type_ = second.type_;
82 first.name_ = std::move(second.name_);
83
84 switch (second.type_)
85 {
86 case type::integer:
87 {
88 first.integer_ = second.integer_;
89
90 break;
91 }
92
93 case type::string:
94 {
95 new(&first.string_) std::string(std::move(second.string_));
96
97 break;
98 }
99
100 case type::invalid:
101 {
102 break;
103 }
104 }
105
106 second.~field();
107
108 second.type_ = tempType;
109 second.name_ = std::move(tempName);
110
111 switch (tempType)
112 {
113 case type::integer:
114 {
115 second.integer_ = tempInteger;
116
117 break;
118 }
119
120 case type::string:
121 {
122 new(&second.string_) std::string(std::move(tempString));
123
124 break;
125 }
126
127 case type::invalid:
128 {
129 break;
130 }
131 }
132 }
133
134 field::~field()
135 {
136 switch (type_)
137 {
138 case type::string:
139 {
140 using string_type = std::string;
141 string_.~string_type();
142
143 break;
144 }
145
146 case type::integer:
147 case type::invalid:
148 {
149 break;
150 }
151 }
152 }
153
154 field::field(
155 std::string name,
156 int arg) :
157 type_(type::integer),
158 name_(name),
159 integer_(arg)
160 {
161 }
162
163 int field::getInteger() const
164 {
165 if (type_ != type::integer)
166 {
167 throw std::domain_error("field::getInteger called on non-integer field");
168 }
169
170 return integer_;
171 }
172
173 field::field(
174 std::string name,
175 std::string arg) :
176 type_(type::string),
177 name_(name)
178 {
179 new(&string_) std::string(arg);
180 }
181
182 std::string field::getString() const
183 {
184 if (type_ != type::string)
185 {
186 throw std::domain_error("field::getString called on non-string field");
187 }
188
189 return string_;
190 }
191
192 };
193};
diff --git a/generator/field.h b/generator/field.h new file mode 100644 index 0000000..1fbabfc --- /dev/null +++ b/generator/field.h
@@ -0,0 +1,76 @@
1#ifndef BINDING_H_CAE0B18E
2#define BINDING_H_CAE0B18E
3
4#include <string>
5
namespace verbly {
  namespace generator {

    // A named value destined for an SQLite statement parameter: either an
    // integer or a string, plus the column name it binds to. Implemented as
    // a tagged union so that string storage is only constructed when needed.
    class field {
    public:
      // Which union member (if any) is active.
      enum class type {
        invalid,
        integer,
        string
      };

      // Copy and move constructors

      field(const field& other);
      field(field&& other);

      // Assignment (unified copy-and-swap; parameter taken by value)

      field& operator=(field other);

      // Swap

      friend void swap(field& first, field& second);

      // Destructor

      ~field();

      // Generic accessors

      type getType() const
      {
        return type_;
      }

      std::string getName() const
      {
        return name_;
      }

      // Integer

      field(std::string name, int arg);

      int getInteger() const;

      // String

      field(std::string name, std::string arg);

      std::string getString() const;

    private:

      // Constructs an invalid field; used internally by the move operations.
      field()
      {
      }

      // Only the member selected by type_ is alive at any time.
      union {
        int integer_;
        std::string string_;
      };

      type type_ = type::invalid;
      std::string name_;
    };

  };
};
75
76#endif /* end of include guard: BINDING_H_CAE0B18E */
diff --git a/generator/form.cpp b/generator/form.cpp new file mode 100644 index 0000000..6be9d47 --- /dev/null +++ b/generator/form.cpp
@@ -0,0 +1,53 @@
1#include "form.h"
2#include <algorithm>
3#include <list>
4#include "database.h"
5#include "field.h"
6#include "pronunciation.h"
7
8namespace verbly {
9 namespace generator {
10
11 int form::nextId_ = 0;
12
13 form::form(std::string text) :
14 id_(nextId_++),
15 text_(text),
16 complexity_(std::count(std::begin(text), std::end(text), ' ') + 1),
17 proper_(std::any_of(std::begin(text), std::end(text), std::isupper))
18 {
19 }
20
21 void form::addPronunciation(const pronunciation& p)
22 {
23 pronunciations_.insert(&p);
24 }
25
26 database& operator<<(database& db, const form& arg)
27 {
28 // Serialize the form first.
29 {
30 std::list<field> fields;
31 fields.emplace_back("form_id", arg.getId());
32 fields.emplace_back("form", arg.getText());
33 fields.emplace_back("complexity", arg.getComplexity());
34 fields.emplace_back("proper", arg.isProper());
35
36 db.insertIntoTable("forms", std::move(fields));
37 }
38
39 // Then, serialize the form/pronunciation relationship.
40 for (const pronunciation* p : arg.getPronunciations())
41 {
42 std::list<field> fields;
43 fields.emplace_back("form_id", arg.getId());
44 fields.emplace_back("pronunciation_id", p->getId());
45
46 db.insertIntoTable("forms_pronunciations", std::move(fields));
47 }
48
49 return db;
50 }
51
52 };
53};
diff --git a/generator/form.h b/generator/form.h new file mode 100644 index 0000000..5576035 --- /dev/null +++ b/generator/form.h
@@ -0,0 +1,71 @@
1#ifndef FORM_H_7EFBC970
2#define FORM_H_7EFBC970
3
4#include <string>
5#include <set>
6
namespace verbly {
  namespace generator {

    class pronunciation;
    class database;

    // A surface form: the written text of a word, together with attributes
    // derived from it and the pronunciations associated with it.
    class form {
    public:

      // Constructor

      explicit form(std::string text);

      // Mutators

      // Stores a pointer to the pronunciation; it must outlive the form.
      void addPronunciation(const pronunciation& p);

      // Accessors

      // Unique sequential id assigned at construction.
      int getId() const
      {
        return id_;
      }

      std::string getText() const
      {
        return text_;
      }

      // Number of space-separated words in the text.
      int getComplexity() const
      {
        return complexity_;
      }

      // True if the text contains any uppercase character.
      bool isProper() const
      {
        return proper_;
      }

      std::set<const pronunciation*> getPronunciations() const
      {
        return pronunciations_;
      }

    private:

      static int nextId_;

      const int id_;
      const std::string text_;
      const int complexity_;
      const bool proper_;

      std::set<const pronunciation*> pronunciations_;

    };

    // Serializer

    database& operator<<(database& db, const form& arg);

  };
};
70
71#endif /* end of include guard: FORM_H_7EFBC970 */
diff --git a/generator/frame.cpp b/generator/frame.cpp new file mode 100644 index 0000000..9f0653f --- /dev/null +++ b/generator/frame.cpp
@@ -0,0 +1,83 @@
1#include "frame.h"
2#include "database.h"
3#include "field.h"
4
namespace verbly {
  namespace generator {

    int frame::nextId_ = 0;

    // Assigns each frame a unique sequential id.
    frame::frame() : id_(nextId_++)
    {
    }

    // Appends a part to the end of this frame's syntax.
    void frame::push_back(part fp)
    {
      parts_.push_back(std::move(fp));
    }

    // Serializes the frame as one row: its id plus a JSON array describing
    // each part in order. Only the data relevant to each part type is
    // emitted.
    // NOTE(review): this translation unit uses nlohmann::json and assert but
    // includes neither json.hpp nor <cassert> directly -- presumably they
    // arrive transitively via part.h; verify.
    database& operator<<(database& db, const frame& arg)
    {
      std::list<field> fields;
      fields.emplace_back("frame_id", arg.getId());

      nlohmann::json jsonParts;
      for (const part& p : arg)
      {
        nlohmann::json jsonPart;
        jsonPart["type"] = static_cast<int>(p.getType());

        switch (p.getType())
        {
          case part::type::noun_phrase:
          {
            jsonPart["role"] = p.getNounRole();
            jsonPart["selrestrs"] = p.getNounSelrestrs().toJson();
            jsonPart["synrestrs"] = p.getNounSynrestrs();

            break;
          }

          case part::type::preposition:
          {
            jsonPart["choices"] = p.getPrepositionChoices();
            jsonPart["literal"] = p.isPrepositionLiteral();

            break;
          }

          case part::type::literal:
          {
            jsonPart["value"] = p.getLiteralValue();

            break;
          }

          // These part types carry no extra data beyond their type tag.
          case part::type::verb:
          case part::type::adjective:
          case part::type::adverb:
          {
            break;
          }

          case part::type::invalid:
          {
            // Invalid parts should not be serialized.
            assert(false);

            break;
          }
        }

        jsonParts.emplace_back(std::move(jsonPart));
      }

      fields.emplace_back("data", jsonParts.dump());

      db.insertIntoTable("frames", std::move(fields));

      return db;
    }

  };
};
diff --git a/generator/frame.h b/generator/frame.h new file mode 100644 index 0000000..411ce6c --- /dev/null +++ b/generator/frame.h
@@ -0,0 +1,59 @@
1#ifndef FRAME_H_26770FF1
2#define FRAME_H_26770FF1
3
4#include <list>
5#include "part.h"
6
namespace verbly {
  namespace generator {

    class database;

    // An ordered sequence of syntactic parts making up one VerbNet frame.
    class frame {
    public:

      // Aliases

      using const_iterator = std::list<part>::const_iterator;

      // Constructor

      frame();

      // Mutators

      // Appends a part to the end of the frame.
      void push_back(part fp);

      // Accessors

      // Unique sequential id assigned at construction.
      int getId() const
      {
        return id_;
      }

      const_iterator begin() const
      {
        return std::begin(parts_);
      }

      const_iterator end() const
      {
        return std::end(parts_);
      }

    private:

      static int nextId_;

      const int id_;

      std::list<part> parts_;

    };

    // Serializer

    database& operator<<(database& db, const frame& arg);

  };
};
58
59#endif /* end of include guard: FRAME_H_26770FF1 */
diff --git a/generator/generator.cpp b/generator/generator.cpp index 6a16467..d88cb31 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -1,2320 +1,1477 @@
1#include <libxml/parser.h> 1#include "generator.h"
2#include <cassert>
3#include <stdexcept>
2#include <iostream> 4#include <iostream>
5#include <regex>
3#include <dirent.h> 6#include <dirent.h>
4#include <set>
5#include <map>
6#include <string>
7#include <vector>
8#include <fstream> 7#include <fstream>
9#include <sqlite3.h> 8#include "enums.h"
10#include <sstream>
11#include <regex>
12#include <list>
13#include <algorithm>
14#include <json.hpp>
15#include "progress.h" 9#include "progress.h"
10#include "selrestr.h"
11#include "role.h"
12#include "part.h"
13#include "field.h"
16#include "../lib/util.h" 14#include "../lib/util.h"
17 15
18using json = nlohmann::json; 16namespace verbly {
19 17 namespace generator {
20struct verb_t {
21 std::string infinitive;
22 std::string past_tense;
23 std::string past_participle;
24 std::string ing_form;
25 std::string s_form;
26 int id;
27};
28
29struct adjective_t {
30 std::string base;
31 std::string comparative;
32 std::string superlative;
33};
34
35struct noun_t {
36 std::string singular;
37 std::string plural;
38};
39
40struct selrestr_t {
41 enum class type_t {
42 singleton,
43 andlogic,
44 orlogic,
45 empty
46 };
47 type_t type;
48 std::string restriction;
49 bool pos;
50 std::list<selrestr_t> subordinates;
51};
52
53struct framepart_t {
54 enum class type_t {
55 np,
56 v,
57 pp,
58 adj,
59 adv,
60 lex
61 };
62 type_t type;
63 std::string role;
64 selrestr_t selrestrs;
65 std::set<std::string> preprestrs;
66 std::set<std::string> synrestrs;
67 std::list<std::string> choices;
68 std::string lexval;
69};
70
71struct group_t {
72 std::string id;
73 std::string parent;
74 std::set<std::string> members;
75 std::map<std::string, selrestr_t> roles;
76 std::list<std::list<framepart_t>> frames;
77};
78
79struct pronunciation_t {
80 std::string phonemes;
81 std::string prerhyme;
82 std::string rhyme;
83 int syllables = 0;
84 std::string stress;
85
86 bool operator<(const pronunciation_t& other) const
87 {
88 return phonemes < other.phonemes;
89 }
90};
91
92std::map<std::string, group_t> groups;
93std::map<std::string, verb_t> verbs;
94std::map<std::string, adjective_t> adjectives;
95std::map<std::string, noun_t> nouns;
96std::map<int, std::map<int, int>> wn;
97std::map<int, int> images;
98std::map<std::string, std::set<pronunciation_t>> pronunciations;
99
100void print_usage()
101{
102 std::cout << "Verbly Datafile Generator" << std::endl;
103 std::cout << "-------------------------" << std::endl;
104 std::cout << "Requires exactly six arguments." << std::endl;
105 std::cout << "1. The path to a VerbNet data directory." << std::endl;
106 std::cout << "2. The path to an AGID infl.txt file." << std::endl;
107 std::cout << "3. The path to a WordNet prolog data directory." << std::endl;
108 std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl;
109 std::cout << "5. The path to an ImageNet urls.txt file." << std::endl;
110 std::cout << "6. Datafile output path." << std::endl;
111
112 exit(1);
113}
114
115void db_error(sqlite3* ppdb, std::string query)
116{
117 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
118 std::cout << query << std::endl;
119 sqlite3_close_v2(ppdb);
120 print_usage();
121}
122
123json export_selrestrs(selrestr_t r)
124{
125 if (r.type == selrestr_t::type_t::empty)
126 {
127 return {};
128 } else if (r.type == selrestr_t::type_t::singleton)
129 {
130 json result;
131 result["type"] = r.restriction;
132 result["pos"] = r.pos;
133 return result;
134 } else {
135 json result;
136 if (r.type == selrestr_t::type_t::andlogic)
137 {
138 result["logic"] = "and";
139 } else {
140 result["logic"] = "or";
141 }
142
143 std::list<json> outlist;
144 std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs);
145 result["children"] = outlist;
146 18
147 return result; 19 generator::generator(
148 } 20 std::string verbNetPath,
149} 21 std::string agidPath,
150 22 std::string wordNetPath,
151selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) 23 std::string cmudictPath,
152{ 24 std::string imageNetPath,
153 selrestr_t r; 25 std::string outputPath) :
154 xmlChar* key; 26 verbNetPath_(verbNetPath),
155 27 agidPath_(agidPath),
156 if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) 28 wordNetPath_(wordNetPath),
157 { 29 cmudictPath_(cmudictPath),
158 if (xmlChildElementCount(top) == 0) 30 imageNetPath_(imageNetPath),
31 db_(outputPath)
159 { 32 {
160 r.type = selrestr_t::type_t::empty; 33 // Ensure VerbNet directory exists
161 } else if (xmlChildElementCount(top) == 1) 34 DIR* dir;
162 { 35 if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
163 r = parse_selrestrs(xmlFirstElementChild(top), filename);
164 } else {
165 r.type = selrestr_t::type_t::andlogic;
166
167 if (xmlHasProp(top, (const xmlChar*) "logic"))
168 { 36 {
169 key = xmlGetProp(top, (const xmlChar*) "logic"); 37 throw std::invalid_argument("Invalid VerbNet data directory");
170 if (!xmlStrcmp(key, (const xmlChar*) "or"))
171 {
172 r.type = selrestr_t::type_t::orlogic;
173 }
174 xmlFree(key);
175 } 38 }
176 39
177 for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) 40 closedir(dir);
41
42 // Ensure AGID infl.txt exists
43 if (!std::ifstream(agidPath_))
178 { 44 {
179 if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) 45 throw std::invalid_argument("AGID infl.txt file not found");
180 {
181 r.subordinates.push_back(parse_selrestrs(selrestr, filename));
182 }
183 } 46 }
184 } 47
185 } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) 48 // Add directory separator to WordNet path
186 { 49 if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\'))
187 r.type = selrestr_t::type_t::singleton;
188
189 key = xmlGetProp(top, (xmlChar*) "Value");
190 r.pos = (std::string((const char*)key) == "+");
191 xmlFree(key);
192
193 key = xmlGetProp(top, (xmlChar*) "type");
194 r.restriction = (const char*) key;
195 xmlFree(key);
196 } else {
197 // Invalid
198 std::cout << "Bad VerbNet file format: " << filename << std::endl;
199 print_usage();
200 }
201
202 return r;
203}
204
205group_t& parse_group(xmlNodePtr top, std::string filename)
206{
207 xmlChar* key = xmlGetProp(top, (xmlChar*) "ID");
208 if (key == 0)
209 {
210 std::cout << "Bad VerbNet file format: " << filename << std::endl;
211 print_usage();
212 }
213 std::string vnid = (const char*)key;
214 vnid = vnid.substr(vnid.find_first_of("-")+1);
215 xmlFree(key);
216
217 group_t g;
218 g.id = vnid;
219
220 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
221 {
222 if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES"))
223 {
224 for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next)
225 { 50 {
226 if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) 51 wordNetPath_ += '/';
227 {
228 auto& sg = parse_group(subclass, filename);
229 sg.parent = vnid;
230
231 for (auto member : sg.members)
232 {
233 g.members.insert(member);
234 }
235
236 // The schema requires that subclasses appear after role definitions, so we can do this now
237 for (auto role : g.roles)
238 {
239 if (sg.roles.count(role.first) == 0)
240 {
241 sg.roles[role.first] = role.second;
242 }
243 }
244 }
245 } 52 }
246 } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) 53
247 { 54 // Ensure WordNet tables exist
248 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) 55 for (std::string table : {
56 "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax"
57 })
249 { 58 {
250 if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) 59 if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl"))
251 { 60 {
252 key = xmlGetProp(member, (xmlChar*) "name"); 61 throw std::invalid_argument("WordNet " + table + " table not found");
253 g.members.insert((const char*)key);
254 xmlFree(key);
255 } 62 }
256 } 63 }
257 } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) 64
258 { 65 // Ensure CMUDICT file exists
259 for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) 66 if (!std::ifstream(cmudictPath_))
260 { 67 {
261 if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) 68 throw std::invalid_argument("CMUDICT file not found");
262 {
263 selrestr_t r;
264 r.type = selrestr_t::type_t::empty;
265
266 key = xmlGetProp(role, (const xmlChar*) "type");
267 std::string type = (const char*)key;
268 xmlFree(key);
269
270 for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
271 {
272 if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS"))
273 {
274 r = parse_selrestrs(rolenode, filename);
275 }
276 }
277
278 g.roles[type] = r;
279 }
280 } 69 }
281 } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) 70
282 { 71 // Ensure ImageNet urls.txt exists
283 for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) 72 if (!std::ifstream(imageNetPath_))
284 { 73 {
285 if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) 74 throw std::invalid_argument("ImageNet urls.txt file not found");
286 {
287 std::list<framepart_t> f;
288
289 for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
290 {
291 if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX"))
292 {
293 for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
294 {
295 framepart_t fp;
296
297 if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP"))
298 {
299 fp.type = framepart_t::type_t::np;
300
301 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
302 fp.role = (const char*)key;
303 xmlFree(key);
304
305 fp.selrestrs.type = selrestr_t::type_t::empty;
306
307 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
308 {
309 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS"))
310 {
311 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
312 {
313 if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR"))
314 {
315 key = xmlGetProp(synrestr, (xmlChar*) "type");
316 fp.synrestrs.insert(std::string((const char*)key));
317 xmlFree(key);
318 }
319 }
320 }
321
322 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS"))
323 {
324 fp.selrestrs = parse_selrestrs(npnode, filename);
325 }
326 }
327 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB"))
328 {
329 fp.type = framepart_t::type_t::v;
330 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP"))
331 {
332 fp.type = framepart_t::type_t::pp;
333
334 if (xmlHasProp(syntaxnode, (xmlChar*) "value"))
335 {
336 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
337 std::string choices = (const char*)key;
338 xmlFree(key);
339
340 fp.choices = verbly::split<std::list<std::string>>(choices, " ");
341 }
342
343 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
344 {
345 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS"))
346 {
347 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
348 {
349 if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR"))
350 {
351 key = xmlGetProp(synrestr, (xmlChar*) "type");
352 fp.preprestrs.insert(std::string((const char*)key));
353 xmlFree(key);
354 }
355 }
356 }
357 }
358 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ"))
359 {
360 fp.type = framepart_t::type_t::adj;
361 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV"))
362 {
363 fp.type = framepart_t::type_t::adv;
364 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX"))
365 {
366 fp.type = framepart_t::type_t::lex;
367
368 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
369 fp.lexval = (const char*)key;
370 xmlFree(key);
371 } else {
372 continue;
373 }
374
375 f.push_back(fp);
376 }
377
378 g.frames.push_back(f);
379 }
380 }
381 }
382 } 75 }
383 } 76 }
384 }
385
386 groups[vnid] = g;
387
388 return groups[vnid];
389}
390
391int main(int argc, char** argv)
392{
393 if (argc != 7)
394 {
395 print_usage();
396 }
397
398 // VerbNet data
399 std::cout << "Reading verb frames..." << std::endl;
400
401 DIR* dir;
402 if ((dir = opendir(argv[1])) == nullptr)
403 {
404 std::cout << "Invalid VerbNet data directory." << std::endl;
405
406 print_usage();
407 }
408
409 struct dirent* ent;
410 while ((ent = readdir(dir)) != nullptr)
411 {
412 std::string filename(argv[1]);
413 if (filename.back() != '/')
414 {
415 filename += '/';
416 }
417 77
// Executes the full datafile generation pipeline. The steps are strictly
// ordered: each reader depends on the objects created by the readers before
// it (e.g. inflections and pronunciations only attach to already-created
// lemmas/forms), and the relationship passes at the end run against the
// already-dumped objects.
void generator::run()
{
  // Create notions, words, lemmas, and forms from WordNet synsets
  readWordNetSynsets();

  // Reads adjective positioning WordNet data
  readAdjectivePositioning();

  // Counts the number of URLs ImageNet has per notion
  readImageNetUrls();

  // Creates a word by WordNet sense key lookup table
  readWordNetSenseKeys();

  // Creates groups and frames from VerbNet data
  readVerbNet();

  // Creates forms and inflections from AGID. To reduce the amount of forms
  // created, we do this after most lemmas that need inflecting have been
  // created through other means, and then only generate forms for
  // inflections of already-existing lemmas. The exception to this regards
  // verb lemmas. If a verb lemma in AGID either does not exist yet, or does
  // exist but is not related to any words that are related to verb notions,
  // then a notion and a word is generated and the form generation proceeds
  // as usual.
  readAgidInflections();

  // Reads in prepositions and the is_a relationship
  readPrepositions();

  // Creates pronunciations from CMUDICT. To reduce the amount of
  // pronunciations created, we do this after all forms have been created,
  // and then only generate pronunciations for already-existing forms.
  readCmudictPronunciations();

  // Writes the database schema
  writeSchema();

  // Dumps data to the database
  dumpObjects();

  // Populates the antonymy relationship from WordNet
  readWordNetAntonymy();

  // Populates the variation relationship from WordNet
  readWordNetVariation();

  // Populates the usage, topicality, and regionality relationships from
  // WordNet
  readWordNetClasses();

  // Populates the causality relationship from WordNet
  readWordNetCausality();

  // Populates the entailment relationship from WordNet
  readWordNetEntailment();

  // Populates the hypernymy relationship from WordNet
  readWordNetHypernymy();

  // Populates the instantiation relationship from WordNet
  readWordNetInstantiation();

  // Populates the member meronymy relationship from WordNet
  readWordNetMemberMeronymy();

  // Populates the part meronymy relationship from WordNet
  readWordNetPartMeronymy();

  // Populates the substance meronymy relationship from WordNet
  readWordNetSubstanceMeronymy();

  // Populates the pertainymy and mannernymy relationships from WordNet
  readWordNetPertainymy();

  // Populates the specification relationship from WordNet
  readWordNetSpecification();

  // Populates the adjective similarity relationship from WordNet
  readWordNetSimilarity();
}
500 167
501 switch (type) 168 void generator::readWordNetSynsets()
502 { 169 {
503 case 'V': 170 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
171 progress ppgs("Reading synsets from WordNet...", lines.size());
172
173 for (std::string line : lines)
504 { 174 {
505 verb_t v; 175 ppgs.update();
506 v.infinitive = word; 176
507 if (forms.size() == 4) 177 std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");
508 { 178 std::smatch relation_data;
509 v.past_tense = forms[0]; 179 if (!std::regex_search(line, relation_data, relation))
510 v.past_participle = forms[1]; 180 {
511 v.ing_form = forms[2]; 181 continue;
512 v.s_form = forms[3];
513 } else if (forms.size() == 3)
514 {
515 v.past_tense = forms[0];
516 v.past_participle = forms[0];
517 v.ing_form = forms[1];
518 v.s_form = forms[2];
519 } else if (forms.size() == 8)
520 {
521 // As of AGID 2014.08.11, this is only "to be"
522 v.past_tense = forms[0];
523 v.past_participle = forms[2];
524 v.ing_form = forms[3];
525 v.s_form = forms[4];
526 } else {
527 // Words that don't fit the cases above as of AGID 2014.08.11:
528 // - may and shall do not conjugate the way we want them to
529 // - methinks only has a past tense and is an outlier
530 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
531 std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl;
532 } 182 }
533 183
534 verbs[word] = v; 184 int synset_id = std::stoi(relation_data[1]);
535 185 int wnum = std::stoi(relation_data[2]);
536 break; 186 std::string text = relation_data[3];
537 } 187 int tag_count = std::stoi(relation_data[4]);
538 188 size_t word_it;
539 case 'A': 189 while ((word_it = text.find("''")) != std::string::npos)
540 {
541 adjective_t adj;
542 adj.base = word;
543 if (forms.size() == 2)
544 { 190 {
545 adj.comparative = forms[0]; 191 text.erase(word_it, 1);
546 adj.superlative = forms[1];
547 } else {
548 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
549 std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl;
550 } 192 }
551 193
552 adjectives[word] = adj; 194 // The WordNet data does contain duplicates, so we need to check that we
553 195 // haven't already created this word.
554 break; 196 std::pair<int, int> lookup(synset_id, wnum);
555 } 197 if (!wordByWnidAndWnum_.count(lookup))
556
557 case 'N':
558 {
559 noun_t n;
560 n.singular = word;
561 if (forms.size() == 1)
562 { 198 {
563 n.plural = forms[0]; 199 notion& synset = lookupOrCreateNotion(synset_id);
564 } else { 200 lemma& lex = lookupOrCreateLemma(text);
565 // As of AGID 2014.08.11, this is non-existent. 201 word& entry = createWord(synset, lex, tag_count);
566 std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; 202
203 wordByWnidAndWnum_[lookup] = &entry;
567 } 204 }
568
569 nouns[word] = n;
570
571 break;
572 } 205 }
573 } 206 }
574 }
575
576 // Pronounciations
577 std::cout << "Reading pronunciations..." << std::endl;
578
579 std::ifstream pronfile(argv[4]);
580 if (!pronfile.is_open())
581 {
582 std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl;
583 print_usage();
584 }
585
586 for (;;)
587 {
588 std::string line;
589 if (!getline(pronfile, line))
590 {
591 break;
592 }
593
594 if (line.back() == '\r')
595 {
596 line.pop_back();
597 }
598 207
599 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); 208 void generator::readAdjectivePositioning()
600 std::smatch phoneme_data;
601 if (std::regex_search(line, phoneme_data, phoneme))
602 { 209 {
603 std::string canonical(phoneme_data[1]); 210 std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl"));
604 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); 211 progress ppgs("Reading adjective positionings from WordNet...", lines.size());
605
606 std::string phonemes = phoneme_data[2];
607 auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " ");
608 auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) {
609 return phoneme.find("1") != std::string::npos;
610 });
611 212
612 pronunciation_t p; 213 for (std::string line : lines)
613 p.phonemes = phonemes;
614
615 // Rhyme detection
616 if (phemstrt != std::end(phoneme_set))
617 { 214 {
618 std::stringstream rhymer; 215 ppgs.update();
619 for (auto it = phemstrt; it != std::end(phoneme_set); it++)
620 {
621 std::string naked;
622 std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) {
623 return isdigit(ch);
624 });
625
626 if (it != phemstrt)
627 {
628 rhymer << " ";
629 }
630
631 rhymer << naked;
632 }
633 216
634 p.rhyme = rhymer.str(); 217 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\.");
635 218 std::smatch relation_data;
636 if (phemstrt != std::begin(phoneme_set)) 219 if (!std::regex_search(line, relation_data, relation))
637 { 220 {
638 phemstrt--; 221 continue;
639 p.prerhyme = *phemstrt;
640 } else {
641 p.prerhyme = "";
642 } 222 }
643 } else {
644 p.prerhyme = "";
645 p.rhyme = "";
646 }
647 223
648 // Syllable/stress 224 int synset_id = stoi(relation_data[1]);
649 for (auto phm : phoneme_set) 225 int wnum = stoi(relation_data[2]);
650 { 226 std::string adjpos_str = relation_data[3];
651 if (isdigit(phm.back()))
652 {
653 // It's a vowel!
654 p.syllables++;
655 227
656 if (phm.back() == '1') 228 std::pair<int, int> lookup(synset_id, wnum);
229 if (wordByWnidAndWnum_.count(lookup))
230 {
231 word& adj = *wordByWnidAndWnum_.at(lookup);
232
233 if (adjpos_str == "p")
234 {
235 adj.setAdjectivePosition(positioning::predicate);
236 } else if (adjpos_str == "a")
237 {
238 adj.setAdjectivePosition(positioning::attributive);
239 } else if (adjpos_str == "i")
657 { 240 {
658 p.stress.push_back('1'); 241 adj.setAdjectivePosition(positioning::postnominal);
659 } else { 242 } else {
660 p.stress.push_back('0'); 243 // Can't happen because of how we specified the regex.
244 assert(false);
661 } 245 }
662 } 246 }
663 } 247 }
664
665 pronunciations[canonical].insert(p);
666 }
667 }
668
669 // Images
670 std::cout << "Reading images..." << std::endl;
671
672 std::ifstream imagefile(argv[5]);
673 if (!imagefile.is_open())
674 {
675 std::cout << "Could not open ImageNet file: " << argv[5] << std::endl;
676 print_usage();
677 }
678
679 for (;;)
680 {
681 std::string line;
682 if (!getline(imagefile, line))
683 {
684 break;
685 }
686
687 if (line.back() == '\r')
688 {
689 line.pop_back();
690 }
691
692 std::string wnid_s = line.substr(1, 8);
693 int wnid = stoi(wnid_s) + 100000000;
694 images[wnid]++;
695 }
696
697 imagefile.close();
698
699 // Start writing output
700 std::cout << "Writing schema..." << std::endl;
701
702 sqlite3* ppdb;
703 if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK)
704 {
705 std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl;
706 print_usage();
707 }
708
709 std::ifstream schemafile("schema.sql");
710 if (!schemafile.is_open())
711 {
712 std::cout << "Could not find schema file" << std::endl;
713 print_usage();
714 }
715
716 std::stringstream schemabuilder;
717 for (;;)
718 {
719 std::string line;
720 if (!getline(schemafile, line))
721 {
722 break;
723 }
724
725 if (line.back() == '\r')
726 {
727 line.pop_back();
728 }
729
730 schemabuilder << line << std::endl;
731 }
732
733 std::string schema = schemabuilder.str();
734 while (!schema.empty())
735 {
736 std::string query;
737 int divider = schema.find(";");
738 if (divider != std::string::npos)
739 {
740 query = schema.substr(0, divider+1);
741 schema = schema.substr(divider+2);
742 } else {
743 break;
744 } 248 }
745 249
746 sqlite3_stmt* schmstmt; 250 void generator::readImageNetUrls()
747 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK)
748 { 251 {
749 db_error(ppdb, query); 252 // The ImageNet datafile is so large that it is unreasonable and
750 } 253 // unnecessary to read it into memory; instead, we will parse each line as
751 254 // we read it. This has the caveat that we cannot display a progress bar.
752 if (sqlite3_step(schmstmt) != SQLITE_DONE) 255 std::cout << "Reading image counts from ImageNet..." << std::endl;
753 {
754 db_error(ppdb, query);
755 }
756
757 sqlite3_finalize(schmstmt);
758 }
759
760 std::cout << "Writing prepositions..." << std::endl;
761 std::ifstream prepfile("prepositions.txt");
762 if (!prepfile.is_open())
763 {
764 std::cout << "Could not find prepositions file" << std::endl;
765 print_usage();
766 }
767
768 for (;;)
769 {
770 std::string line;
771 if (!getline(prepfile, line))
772 {
773 break;
774 }
775
776 if (line.back() == '\r')
777 {
778 line.pop_back();
779 }
780
781 std::regex relation("^([^:]+): (.+)");
782 std::smatch relation_data;
783 std::regex_search(line, relation_data, relation);
784 std::string prep = relation_data[1];
785 std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", ");
786
787 std::string query("INSERT INTO prepositions (form) VALUES (?)");
788 sqlite3_stmt* ppstmt;
789
790 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
791 {
792 db_error(ppdb, query);
793 }
794
795 sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT);
796
797 if (sqlite3_step(ppstmt) != SQLITE_DONE)
798 {
799 db_error(ppdb, query);
800 }
801
802 sqlite3_finalize(ppstmt);
803
804 query = "SELECT last_insert_rowid()";
805 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
806 {
807 db_error(ppdb, query);
808 }
809
810 if (sqlite3_step(ppstmt) != SQLITE_ROW)
811 {
812 db_error(ppdb, query);
813 }
814
815 int rowid = sqlite3_column_int(ppstmt, 0);
816 sqlite3_finalize(ppstmt);
817
818 for (auto group : groups)
819 {
820 query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)";
821 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
822 {
823 db_error(ppdb, query);
824 }
825 256
826 sqlite3_bind_int(ppstmt, 1, rowid); 257 std::ifstream file(imageNetPath_);
827 sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); 258 if (!file)
828
829 if (sqlite3_step(ppstmt) != SQLITE_DONE)
830 { 259 {
831 db_error(ppdb, query); 260 throw std::invalid_argument("Could not find file " + imageNetPath_);
832 } 261 }
833
834 sqlite3_finalize(ppstmt);
835 }
836 }
837
838 262
839 { 263 std::string line;
840 progress ppgs("Writing verbs...", verbs.size()); 264 while (std::getline(file, line))
841 for (auto& mapping : verbs)
842 {
843 sqlite3_stmt* ppstmt;
844 std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)");
845 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
846 {
847 db_error(ppdb, query);
848 }
849
850 sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT);
851 sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT);
852 sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT);
853 sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT);
854 sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT);
855
856 if (sqlite3_step(ppstmt) != SQLITE_DONE)
857 {
858 db_error(ppdb, query);
859 }
860
861 sqlite3_finalize(ppstmt);
862
863 std::string canonical(mapping.second.infinitive);
864 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
865 if (pronunciations.count(canonical) == 1)
866 { 265 {
867 query = "SELECT last_insert_rowid()"; 266 if (line.back() == '\r')
868 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
869 { 267 {
870 db_error(ppdb, query); 268 line.pop_back();
871 } 269 }
872 270
873 if (sqlite3_step(ppstmt) != SQLITE_ROW) 271 std::string wnid_s = line.substr(1, 8);
272 int wnid = stoi(wnid_s) + 100000000;
273 if (notionByWnid_.count(wnid))
874 { 274 {
875 db_error(ppdb, query); 275 // We know that this notion has a wnid and is a noun.
876 } 276 notionByWnid_.at(wnid)->incrementNumOfImages();
877
878 int rowid = sqlite3_column_int(ppstmt, 0);
879
880 sqlite3_finalize(ppstmt);
881
882 mapping.second.id = rowid;
883
884 for (auto pronunciation : pronunciations[canonical])
885 {
886 if (!pronunciation.rhyme.empty())
887 {
888 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)";
889 } else {
890 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)";
891 }
892
893 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
894 {
895 db_error(ppdb, query);
896 }
897
898 sqlite3_bind_int(ppstmt, 1, rowid);
899 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
900 sqlite3_bind_int(ppstmt, 3, pronunciation.syllables);
901 sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT);
902
903 if (!pronunciation.rhyme.empty())
904 {
905 sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
906 sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
907 }
908
909 if (sqlite3_step(ppstmt) != SQLITE_DONE)
910 {
911 db_error(ppdb, query);
912 }
913
914 sqlite3_finalize(ppstmt);
915 } 277 }
916 } 278 }
917
918 ppgs.update();
919 } 279 }
920 } 280
921 281 void generator::readWordNetSenseKeys()
922 {
923 progress ppgs("Writing verb frames...", groups.size());
924 for (auto& mapping : groups)
925 { 282 {
926 std::list<json> roledatal; 283 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl"));
927 std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { 284 progress ppgs("Reading sense keys from WordNet...", lines.size());
928 json role;
929 role["type"] = r.first;
930 role["selrestrs"] = export_selrestrs(r.second);
931
932 return role;
933 });
934
935 json roledata(roledatal);
936 std::string rdm = roledata.dump();
937
938 sqlite3_stmt* ppstmt;
939 std::string query("INSERT INTO groups (data) VALUES (?)");
940 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
941 {
942 db_error(ppdb, query);
943 }
944
945 sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT);
946
947 if (sqlite3_step(ppstmt) != SQLITE_DONE)
948 {
949 db_error(ppdb, query);
950 }
951 285
952 sqlite3_finalize(ppstmt); 286 for (std::string line : lines)
953
954 query = "SELECT last_insert_rowid()";
955 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
956 {
957 db_error(ppdb, query);
958 }
959
960 if (sqlite3_step(ppstmt) != SQLITE_ROW)
961 {
962 db_error(ppdb, query);
963 }
964
965 int gid = sqlite3_column_int(ppstmt, 0);
966 sqlite3_finalize(ppstmt);
967
968 for (auto frame : mapping.second.frames)
969 { 287 {
970 std::list<json> fdatap; 288 ppgs.update();
971 std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) {
972 json part;
973
974 switch (fp.type)
975 {
976 case framepart_t::type_t::np:
977 {
978 part["type"] = "np";
979 part["role"] = fp.role;
980 part["selrestrs"] = export_selrestrs(fp.selrestrs);
981 part["synrestrs"] = fp.synrestrs;
982
983 break;
984 }
985
986 case framepart_t::type_t::pp:
987 {
988 part["type"] = "pp";
989 part["values"] = fp.choices;
990 part["preprestrs"] = fp.preprestrs;
991
992 break;
993 }
994
995 case framepart_t::type_t::v:
996 {
997 part["type"] = "v";
998
999 break;
1000 }
1001
1002 case framepart_t::type_t::adj:
1003 {
1004 part["type"] = "adj";
1005
1006 break;
1007 }
1008
1009 case framepart_t::type_t::adv:
1010 {
1011 part["type"] = "adv";
1012
1013 break;
1014 }
1015
1016 case framepart_t::type_t::lex:
1017 {
1018 part["type"] = "lex";
1019 part["value"] = fp.lexval;
1020
1021 break;
1022 }
1023 }
1024
1025 return part;
1026 });
1027
1028 json fdata(fdatap);
1029 std::string marshall = fdata.dump();
1030
1031 query = "INSERT INTO frames (group_id, data) VALUES (?, ?)";
1032 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1033 {
1034 db_error(ppdb, query);
1035 }
1036
1037 sqlite3_bind_int(ppstmt, 1, gid);
1038 sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT);
1039 289
1040 if (sqlite3_step(ppstmt) != SQLITE_DONE) 290 // We only actually need to lookup verbs by sense key so we'll just
291 // ignore everything that isn't a verb.
292 std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$");
293 std::smatch relation_data;
294 if (!std::regex_search(line, relation_data, relation))
1041 { 295 {
1042 db_error(ppdb, query); 296 continue;
1043 } 297 }
298
299 int synset_id = stoi(relation_data[1]);
300 int wnum = stoi(relation_data[2]);
301 std::string sense_key = relation_data[3];
1044 302
1045 sqlite3_finalize(ppstmt); 303 // We are treating this mapping as injective, which is not entirely
1046 } 304 // accurate. First, the WordNet table contains duplicate rows, so those
1047 305 // need to be ignored. More importantly, a small number of sense keys
1048 for (auto member : mapping.second.members) 306 // (one for each letter of the Latin alphabet, plus 9 other words) each
1049 { 307 // map to two different words in the same synset which differ only by
1050 if (verbs.count(member) == 1) 308 // capitalization. Luckily, none of these exceptions are verbs, so we
309 // can pretend that the mapping is injective.
310 if (!wnSenseKeys_.count(sense_key))
1051 { 311 {
1052 auto& v = verbs[member]; 312 std::pair<int, int> lookup(synset_id, wnum);
1053 313 if (wordByWnidAndWnum_.count(lookup))
1054 query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)";
1055 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1056 {
1057 db_error(ppdb, query);
1058 }
1059
1060 sqlite3_bind_int(ppstmt, 1, v.id);
1061 sqlite3_bind_int(ppstmt, 2, gid);
1062
1063 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1064 { 314 {
1065 db_error(ppdb, query); 315 wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup);
1066 } 316 }
1067
1068 sqlite3_finalize(ppstmt);
1069 } 317 }
1070 } 318 }
1071
1072 ppgs.update();
1073 } 319 }
1074 } 320
1075 321 void generator::readVerbNet()
1076 // Get nouns/adjectives/adverbs from WordNet
1077 // Useful relations:
1078 // - s: master list
1079 // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness)
1080 // - at: variation (e.g. a measurement can be standard or nonstandard)
1081 // - der: derivation (e.g. happy/happily, happily/happy)
1082 // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue)
1083 // - ins: instantiation (do we need this? let's see)
1084 // - mm: member meronymy/holonymy (e.g. family/mother, family/child)
1085 // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire)
1086 // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber)
1087 // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska)
1088 // mannernymy (e.g. something done quickly is done in a manner that is quick)
1089 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
1090 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
1091 // - syntax: positioning flags for some adjectives
1092 std::string wnpref {argv[3]};
1093 if (wnpref.back() != '/')
1094 {
1095 wnpref += '/';
1096 }
1097
1098 // s table
1099 {
1100 std::ifstream wnsfile(wnpref + "wn_s.pl");
1101 if (!wnsfile.is_open())
1102 { 322 {
1103 std::cout << "Invalid WordNet data directory." << std::endl; 323 std::cout << "Reading frames from VerbNet..." << std::endl;
1104 print_usage();
1105 }
1106 324
1107 std::list<std::string> lines; 325 DIR* dir;
1108 for (;;) 326 if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
1109 {
1110 std::string line;
1111 if (!getline(wnsfile, line))
1112 { 327 {
1113 break; 328 throw std::invalid_argument("Invalid VerbNet data directory");
1114 } 329 }
1115 330
1116 if (line.back() == '\r') 331 struct dirent* ent;
1117 { 332 while ((ent = readdir(dir)) != nullptr)
1118 line.pop_back();
1119 }
1120
1121 lines.push_back(line);
1122 }
1123
1124 progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size());
1125 for (auto line : lines)
1126 {
1127 ppgs.update();
1128
1129 std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$");
1130 std::smatch relation_data;
1131 if (!std::regex_search(line, relation_data, relation))
1132 { 333 {
1133 continue; 334 std::string filename(verbNetPath_);
1134 } 335
336 if (filename.back() != '/')
337 {
338 filename += '/';
339 }
1135 340
1136 int synset_id = stoi(relation_data[1]); 341 filename += ent->d_name;
1137 int wnum = stoi(relation_data[2]);
1138 std::string word = relation_data[3];
1139 size_t word_it;
1140 while ((word_it = word.find("''")) != std::string::npos)
1141 {
1142 word.erase(word_it, 1);
1143 }
1144 342
1145 std::string query; 343 if (filename.rfind(".xml") != filename.size() - 4)
1146 switch (synset_id / 100000000)
1147 {
1148 case 1: // Noun
1149 { 344 {
1150 if (nouns.count(word) == 1) 345 continue;
1151 {
1152 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)";
1153 } else {
1154 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)";
1155 }
1156
1157 break;
1158 } 346 }
1159 347
1160 case 2: // Verb 348 xmlDocPtr doc = xmlParseFile(filename.c_str());
349 if (doc == nullptr)
1161 { 350 {
1162 // Ignore 351 throw std::logic_error("Error opening " + filename);
1163
1164 break;
1165 } 352 }
1166 353
1167 case 3: // Adjective 354 xmlNodePtr top = xmlDocGetRootElement(doc);
355 if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS"))))
1168 { 356 {
1169 if (adjectives.count(word) == 1) 357 throw std::logic_error("Bad VerbNet file format: " + filename);
1170 {
1171 query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)";
1172 } else {
1173 query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)";
1174 }
1175
1176 break;
1177 } 358 }
1178 359
1179 case 4: // Adverb 360 try
1180 { 361 {
1181 if (adjectives.count(word) == 1) 362 createGroup(top);
1182 { 363 } catch (const std::exception& e)
1183 query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; 364 {
1184 } else { 365 std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename));
1185 query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)";
1186 }
1187
1188 break;
1189 } 366 }
1190 } 367 }
368
369 closedir(dir);
370 }
1191 371
1192 sqlite3_stmt* ppstmt; 372 void generator::readAgidInflections()
1193 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) 373 {
374 std::list<std::string> lines(readFile(agidPath_));
375 progress ppgs("Reading inflections from AGID...", lines.size());
376
377 for (std::string line : lines)
1194 { 378 {
1195 db_error(ppdb, query); 379 ppgs.update();
1196 } 380
381 int divider = line.find_first_of(" ");
382 std::string infinitive = line.substr(0, divider);
383 line = line.substr(divider+1);
384 char type = line[0];
1197 385
1198 sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); 386 if (line[1] == '?')
1199 switch (synset_id / 100000000)
1200 {
1201 case 1: // Noun
1202 { 387 {
1203 sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { 388 line.erase(0, 4);
1204 return isupper(ch); 389 } else {
1205 }) ? 1 : 0)); 390 line.erase(0, 3);
1206
1207 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size());
1208 sqlite3_bind_int(ppstmt, 4, images[synset_id]);
1209 sqlite3_bind_int(ppstmt, 5, synset_id);
1210
1211 if (nouns.count(word) == 1)
1212 {
1213 sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT);
1214 }
1215
1216 break;
1217 } 391 }
1218 392
1219 case 3: // Adjective 393 if (!lemmaByBaseForm_.count(infinitive) && (type != 'V'))
1220 case 4: // Adverb
1221 { 394 {
1222 sqlite3_bind_int(ppstmt, 2, verbly::split<std::list<std::string>>(word, " ").size()); 395 continue;
1223 396 }
1224 if (adjectives.count(word) == 1) 397
398 lemma& curLemma = lookupOrCreateLemma(infinitive);
399
400 auto forms = split<std::vector<std::string>>(line, " | ");
401 for (std::string& inflForm : forms)
402 {
403 int sympos = inflForm.find_first_of(",?");
404 if (sympos != std::string::npos)
1225 { 405 {
1226 sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); 406 inflForm = inflForm.substr(0, sympos);
1227 sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT);
1228 } 407 }
1229
1230 break;
1231 } 408 }
1232 }
1233 409
1234 if (sqlite3_step(ppstmt) != SQLITE_DONE) 410 switch (type)
1235 {
1236 db_error(ppdb, query);
1237 }
1238
1239 sqlite3_finalize(ppstmt);
1240
1241 query = "SELECT last_insert_rowid()";
1242 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1243 {
1244 db_error(ppdb, query);
1245 }
1246
1247 if (sqlite3_step(ppstmt) != SQLITE_ROW)
1248 {
1249 db_error(ppdb, query);
1250 }
1251
1252 int rowid = sqlite3_column_int(ppstmt, 0);
1253 wn[synset_id][wnum] = rowid;
1254
1255 sqlite3_finalize(ppstmt);
1256
1257 std::string canonical(word);
1258 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
1259 if (pronunciations.count(canonical) == 1)
1260 {
1261 for (auto pronunciation : pronunciations[canonical])
1262 { 411 {
1263 switch (synset_id / 100000000) 412 case 'V':
1264 { 413 {
1265 case 1: // Noun 414 if (forms.size() == 4)
1266 { 415 {
1267 if (!pronunciation.rhyme.empty()) 416 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
1268 { 417 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1]));
1269 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; 418 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2]));
1270 } else { 419 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3]));
1271 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; 420 } else if (forms.size() == 3)
1272 }
1273
1274 break;
1275 }
1276
1277 case 3: // Adjective
1278 { 421 {
1279 if (!pronunciation.rhyme.empty()) 422 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
1280 { 423 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0]));
1281 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; 424 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1]));
1282 } else { 425 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2]));
1283 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; 426 } else if (forms.size() == 8)
1284 } 427 {
1285 428 // As of AGID 2014.08.11, this is only "to be"
1286 break; 429 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
430 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2]));
431 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3]));
432 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4]));
433 } else {
434 // Words that don't fit the cases above as of AGID 2014.08.11:
435 // - may and shall do not conjugate the way we want them to
436 // - methinks only has a past tense and is an outlier
437 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
438 std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1287 } 439 }
1288 440
1289 case 4: // Adverb 441 // For verbs in particular, we sometimes create a notion and a word
442 // from inflection data. Specifically, if there are not yet any
443 // verbs existing that have the same infinitive form. "Yet" means
444 // that this verb appears in the AGID data but not in either WordNet
445 // or VerbNet.
446 if (!wordsByBaseForm_.count(infinitive)
447 || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) {
448 return w->getNotion().getPartOfSpeech() == part_of_speech::verb;
449 }))
1290 { 450 {
1291 if (!pronunciation.rhyme.empty()) 451 notion& n = createNotion(part_of_speech::verb);
1292 { 452 createWord(n, curLemma);
1293 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)";
1294 } else {
1295 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)";
1296 }
1297
1298 break;
1299 } 453 }
1300 }
1301
1302 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1303 {
1304 db_error(ppdb, query);
1305 }
1306
1307 sqlite3_bind_int(ppstmt, 1, rowid);
1308 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
1309 sqlite3_bind_int(ppstmt, 3, pronunciation.syllables);
1310 sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT);
1311
1312 if (!pronunciation.rhyme.empty())
1313 {
1314 sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
1315 sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
1316 }
1317 454
1318 if (sqlite3_step(ppstmt) != SQLITE_DONE) 455 break;
1319 {
1320 db_error(ppdb, query);
1321 } 456 }
1322
1323 sqlite3_finalize(ppstmt);
1324 }
1325 }
1326 }
1327 }
1328
1329 // While we're working on s
1330 {
1331 progress ppgs("Writing word synonyms...", wn.size());
1332 for (auto sense : wn)
1333 {
1334 ppgs.update();
1335 457
1336 for (auto word1 : sense.second) 458 case 'A':
1337 {
1338 for (auto word2 : sense.second)
1339 {
1340 if (word1 != word2)
1341 { 459 {
1342 std::string query; 460 if (forms.size() == 2)
1343 switch (sense.first / 100000000)
1344 { 461 {
1345 case 1: // Noun 462 curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0]));
1346 { 463 curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1]));
1347 query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; 464 } else {
1348 465 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
1349 break; 466 std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1350 } 467 }
1351
1352 case 2: // Verb
1353 {
1354 // Ignore
1355
1356 break;
1357 }
1358
1359 case 3: // Adjective
1360 {
1361 query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
1362 468
1363 break; 469 break;
1364 } 470 }
1365 471
1366 case 4: // Adverb 472 case 'N':
1367 { 473 {
1368 query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; 474 if (forms.size() == 1)
1369
1370 break;
1371 }
1372 }
1373
1374 sqlite3_stmt* ppstmt;
1375 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1376 {
1377 db_error(ppdb, query);
1378 }
1379
1380 sqlite3_bind_int(ppstmt, 1, word1.second);
1381 sqlite3_bind_int(ppstmt, 2, word2.second);
1382
1383 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1384 { 475 {
1385 db_error(ppdb, query); 476 curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0]));
477 } else {
478 // As of AGID 2014.08.11, this is non-existent.
479 std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1386 } 480 }
1387 481
1388 sqlite3_finalize(ppstmt); 482 break;
1389 } 483 }
1390 } 484 }
1391 } 485 }
1392 } 486 }
1393 }
1394
1395 // ant table
1396 {
1397 std::ifstream wnantfile(wnpref + "wn_ant.pl");
1398 if (!wnantfile.is_open())
1399 {
1400 std::cout << "Invalid WordNet data directory." << std::endl;
1401 print_usage();
1402 }
1403
1404 std::list<std::string> lines;
1405 for (;;)
1406 {
1407 std::string line;
1408 if (!getline(wnantfile, line))
1409 {
1410 break;
1411 }
1412 487
1413 if (line.back() == '\r') 488 void generator::readPrepositions()
1414 {
1415 line.pop_back();
1416 }
1417
1418 lines.push_back(line);
1419 }
1420
1421 progress ppgs("Writing antonyms...", lines.size());
1422 for (auto line : lines)
1423 { 489 {
1424 ppgs.update(); 490 std::list<std::string> lines(readFile("prepositions.txt"));
491 progress ppgs("Reading prepositions...", lines.size());
1425 492
1426 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); 493 for (std::string line : lines)
1427 std::smatch relation_data;
1428 if (!std::regex_search(line, relation_data, relation))
1429 {
1430 continue;
1431 }
1432
1433 int synset_id_1 = stoi(relation_data[1]);
1434 int wnum_1 = stoi(relation_data[2]);
1435 int synset_id_2 = stoi(relation_data[3]);
1436 int wnum_2 = stoi(relation_data[4]);
1437
1438 std::string query;
1439 switch (synset_id_1 / 100000000)
1440 { 494 {
1441 case 1: // Noun 495 ppgs.update();
1442 {
1443 query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)";
1444 496
1445 break; 497 std::regex relation("^([^:]+): (.+)");
1446 } 498 std::smatch relation_data;
1447 499 std::regex_search(line, relation_data, relation);
1448 case 2: // Verb 500 std::string prep = relation_data[1];
1449 { 501 auto groups = split<std::list<std::string>>(relation_data[2], ", ");
1450 // Ignore
1451 502
1452 break; 503 notion& n = createNotion(part_of_speech::preposition);
1453 } 504 lemma& l = lookupOrCreateLemma(prep);
1454 505 word& w = createWord(n, l);
1455 case 3: // Adjective
1456 {
1457 query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
1458 506
1459 break; 507 n.setPrepositionGroups(groups);
1460 }
1461
1462 case 4: // Adverb
1463 {
1464 query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)";
1465
1466 break;
1467 }
1468 }
1469
1470 sqlite3_stmt* ppstmt;
1471 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1472 {
1473 db_error(ppdb, query);
1474 }
1475
1476 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1477 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1478
1479 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1480 {
1481 db_error(ppdb, query);
1482 }
1483
1484 sqlite3_finalize(ppstmt);
1485 }
1486 }
1487
1488 // at table
1489 {
1490 std::ifstream wnatfile(wnpref + "wn_at.pl");
1491 if (!wnatfile.is_open())
1492 {
1493 std::cout << "Invalid WordNet data directory." << std::endl;
1494 print_usage();
1495 }
1496
1497 std::list<std::string> lines;
1498 for (;;)
1499 {
1500 std::string line;
1501 if (!getline(wnatfile, line))
1502 {
1503 break;
1504 } 508 }
1505
1506 if (line.back() == '\r')
1507 {
1508 line.pop_back();
1509 }
1510
1511 lines.push_back(line);
1512 } 509 }
1513 510
1514 progress ppgs("Writing variations...", lines.size()); 511 void generator::readCmudictPronunciations()
1515 for (auto line : lines)
1516 { 512 {
1517 ppgs.update(); 513 std::list<std::string> lines(readFile(cmudictPath_));
514 progress ppgs("Reading pronunciations from CMUDICT...", lines.size());
1518 515
1519 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); 516 for (std::string line : lines)
1520 std::smatch relation_data;
1521 if (!std::regex_search(line, relation_data, relation))
1522 { 517 {
1523 continue; 518 ppgs.update();
1524 } 519
1525 520 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)");
1526 int synset_id_1 = stoi(relation_data[1]); 521 std::smatch phoneme_data;
1527 int synset_id_2 = stoi(relation_data[2]); 522 if (std::regex_search(line, phoneme_data, phoneme))
1528 std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)");
1529
1530 for (auto mapping1 : wn[synset_id_1])
1531 {
1532 for (auto mapping2 : wn[synset_id_2])
1533 { 523 {
1534 sqlite3_stmt* ppstmt; 524 std::string canonical(phoneme_data[1]);
1535 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 525 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
1536 {
1537 db_error(ppdb, query);
1538 }
1539
1540 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1541 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1542 526
1543 if (sqlite3_step(ppstmt) != SQLITE_DONE) 527 if (!formByText_.count(canonical))
1544 { 528 {
1545 db_error(ppdb, query); 529 continue;
1546 } 530 }
1547 531
1548 sqlite3_finalize(ppstmt); 532 std::string phonemes = phoneme_data[2];
533 pronunciations_.emplace_back(phonemes);
534 pronunciation& p = pronunciations_.back();
535 formByText_.at(canonical)->addPronunciation(p);
1549 } 536 }
1550 } 537 }
1551 } 538 }
1552 }
1553
1554 // der table
1555 {
1556 std::ifstream wnderfile(wnpref + "wn_der.pl");
1557 if (!wnderfile.is_open())
1558 {
1559 std::cout << "Invalid WordNet data directory." << std::endl;
1560 print_usage();
1561 }
1562 539
1563 std::list<std::string> lines; 540 void generator::writeSchema()
1564 for (;;)
1565 { 541 {
1566 std::string line; 542 std::ifstream file("schema.sql");
1567 if (!getline(wnderfile, line)) 543 if (!file)
1568 { 544 {
1569 break; 545 throw std::invalid_argument("Could not find database schema");
1570 } 546 }
1571 547
1572 if (line.back() == '\r') 548 std::ostringstream schemaBuilder;
549 std::string line;
550 while (std::getline(file, line))
1573 { 551 {
1574 line.pop_back(); 552 if (line.back() == '\r')
553 {
554 line.pop_back();
555 }
556
557 schemaBuilder << line;
1575 } 558 }
1576 559
1577 lines.push_back(line); 560 std::string schema = schemaBuilder.str();
561 auto queries = split<std::list<std::string>>(schema, ";");
562 progress ppgs("Writing database schema...", queries.size());
563 for (std::string query : queries)
564 {
565 if (!queries.empty())
566 {
567 db_.runQuery(query);
568 }
569
570 ppgs.update();
571 }
1578 } 572 }
1579 573
1580 progress ppgs("Writing morphological derivation...", lines.size()); 574 void generator::dumpObjects()
1581 for (auto line : lines)
1582 { 575 {
1583 ppgs.update();
1584
1585 std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
1586 std::smatch relation_data;
1587 if (!std::regex_search(line, relation_data, relation))
1588 { 576 {
1589 continue; 577 progress ppgs("Writing notions...", notions_.size());
578
579 for (notion& n : notions_)
580 {
581 db_ << n;
582
583 ppgs.update();
584 }
1590 } 585 }
1591 586
1592 int synset_id_1 = stoi(relation_data[1]);
1593 int wnum_1 = stoi(relation_data[2]);
1594 int synset_id_2 = stoi(relation_data[3]);
1595 int wnum_2 = stoi(relation_data[4]);
1596 std::string query;
1597 switch (synset_id_1 / 100000000)
1598 { 587 {
1599 case 1: // Noun 588 progress ppgs("Writing words...", words_.size());
589
590 for (word& w : words_)
1600 { 591 {
1601 switch (synset_id_2 / 100000000) 592 db_ << w;
1602 {
1603 case 1: // Noun
1604 {
1605 query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)";
1606 break;
1607 }
1608
1609 case 3: // Adjective
1610 {
1611 query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)";
1612 break;
1613 }
1614
1615 case 4: // Adverb
1616 {
1617 query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)";
1618 break;
1619 }
1620 }
1621 593
1622 break; 594 ppgs.update();
1623 } 595 }
596 }
597
598 {
599 progress ppgs("Writing lemmas...", lemmas_.size());
1624 600
1625 case 3: // Adjective 601 for (lemma& l : lemmas_)
1626 { 602 {
1627 switch (synset_id_2 / 100000000) 603 db_ << l;
1628 {
1629 case 1: // Noun
1630 {
1631 query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)";
1632 break;
1633 }
1634
1635 case 3: // Adjective
1636 {
1637 query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)";
1638 break;
1639 }
1640
1641 case 4: // Adverb
1642 {
1643 query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)";
1644 break;
1645 }
1646 }
1647 604
1648 break; 605 ppgs.update();
1649 } 606 }
607 }
608
609 {
610 progress ppgs("Writing forms...", forms_.size());
1650 611
1651 case 4: // Adverb 612 for (form& f : forms_)
1652 { 613 {
1653 switch (synset_id_2 / 100000000) 614 db_ << f;
1654 {
1655 case 1: // Noun
1656 {
1657 query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)";
1658 break;
1659 }
1660
1661 case 3: // Adjective
1662 {
1663 query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)";
1664 break;
1665 }
1666
1667 case 4: // Adverb
1668 {
1669 query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)";
1670 break;
1671 }
1672 }
1673 615
1674 break; 616 ppgs.update();
1675 } 617 }
1676 } 618 }
1677 619
1678 sqlite3_stmt* ppstmt;
1679 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1680 { 620 {
1681 db_error(ppdb, query); 621 progress ppgs("Writing pronunciations...", pronunciations_.size());
622
623 for (pronunciation& p : pronunciations_)
624 {
625 db_ << p;
626
627 ppgs.update();
628 }
1682 } 629 }
1683 630
1684 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1685 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1686
1687 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1688 { 631 {
1689 db_error(ppdb, query); 632 progress ppgs("Writing verb groups...", groups_.size());
633
634 for (group& g : groups_)
635 {
636 db_ << g;
637
638 ppgs.update();
639 }
1690 } 640 }
1691 641
1692 sqlite3_finalize(ppstmt);
1693 }
1694 }
1695
1696 // hyp table
1697 {
1698 std::ifstream wnhypfile(wnpref + "wn_hyp.pl");
1699 if (!wnhypfile.is_open())
1700 {
1701 std::cout << "Invalid WordNet data directory." << std::endl;
1702 print_usage();
1703 }
1704
1705 std::list<std::string> lines;
1706 for (;;)
1707 {
1708 std::string line;
1709 if (!getline(wnhypfile, line))
1710 {
1711 break;
1712 }
1713
1714 if (line.back() == '\r')
1715 { 642 {
1716 line.pop_back(); 643 progress ppgs("Writing verb frames...", frames_.size());
644
645 for (frame& f : frames_)
646 {
647 db_ << f;
648
649 ppgs.update();
650 }
1717 } 651 }
1718
1719 lines.push_back(line);
1720 } 652 }
1721 653
1722 progress ppgs("Writing hypernyms...", lines.size()); 654 void generator::readWordNetAntonymy()
1723 for (auto line : lines)
1724 { 655 {
1725 ppgs.update(); 656 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl"));
1726 657 progress ppgs("Writing antonyms...", lines.size());
1727 std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); 658 for (auto line : lines)
1728 std::smatch relation_data;
1729 if (!std::regex_search(line, relation_data, relation))
1730 { 659 {
1731 continue; 660 ppgs.update();
1732 }
1733
1734 int synset_id_1 = stoi(relation_data[1]);
1735 int synset_id_2 = stoi(relation_data[2]);
1736 std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)");
1737 661
1738 for (auto mapping1 : wn[synset_id_1]) 662 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
1739 { 663 std::smatch relation_data;
1740 for (auto mapping2 : wn[synset_id_2]) 664 if (!std::regex_search(line, relation_data, relation))
1741 { 665 {
1742 sqlite3_stmt* ppstmt; 666 continue;
1743 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 667 }
1744 { 668
1745 db_error(ppdb, query); 669 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
1746 } 670 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1747 671
1748 sqlite3_bind_int(ppstmt, 1, mapping1.second); 672 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1749 sqlite3_bind_int(ppstmt, 2, mapping2.second); 673 {
674 word& word1 = *wordByWnidAndWnum_.at(lookup1);
675 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1750 676
1751 if (sqlite3_step(ppstmt) != SQLITE_DONE) 677 std::list<field> fields;
1752 { 678 fields.emplace_back("antonym_1_id", word1.getId());
1753 db_error(ppdb, query); 679 fields.emplace_back("antonym_2_id", word2.getId());
1754 }
1755 680
1756 sqlite3_finalize(ppstmt); 681 db_.insertIntoTable("antonymy", std::move(fields));
1757 } 682 }
1758 } 683 }
1759 } 684 }
1760 }
1761
1762 // ins table
1763 {
1764 std::ifstream wninsfile(wnpref + "wn_ins.pl");
1765 if (!wninsfile.is_open())
1766 {
1767 std::cout << "Invalid WordNet data directory." << std::endl;
1768 print_usage();
1769 }
1770
1771 std::list<std::string> lines;
1772 for (;;)
1773 {
1774 std::string line;
1775 if (!getline(wninsfile, line))
1776 {
1777 break;
1778 }
1779 685
1780 if (line.back() == '\r') 686 void generator::readWordNetVariation()
687 {
688 std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl"));
689 progress ppgs("Writing variation...", lines.size());
690 for (auto line : lines)
1781 { 691 {
1782 line.pop_back(); 692 ppgs.update();
1783 }
1784 693
1785 lines.push_back(line); 694 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\.");
695 std::smatch relation_data;
696 if (!std::regex_search(line, relation_data, relation))
697 {
698 continue;
699 }
700
701 int lookup1 = std::stoi(relation_data[1]);
702 int lookup2 = std::stoi(relation_data[2]);
703
704 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
705 {
706 notion& notion1 = *notionByWnid_.at(lookup1);
707 notion& notion2 = *notionByWnid_.at(lookup2);
708
709 std::list<field> fields;
710 fields.emplace_back("noun_id", notion1.getId());
711 fields.emplace_back("adjective_id", notion2.getId());
712
713 db_.insertIntoTable("variation", std::move(fields));
714 }
715 }
1786 } 716 }
1787 717
1788 progress ppgs("Writing instantiations...", lines.size()); 718 void generator::readWordNetClasses()
1789 for (auto line : lines)
1790 { 719 {
1791 ppgs.update(); 720 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl"));
1792 721 progress ppgs("Writing usage, topicality, and regionality...", lines.size());
1793 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); 722 for (auto line : lines)
1794 std::smatch relation_data;
1795 if (!std::regex_search(line, relation_data, relation))
1796 { 723 {
1797 continue; 724 ppgs.update();
1798 }
1799
1800 int synset_id_1 = stoi(relation_data[1]);
1801 int synset_id_2 = stoi(relation_data[2]);
1802 std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)");
1803 725
1804 for (auto mapping1 : wn[synset_id_1]) 726 std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\.");
1805 { 727 std::smatch relation_data;
1806 for (auto mapping2 : wn[synset_id_2]) 728 if (!std::regex_search(line, relation_data, relation))
729 {
730 continue;
731 }
732
733 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
734 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
735 std::string class_type = relation_data[5];
736
737 std::string table_name;
738 if (class_type == "t")
739 {
740 table_name += "topicality";
741 } else if (class_type == "u")
742 {
743 table_name += "usage";
744 } else if (class_type == "r")
745 {
746 table_name += "regionality";
747 }
748
749 std::list<int> leftJoin;
750 std::list<int> rightJoin;
751
752 if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first)))
1807 { 753 {
1808 sqlite3_stmt* ppstmt; 754 std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) {
1809 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 755 return w->getId();
756 });
757 } else if (wordByWnidAndWnum_.count(lookup1)) {
758 leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId());
759 }
760
761 if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first)))
762 {
763 std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) {
764 return w->getId();
765 });
766 } else if (wordByWnidAndWnum_.count(lookup2)) {
767 rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId());
768 }
769
770 for (int word1 : leftJoin)
771 {
772 for (int word2 : rightJoin)
1810 { 773 {
1811 db_error(ppdb, query); 774 std::list<field> fields;
1812 } 775 fields.emplace_back("term_id", word1);
776 fields.emplace_back("domain_id", word2);
1813 777
1814 sqlite3_bind_int(ppstmt, 1, mapping1.second); 778 db_.insertIntoTable(table_name, std::move(fields));
1815 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1816
1817 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1818 {
1819 db_error(ppdb, query);
1820 } 779 }
1821
1822 sqlite3_finalize(ppstmt);
1823 } 780 }
1824 } 781 }
1825 } 782 }
1826 }
1827
1828 // mm table
1829 {
1830 std::ifstream wnmmfile(wnpref + "wn_mm.pl");
1831 if (!wnmmfile.is_open())
1832 {
1833 std::cout << "Invalid WordNet data directory." << std::endl;
1834 print_usage();
1835 }
1836
1837 std::list<std::string> lines;
1838 for (;;)
1839 {
1840 std::string line;
1841 if (!getline(wnmmfile, line))
1842 {
1843 break;
1844 }
1845 783
1846 if (line.back() == '\r') 784 void generator::readWordNetCausality()
785 {
786 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl"));
787 progress ppgs("Writing causality...", lines.size());
788 for (auto line : lines)
1847 { 789 {
1848 line.pop_back(); 790 ppgs.update();
1849 }
1850 791
1851 lines.push_back(line); 792 std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\.");
793 std::smatch relation_data;
794 if (!std::regex_search(line, relation_data, relation))
795 {
796 continue;
797 }
798
799 int lookup1 = std::stoi(relation_data[1]);
800 int lookup2 = std::stoi(relation_data[2]);
801
802 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
803 {
804 notion& notion1 = *notionByWnid_.at(lookup1);
805 notion& notion2 = *notionByWnid_.at(lookup2);
806
807 std::list<field> fields;
808 fields.emplace_back("effect_id", notion1.getId());
809 fields.emplace_back("cause_id", notion2.getId());
810
811 db_.insertIntoTable("causality", std::move(fields));
812 }
813 }
1852 } 814 }
1853 815
1854 progress ppgs("Writing member meronyms...", lines.size()); 816 void generator::readWordNetEntailment()
1855 for (auto line : lines)
1856 { 817 {
1857 ppgs.update(); 818 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl"));
1858 819 progress ppgs("Writing entailment...", lines.size());
1859 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); 820 for (auto line : lines)
1860 std::smatch relation_data;
1861 if (!std::regex_search(line, relation_data, relation))
1862 { 821 {
1863 continue; 822 ppgs.update();
1864 }
1865 823
1866 int synset_id_1 = stoi(relation_data[1]); 824 std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\.");
1867 int synset_id_2 = stoi(relation_data[2]); 825 std::smatch relation_data;
1868 std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); 826 if (!std::regex_search(line, relation_data, relation))
1869
1870 for (auto mapping1 : wn[synset_id_1])
1871 {
1872 for (auto mapping2 : wn[synset_id_2])
1873 { 827 {
1874 sqlite3_stmt* ppstmt; 828 continue;
1875 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 829 }
1876 { 830
1877 db_error(ppdb, query); 831 int lookup1 = std::stoi(relation_data[1]);
1878 } 832 int lookup2 = std::stoi(relation_data[2]);
1879 833
1880 sqlite3_bind_int(ppstmt, 1, mapping1.second); 834 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
1881 sqlite3_bind_int(ppstmt, 2, mapping2.second); 835 {
836 notion& notion1 = *notionByWnid_.at(lookup1);
837 notion& notion2 = *notionByWnid_.at(lookup2);
1882 838
1883 if (sqlite3_step(ppstmt) != SQLITE_DONE) 839 std::list<field> fields;
1884 { 840 fields.emplace_back("given_id", notion1.getId());
1885 db_error(ppdb, query); 841 fields.emplace_back("entailment_id", notion2.getId());
1886 }
1887 842
1888 sqlite3_finalize(ppstmt); 843 db_.insertIntoTable("entailment", std::move(fields));
1889 } 844 }
1890 } 845 }
1891 } 846 }
1892 } 847
1893 848 void generator::readWordNetHypernymy()
1894 // ms table
1895 {
1896 std::ifstream wnmsfile(wnpref + "wn_ms.pl");
1897 if (!wnmsfile.is_open())
1898 {
1899 std::cout << "Invalid WordNet data directory." << std::endl;
1900 print_usage();
1901 }
1902
1903 std::list<std::string> lines;
1904 for (;;)
1905 { 849 {
1906 std::string line; 850 std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl"));
1907 if (!getline(wnmsfile, line)) 851 progress ppgs("Writing hypernymy...", lines.size());
852 for (auto line : lines)
1908 { 853 {
1909 break; 854 ppgs.update();
855
856 std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\.");
857 std::smatch relation_data;
858 if (!std::regex_search(line, relation_data, relation))
859 {
860 continue;
861 }
862
863 int lookup1 = std::stoi(relation_data[1]);
864 int lookup2 = std::stoi(relation_data[2]);
865
866 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
867 {
868 notion& notion1 = *notionByWnid_.at(lookup1);
869 notion& notion2 = *notionByWnid_.at(lookup2);
870
871 std::list<field> fields;
872 fields.emplace_back("hyponym_id", notion1.getId());
873 fields.emplace_back("hypernym_id", notion2.getId());
874
875 db_.insertIntoTable("hypernymy", std::move(fields));
876 }
1910 } 877 }
878 }
1911 879
1912 if (line.back() == '\r') 880 void generator::readWordNetInstantiation()
881 {
882 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl"));
883 progress ppgs("Writing instantiation...", lines.size());
884 for (auto line : lines)
1913 { 885 {
1914 line.pop_back(); 886 ppgs.update();
1915 }
1916 887
1917 lines.push_back(line); 888 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\.");
889 std::smatch relation_data;
890 if (!std::regex_search(line, relation_data, relation))
891 {
892 continue;
893 }
894
895 int lookup1 = std::stoi(relation_data[1]);
896 int lookup2 = std::stoi(relation_data[2]);
897
898 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
899 {
900 notion& notion1 = *notionByWnid_.at(lookup1);
901 notion& notion2 = *notionByWnid_.at(lookup2);
902
903 std::list<field> fields;
904 fields.emplace_back("instance_id", notion1.getId());
905 fields.emplace_back("class_id", notion2.getId());
906
907 db_.insertIntoTable("instantiation", std::move(fields));
908 }
909 }
1918 } 910 }
1919 911
1920 progress ppgs("Writing substance meronyms...", lines.size()); 912 void generator::readWordNetMemberMeronymy()
1921 for (auto line : lines)
1922 { 913 {
1923 ppgs.update(); 914 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl"));
1924 915 progress ppgs("Writing member meronymy...", lines.size());
1925 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); 916 for (auto line : lines)
1926 std::smatch relation_data;
1927 if (!std::regex_search(line, relation_data, relation))
1928 { 917 {
1929 continue; 918 ppgs.update();
1930 }
1931
1932 int synset_id_1 = stoi(relation_data[1]);
1933 int synset_id_2 = stoi(relation_data[2]);
1934 std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
1935 919
1936 for (auto mapping1 : wn[synset_id_1]) 920 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\.");
1937 { 921 std::smatch relation_data;
1938 for (auto mapping2 : wn[synset_id_2]) 922 if (!std::regex_search(line, relation_data, relation))
1939 { 923 {
1940 sqlite3_stmt* ppstmt; 924 continue;
1941 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 925 }
1942 { 926
1943 db_error(ppdb, query); 927 int lookup1 = std::stoi(relation_data[1]);
1944 } 928 int lookup2 = std::stoi(relation_data[2]);
929
930 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
931 {
932 notion& notion1 = *notionByWnid_.at(lookup1);
933 notion& notion2 = *notionByWnid_.at(lookup2);
1945 934
1946 sqlite3_bind_int(ppstmt, 1, mapping1.second); 935 std::list<field> fields;
1947 sqlite3_bind_int(ppstmt, 2, mapping2.second); 936 fields.emplace_back("holonym_id", notion1.getId());
937 fields.emplace_back("meronym_id", notion2.getId());
1948 938
1949 if (sqlite3_step(ppstmt) != SQLITE_DONE) 939 db_.insertIntoTable("member_meronymy", std::move(fields));
1950 {
1951 db_error(ppdb, query);
1952 }
1953
1954 sqlite3_finalize(ppstmt);
1955 } 940 }
1956 } 941 }
1957 } 942 }
1958 } 943
1959 944 void generator::readWordNetPartMeronymy()
1960 // mm table
1961 {
1962 std::ifstream wnmpfile(wnpref + "wn_mp.pl");
1963 if (!wnmpfile.is_open())
1964 {
1965 std::cout << "Invalid WordNet data directory." << std::endl;
1966 print_usage();
1967 }
1968
1969 std::list<std::string> lines;
1970 for (;;)
1971 { 945 {
1972 std::string line; 946 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl"));
1973 if (!getline(wnmpfile, line)) 947 progress ppgs("Writing part meronymy...", lines.size());
948 for (auto line : lines)
1974 { 949 {
1975 break; 950 ppgs.update();
951
952 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\.");
953 std::smatch relation_data;
954 if (!std::regex_search(line, relation_data, relation))
955 {
956 continue;
957 }
958
959 int lookup1 = std::stoi(relation_data[1]);
960 int lookup2 = std::stoi(relation_data[2]);
961
962 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
963 {
964 notion& notion1 = *notionByWnid_.at(lookup1);
965 notion& notion2 = *notionByWnid_.at(lookup2);
966
967 std::list<field> fields;
968 fields.emplace_back("holonym_id", notion1.getId());
969 fields.emplace_back("meronym_id", notion2.getId());
970
971 db_.insertIntoTable("part_meronymy", std::move(fields));
972 }
1976 } 973 }
974 }
1977 975
1978 if (line.back() == '\r') 976 void generator::readWordNetSubstanceMeronymy()
977 {
978 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl"));
979 progress ppgs("Writing substance meronymy...", lines.size());
980 for (auto line : lines)
1979 { 981 {
1980 line.pop_back(); 982 ppgs.update();
1981 }
1982 983
1983 lines.push_back(line); 984 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\.");
985 std::smatch relation_data;
986 if (!std::regex_search(line, relation_data, relation))
987 {
988 continue;
989 }
990
991 int lookup1 = std::stoi(relation_data[1]);
992 int lookup2 = std::stoi(relation_data[2]);
993
994 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
995 {
996 notion& notion1 = *notionByWnid_.at(lookup1);
997 notion& notion2 = *notionByWnid_.at(lookup2);
998
999 std::list<field> fields;
1000 fields.emplace_back("holonym_id", notion1.getId());
1001 fields.emplace_back("meronym_id", notion2.getId());
1002
1003 db_.insertIntoTable("substance_meronymy", std::move(fields));
1004 }
1005 }
1984 } 1006 }
1985 1007
1986 progress ppgs("Writing part meronyms...", lines.size()); 1008 void generator::readWordNetPertainymy()
1987 for (auto line : lines)
1988 { 1009 {
1989 ppgs.update(); 1010 std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl"));
1990 1011 progress ppgs("Writing pertainymy and mannernymy...", lines.size());
1991 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); 1012 for (auto line : lines)
1992 std::smatch relation_data;
1993 if (!std::regex_search(line, relation_data, relation))
1994 { 1013 {
1995 continue; 1014 ppgs.update();
1996 }
1997
1998 int synset_id_1 = stoi(relation_data[1]);
1999 int synset_id_2 = stoi(relation_data[2]);
2000 std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
2001 1015
2002 for (auto mapping1 : wn[synset_id_1]) 1016 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\.");
2003 { 1017 std::smatch relation_data;
2004 for (auto mapping2 : wn[synset_id_2]) 1018 if (!std::regex_search(line, relation_data, relation))
2005 { 1019 {
2006 sqlite3_stmt* ppstmt; 1020 continue;
2007 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 1021 }
2008 { 1022
2009 db_error(ppdb, query); 1023 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
2010 } 1024 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1025
1026 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1027 {
1028 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1029 word& word2 = *wordByWnidAndWnum_.at(lookup2);
2011 1030
2012 sqlite3_bind_int(ppstmt, 1, mapping1.second); 1031 if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective)
2013 sqlite3_bind_int(ppstmt, 2, mapping2.second); 1032 {
1033 std::list<field> fields;
1034 fields.emplace_back("pertainym_id", word1.getId());
1035 fields.emplace_back("noun_id", word2.getId());
2014 1036
2015 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1037 db_.insertIntoTable("pertainymy", std::move(fields));
1038 } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb)
2016 { 1039 {
2017 db_error(ppdb, query); 1040 std::list<field> fields;
2018 } 1041 fields.emplace_back("mannernym_id", word1.getId());
1042 fields.emplace_back("adjective_id", word2.getId());
2019 1043
2020 sqlite3_finalize(ppstmt); 1044 db_.insertIntoTable("mannernymy", std::move(fields));
1045 }
2021 } 1046 }
2022 } 1047 }
2023 } 1048 }
2024 }
2025
2026 // per table
2027 {
2028 std::ifstream wnperfile(wnpref + "wn_per.pl");
2029 if (!wnperfile.is_open())
2030 {
2031 std::cout << "Invalid WordNet data directory." << std::endl;
2032 print_usage();
2033 }
2034
2035 std::list<std::string> lines;
2036 for (;;)
2037 {
2038 std::string line;
2039 if (!getline(wnperfile, line))
2040 {
2041 break;
2042 }
2043 1049
2044 if (line.back() == '\r') 1050 void generator::readWordNetSpecification()
1051 {
1052 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl"));
1053 progress ppgs("Writing specifications...", lines.size());
1054 for (auto line : lines)
2045 { 1055 {
2046 line.pop_back(); 1056 ppgs.update();
1057
1058 std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\.");
1059 std::smatch relation_data;
1060 if (!std::regex_search(line, relation_data, relation))
1061 {
1062 continue;
1063 }
1064
1065 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
1066 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1067
1068 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1069 {
1070 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1071 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1072
1073 std::list<field> fields;
1074 fields.emplace_back("general_id", word1.getId());
1075 fields.emplace_back("specific_id", word2.getId());
1076
1077 db_.insertIntoTable("specification", std::move(fields));
1078 }
2047 } 1079 }
2048
2049 lines.push_back(line);
2050 } 1080 }
2051 1081
2052 progress ppgs("Writing pertainyms and mannernyms...", lines.size()); 1082 void generator::readWordNetSimilarity()
2053 for (auto line : lines)
2054 { 1083 {
2055 ppgs.update(); 1084 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl"));
2056 1085 progress ppgs("Writing adjective similarity...", lines.size());
2057 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); 1086 for (auto line : lines)
2058 std::smatch relation_data;
2059 if (!std::regex_search(line, relation_data, relation))
2060 { 1087 {
2061 continue; 1088 ppgs.update();
2062 }
2063 1089
2064 int synset_id_1 = stoi(relation_data[1]); 1090 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\.");
2065 int wnum_1 = stoi(relation_data[2]); 1091 std::smatch relation_data;
2066 int synset_id_2 = stoi(relation_data[3]); 1092 if (!std::regex_search(line, relation_data, relation))
2067 int wnum_2 = stoi(relation_data[4]);
2068 std::string query;
2069 switch (synset_id_1 / 100000000)
2070 {
2071 case 3: // Adjective
2072 { 1093 {
2073 // This is a pertainym, the second word should be a noun 1094 continue;
2074 // Technically it can be an adjective but we're ignoring that
2075 if (synset_id_2 / 100000000 != 1)
2076 {
2077 continue;
2078 }
2079
2080 query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)";
2081
2082 break;
2083 } 1095 }
1096
1097 int lookup1 = std::stoi(relation_data[1]);
1098 int lookup2 = std::stoi(relation_data[2]);
2084 1099
2085 case 4: // Adverb 1100 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
2086 { 1101 {
2087 // This is a mannernym, the second word should be an adjective 1102 notion& notion1 = *notionByWnid_.at(lookup1);
2088 if (synset_id_2 / 100000000 != 3) 1103 notion& notion2 = *notionByWnid_.at(lookup2);
2089 {
2090 continue;
2091 }
2092 1104
2093 query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; 1105 std::list<field> fields;
1106 fields.emplace_back("adjective_1_id", notion1.getId());
1107 fields.emplace_back("adjective_2_id", notion2.getId());
2094 1108
2095 break; 1109 db_.insertIntoTable("similarity", std::move(fields));
2096 } 1110 }
2097 } 1111 }
2098 1112 }
2099 sqlite3_stmt* ppstmt;
2100 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
2101 {
2102 db_error(ppdb, query);
2103 }
2104
2105 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
2106 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
2107 1113
2108 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1114 std::list<std::string> generator::readFile(std::string path)
1115 {
1116 std::ifstream file(path);
1117 if (!file)
2109 { 1118 {
2110 db_error(ppdb, query); 1119 throw std::invalid_argument("Could not find file " + path);
2111 } 1120 }
2112
2113 sqlite3_finalize(ppstmt);
2114 }
2115 }
2116 1121
2117 // sa table 1122 std::list<std::string> lines;
2118 {
2119 std::ifstream wnsafile(wnpref + "wn_sa.pl");
2120 if (!wnsafile.is_open())
2121 {
2122 std::cout << "Invalid WordNet data directory." << std::endl;
2123 print_usage();
2124 }
2125
2126 std::list<std::string> lines;
2127 for (;;)
2128 {
2129 std::string line; 1123 std::string line;
2130 if (!getline(wnsafile, line)) 1124 while (std::getline(file, line))
2131 {
2132 break;
2133 }
2134
2135 if (line.back() == '\r')
2136 { 1125 {
2137 line.pop_back(); 1126 if (line.back() == '\r')
1127 {
1128 line.pop_back();
1129 }
1130
1131 lines.push_back(line);
2138 } 1132 }
2139 1133
2140 lines.push_back(line); 1134 return lines;
2141 } 1135 }
2142 1136
2143 progress ppgs("Writing specifications...", lines.size()); 1137 part_of_speech generator::partOfSpeechByWnid(int wnid)
2144 for (auto line : lines)
2145 { 1138 {
2146 ppgs.update(); 1139 switch (wnid / 100000000)
2147
2148 std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\.");
2149 std::smatch relation_data;
2150 if (!std::regex_search(line, relation_data, relation))
2151 {
2152 continue;
2153 }
2154
2155 int synset_id_1 = stoi(relation_data[1]);
2156 int wnum_1 = stoi(relation_data[2]);
2157 int synset_id_2 = stoi(relation_data[3]);
2158 int wnum_2 = stoi(relation_data[4]);
2159 std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)");
2160
2161 sqlite3_stmt* ppstmt;
2162 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
2163 { 1140 {
2164 db_error(ppdb, query); 1141 case 1: return part_of_speech::noun;
1142 case 2: return part_of_speech::verb;
1143 case 3: return part_of_speech::adjective;
1144 case 4: return part_of_speech::adverb;
1145 default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid));
2165 } 1146 }
1147 }
2166 1148
2167 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); 1149 notion& generator::createNotion(part_of_speech partOfSpeech)
2168 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); 1150 {
1151 notions_.emplace_back(partOfSpeech);
1152
1153 return notions_.back();
1154 }
2169 1155
2170 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1156 notion& generator::lookupOrCreateNotion(int wnid)
1157 {
1158 if (!notionByWnid_.count(wnid))
2171 { 1159 {
2172 db_error(ppdb, query); 1160 notions_.emplace_back(partOfSpeechByWnid(wnid), wnid);
1161 notionByWnid_[wnid] = &notions_.back();
2173 } 1162 }
2174 1163
2175 sqlite3_finalize(ppstmt); 1164 return *notionByWnid_.at(wnid);
2176 }
2177 }
2178
2179 // sim table
2180 {
2181 std::ifstream wnsimfile(wnpref + "wn_sim.pl");
2182 if (!wnsimfile.is_open())
2183 {
2184 std::cout << "Invalid WordNet data directory." << std::endl;
2185 print_usage();
2186 } 1165 }
2187 1166
2188 std::list<std::string> lines; 1167 lemma& generator::lookupOrCreateLemma(std::string base_form)
2189 for (;;)
2190 { 1168 {
2191 std::string line; 1169 if (!lemmaByBaseForm_.count(base_form))
2192 if (!getline(wnsimfile, line))
2193 { 1170 {
2194 break; 1171 lemmas_.emplace_back(lookupOrCreateForm(base_form));
1172 lemmaByBaseForm_[base_form] = &lemmas_.back();
2195 } 1173 }
1174
1175 return *lemmaByBaseForm_.at(base_form);
1176 }
2196 1177
2197 if (line.back() == '\r') 1178 form& generator::lookupOrCreateForm(std::string text)
1179 {
1180 if (!formByText_.count(text))
2198 { 1181 {
2199 line.pop_back(); 1182 forms_.emplace_back(text);
1183 formByText_[text] = &forms_.back();
2200 } 1184 }
2201 1185
2202 lines.push_back(line); 1186 return *formByText_[text];
2203 } 1187 }
2204 1188
2205 progress ppgs("Writing sense synonyms...", lines.size()); 1189 template <typename... Args> word& generator::createWord(Args&&... args)
2206 for (auto line : lines)
2207 { 1190 {
2208 ppgs.update(); 1191 words_.emplace_back(std::forward<Args>(args)...);
1192 word& w = words_.back();
2209 1193
2210 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); 1194 wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w);
2211 std::smatch relation_data; 1195
2212 if (!std::regex_search(line, relation_data, relation)) 1196 if (w.getNotion().hasWnid())
2213 { 1197 {
2214 continue; 1198 wordsByWnid_[w.getNotion().getWnid()].insert(&w);
2215 } 1199 }
2216 1200
2217 int synset_id_1 = stoi(relation_data[1]); 1201 return w;
2218 int synset_id_2 = stoi(relation_data[2]); 1202 }
2219 std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); 1203
1204 group& generator::createGroup(xmlNodePtr top)
1205 {
1206 groups_.emplace_back();
1207 group& grp = groups_.back();
2220 1208
2221 for (auto mapping1 : wn[synset_id_1]) 1209 xmlChar* key;
1210
1211 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
2222 { 1212 {
2223 for (auto mapping2 : wn[synset_id_2]) 1213 if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES")))
2224 { 1214 {
2225 sqlite3_stmt* ppstmt; 1215 for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next)
2226 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
2227 { 1216 {
2228 db_error(ppdb, query); 1217 if (!xmlStrcmp(subclass->name, reinterpret_cast<const xmlChar*>("VNSUBCLASS")))
1218 {
1219 try
1220 {
1221 group& subgrp = createGroup(subclass);
1222 subgrp.setParent(grp);
1223 } catch (const std::exception& e)
1224 {
1225 key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID"));
1226
1227 if (key == nullptr)
1228 {
1229 std::throw_with_nested(std::logic_error("Error parsing IDless subgroup"));
1230 } else {
1231 std::string subgroupId(reinterpret_cast<const char*>(key));
1232 xmlFree(key);
1233
1234 std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId));
1235 }
1236 }
1237 }
2229 } 1238 }
2230 1239 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("MEMBERS")))
2231 sqlite3_bind_int(ppstmt, 1, mapping1.second); 1240 {
2232 sqlite3_bind_int(ppstmt, 2, mapping2.second); 1241 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next)
2233
2234 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2235 { 1242 {
2236 db_error(ppdb, query); 1243 if (!xmlStrcmp(member->name, reinterpret_cast<const xmlChar*>("MEMBER")))
1244 {
1245 key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn"));
1246 std::string wnSenses(reinterpret_cast<const char*>(key));
1247 xmlFree(key);
1248
1249 auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " ");
1250 if (!wnSenseKeys.empty())
1251 {
1252 std::list<std::string> tempKeys;
1253
1254 std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) {
1255 return sense + "::";
1256 });
1257
1258 std::list<std::string> filteredKeys;
1259
1260 std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) {
1261 return !wnSenseKeys_.count(sense);
1262 });
1263
1264 wnSenseKeys = std::move(filteredKeys);
1265 }
1266
1267 if (!wnSenseKeys.empty())
1268 {
1269 for (std::string sense : wnSenseKeys)
1270 {
1271 word& wordSense = *wnSenseKeys_[sense];
1272 wordSense.setVerbGroup(grp);
1273 }
1274 } else {
1275 key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name"));
1276 std::string memberName(reinterpret_cast<const char*>(key));
1277 xmlFree(key);
1278
1279 notion& n = createNotion(part_of_speech::verb);
1280 lemma& l = lookupOrCreateLemma(memberName);
1281 word& w = createWord(n, l);
1282
1283 w.setVerbGroup(grp);
1284 }
1285 }
2237 } 1286 }
2238 1287 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("THEMROLES")))
2239 sqlite3_reset(ppstmt); 1288 {
2240 sqlite3_clear_bindings(ppstmt); 1289 for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next)
2241
2242 sqlite3_bind_int(ppstmt, 1, mapping2.second);
2243 sqlite3_bind_int(ppstmt, 2, mapping1.second);
2244
2245 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2246 { 1290 {
2247 db_error(ppdb, query); 1291 if (!xmlStrcmp(roletopnode->name, reinterpret_cast<const xmlChar*>("THEMROLE")))
1292 {
1293 role r;
1294
1295 key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type"));
1296 std::string roleName = reinterpret_cast<const char*>(key);
1297 xmlFree(key);
1298
1299 for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
1300 {
1301 if (!xmlStrcmp(rolenode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1302 {
1303 r.setSelrestrs(parseSelrestr(rolenode));
1304 }
1305 }
1306
1307 grp.addRole(roleName, std::move(r));
1308 }
2248 } 1309 }
1310 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("FRAMES")))
1311 {
1312 for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next)
1313 {
1314 if (!xmlStrcmp(frametopnode->name, reinterpret_cast<const xmlChar*>("FRAME")))
1315 {
1316 frames_.emplace_back();
1317 frame& fr = frames_.back();
2249 1318
2250 sqlite3_finalize(ppstmt); 1319 for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
1320 {
1321 if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX")))
1322 {
1323 for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
1324 {
1325 if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP")))
1326 {
1327 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1328 std::string partRole = reinterpret_cast<const char*>(key);
1329 xmlFree(key);
1330
1331 selrestr partSelrestrs;
1332 std::set<std::string> partSynrestrs;
1333
1334 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
1335 {
1336 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SYNRESTRS")))
1337 {
1338 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
1339 {
1340 if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SYNRESTR")))
1341 {
1342 key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type"));
1343 partSynrestrs.insert(reinterpret_cast<const char*>(key));
1344 xmlFree(key);
1345 }
1346 }
1347 }
1348
1349 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1350 {
1351 partSelrestrs = parseSelrestr(npnode);
1352 }
1353 }
1354
1355 fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs)));
1356 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB")))
1357 {
1358 fr.push_back(part::createVerb());
1359 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("PREP")))
1360 {
1361 std::set<std::string> partChoices;
1362 bool partLiteral;
1363
1364 if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")))
1365 {
1366 partLiteral = true;
1367
1368 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1369 std::string choicesStr = reinterpret_cast<const char*>(key);
1370 xmlFree(key);
1371
1372 split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices)));
1373 } else {
1374 partLiteral = false;
1375
1376 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
1377 {
1378 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1379 {
1380 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
1381 {
1382 if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1383 {
1384 key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type"));
1385 partChoices.insert(reinterpret_cast<const char*>(key));
1386 xmlFree(key);
1387 }
1388 }
1389 }
1390 }
1391 }
1392
1393 fr.push_back(part::createPreposition(std::move(partChoices), partLiteral));
1394 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ")))
1395 {
1396 fr.push_back(part::createAdjective());
1397 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADV")))
1398 {
1399 fr.push_back(part::createAdverb());
1400 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("LEX")))
1401 {
1402 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1403 std::string literalValue = reinterpret_cast<const char*>(key);
1404 xmlFree(key);
1405
1406 fr.push_back(part::createLiteral(literalValue));
1407 } else {
1408 continue;
1409 }
1410 }
1411
1412 grp.addFrame(fr);
1413 }
1414 }
1415 }
1416 }
2251 } 1417 }
2252 } 1418 }
2253 }
2254 }
2255
2256 // syntax table
2257 {
2258 std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl");
2259 if (!wnsyntaxfile.is_open())
2260 {
2261 std::cout << "Invalid WordNet data directory." << std::endl;
2262 print_usage();
2263 }
2264 1419
2265 std::list<std::string> lines; 1420 return grp;
2266 for (;;)
2267 {
2268 std::string line;
2269 if (!getline(wnsyntaxfile, line))
2270 {
2271 break;
2272 }
2273
2274 if (line.back() == '\r')
2275 {
2276 line.pop_back();
2277 }
2278
2279 lines.push_back(line);
2280 } 1421 }
2281 1422
2282 progress ppgs("Writing adjective syntax markers...", lines.size()); 1423 selrestr generator::parseSelrestr(xmlNodePtr top)
2283 for (auto line : lines)
2284 { 1424 {
2285 ppgs.update(); 1425 xmlChar* key;
2286 1426
2287 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\."); 1427 if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
2288 std::smatch relation_data;
2289 if (!std::regex_search(line, relation_data, relation))
2290 {
2291 continue;
2292 }
2293
2294 int synset_id = stoi(relation_data[1]);
2295 int wnum = stoi(relation_data[2]);
2296 std::string syn = relation_data[3];
2297 std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?");
2298
2299 sqlite3_stmt* ppstmt;
2300 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
2301 { 1428 {
2302 db_error(ppdb, query); 1429 if (xmlChildElementCount(top) == 0)
2303 } 1430 {
2304 1431 return {};
2305 sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT); 1432 } else if (xmlChildElementCount(top) == 1)
2306 sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]); 1433 {
2307 1434 return parseSelrestr(xmlFirstElementChild(top));
2308 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1435 } else {
1436 bool orlogic = false;
1437 if (xmlHasProp(top, reinterpret_cast<const xmlChar*>("logic")))
1438 {
1439 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("logic"));
1440 if (!xmlStrcmp(key, reinterpret_cast<const xmlChar*>("or")))
1441 {
1442 orlogic = true;
1443 }
1444
1445 xmlFree(key);
1446 }
1447
1448 std::list<selrestr> children;
1449 for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next)
1450 {
1451 if (!xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))
1452 || !xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1453 {
1454 children.push_back(parseSelrestr(selrestr));
1455 }
1456 }
1457
1458 return selrestr(children, orlogic);
1459 }
1460 } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
2309 { 1461 {
2310 db_error(ppdb, query); 1462 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("Value"));
1463 bool selPos = (std::string(reinterpret_cast<const char*>(key)) == "+");
1464 xmlFree(key);
1465
1466 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type"));
1467 std::string selRestriction = reinterpret_cast<const char*>(key);
1468 xmlFree(key);
1469
1470 return selrestr(selRestriction, selPos);
1471 } else {
1472 throw std::logic_error("Badly formatted selrestr");
2311 } 1473 }
2312
2313 sqlite3_finalize(ppstmt);
2314 } 1474 }
2315 } 1475
2316 1476 };
2317 sqlite3_close_v2(ppdb); 1477};
2318
2319 std::cout << "Done." << std::endl;
2320}
diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..e2a7404 --- /dev/null +++ b/generator/generator.h
@@ -0,0 +1,151 @@
1#ifndef GENERATOR_H_5B61CBC5
2#define GENERATOR_H_5B61CBC5
3
4#include <string>
5#include <map>
6#include <list>
7#include <set>
8#include <libxml/parser.h>
9#include "database.h"
10#include "notion.h"
11#include "word.h"
12#include "lemma.h"
13#include "form.h"
14#include "pronunciation.h"
15#include "group.h"
16#include "frame.h"
17
18namespace verbly {
19 namespace generator {
20
21 enum class part_of_speech;
22 class selrestr;
23
24 class generator {
25 public:
26
27 // Constructor
28
29 generator(
30 std::string verbNetPath,
31 std::string agidPath,
32 std::string wordNetPath,
33 std::string cmudictPath,
34 std::string imageNetPath,
35 std::string outputPath);
36
37 // Action
38
39 void run();
40
41 private:
42
43 // Subroutines
44
45 void readWordNetSynsets();
46
47 void readAdjectivePositioning();
48
49 void readImageNetUrls();
50
51 void readWordNetSenseKeys();
52
53 void readVerbNet();
54
55 void readAgidInflections();
56
57 void readPrepositions();
58
59 void readCmudictPronunciations();
60
61 void writeSchema();
62
63 void dumpObjects();
64
65 void readWordNetAntonymy();
66
67 void readWordNetVariation();
68
69 void readWordNetClasses();
70
71 void readWordNetCausality();
72
73 void readWordNetEntailment();
74
75 void readWordNetHypernymy();
76
77 void readWordNetInstantiation();
78
79 void readWordNetMemberMeronymy();
80
81 void readWordNetPartMeronymy();
82
83 void readWordNetSubstanceMeronymy();
84
85 void readWordNetPertainymy();
86
87 void readWordNetSpecification();
88
89 void readWordNetSimilarity();
90
91 // Helpers
92
93 std::list<std::string> readFile(std::string path);
94
95 inline part_of_speech partOfSpeechByWnid(int wnid);
96
97 notion& createNotion(part_of_speech partOfSpeech);
98
99 notion& lookupOrCreateNotion(int wnid);
100
101 lemma& lookupOrCreateLemma(std::string base_form);
102
103 form& lookupOrCreateForm(std::string text);
104
105 template <typename... Args> word& createWord(Args&&... args);
106
107 group& createGroup(xmlNodePtr top);
108
109 selrestr parseSelrestr(xmlNodePtr top);
110
111 // Input
112
113 std::string verbNetPath_;
114 std::string agidPath_;
115 std::string wordNetPath_;
116 std::string cmudictPath_;
117 std::string imageNetPath_;
118
119 // Output
120
121 database db_;
122
123 // Data
124
125 std::list<notion> notions_;
126 std::list<word> words_;
127 std::list<lemma> lemmas_;
128 std::list<form> forms_;
129 std::list<pronunciation> pronunciations_;
130 std::list<frame> frames_;
131 std::list<group> groups_;
132
133 // Indexes
134
135 std::map<int, notion*> notionByWnid_;
136 std::map<int, std::set<word*>> wordsByWnid_;
137 std::map<std::pair<int, int>, word*> wordByWnidAndWnum_;
138 std::map<std::string, std::set<word*>> wordsByBaseForm_;
139 std::map<std::string, lemma*> lemmaByBaseForm_;
140 std::map<std::string, form*> formByText_;
141
142 // Caches
143
144 std::map<std::string, word*> wnSenseKeys_;
145
146 };
147
148 };
149};
150
151#endif /* end of include guard: GENERATOR_H_5B61CBC5 */
diff --git a/generator/group.cpp b/generator/group.cpp new file mode 100644 index 0000000..7cbd4c8 --- /dev/null +++ b/generator/group.cpp
@@ -0,0 +1,119 @@
1#include "group.h"
2#include <stdexcept>
3#include <list>
4#include <json.hpp>
5#include "database.h"
6#include "field.h"
7#include "frame.h"
8
9namespace verbly {
10 namespace generator {
11
12 int group::nextId_ = 0;
13
14 group::group() : id_(nextId_++)
15 {
16 }
17
18 void group::setParent(const group& parent)
19 {
20 // Adding a group to itself is nonsensical.
21 assert(&parent != this);
22
23 parent_ = &parent;
24 }
25
26 void group::addRole(std::string name, role r)
27 {
28 roleNames_.insert(name);
29 roles_[name] = std::move(r);
30 }
31
32 void group::addFrame(const frame& f)
33 {
34 frames_.insert(&f);
35 }
36
37 std::set<std::string> group::getRoles() const
38 {
39 std::set<std::string> fullRoles = roleNames_;
40
41 if (hasParent())
42 {
43 for (std::string name : getParent().getRoles())
44 {
45 fullRoles.insert(name);
46 }
47 }
48
49 return fullRoles;
50 }
51
52 const role& group::getRole(std::string name) const
53 {
54 if (roles_.count(name))
55 {
56 return roles_.at(name);
57 } else if (hasParent())
58 {
59 return getParent().getRole(name);
60 } else {
61 throw std::invalid_argument("Specified role not found in verb group");
62 }
63 }
64
65 std::set<const frame*> group::getFrames() const
66 {
67 std::set<const frame*> fullFrames = frames_;
68
69 if (hasParent())
70 {
71 for (const frame* f : getParent().getFrames())
72 {
73 fullFrames.insert(f);
74 }
75 }
76
77 return fullFrames;
78 }
79
80 database& operator<<(database& db, const group& arg)
81 {
82 // Serialize the group first
83 {
84 std::list<field> fields;
85 fields.emplace_back("group_id", arg.getId());
86
87 nlohmann::json jsonRoles;
88 for (std::string name : arg.getRoles())
89 {
90 const role& r = arg.getRole(name);
91
92 nlohmann::json jsonRole;
93 jsonRole["type"] = name;
94 jsonRole["selrestrs"] = r.getSelrestrs().toJson();
95
96 jsonRoles.emplace_back(std::move(jsonRole));
97 }
98
99 fields.emplace_back("data", jsonRoles.dump());
100
101 db.insertIntoTable("groups", std::move(fields));
102 }
103
104 // Then, serialize the group/frame relationship
105 for (const frame* f : arg.getFrames())
106 {
107 std::list<field> fields;
108
109 fields.emplace_back("group_id", arg.getId());
110 fields.emplace_back("frame_id", f->getId());
111
112 db.insertIntoTable("groups_frames", std::move(fields));
113 }
114
115 return db;
116 }
117
118 };
119};
diff --git a/generator/group.h b/generator/group.h new file mode 100644 index 0000000..efb8c5d --- /dev/null +++ b/generator/group.h
@@ -0,0 +1,80 @@
1#ifndef GROUP_H_EDAFB5DC
2#define GROUP_H_EDAFB5DC
3
4#include <map>
5#include <set>
6#include <string>
7#include <cassert>
8#include "role.h"
9
10namespace verbly {
11 namespace generator {
12
13 class frame;
14 class database;
15
16 class group {
17 public:
18
19 // Constructor
20
21 group();
22
23 // Mutators
24
25 void setParent(const group& parent);
26
27 void addRole(std::string name, role r);
28
29 void addFrame(const frame& f);
30
31 // Accessors
32
33 int getId() const
34 {
35 return id_;
36 }
37
38 bool hasParent() const
39 {
40 return (parent_ != nullptr);
41 }
42
43 const group& getParent() const
44 {
45 // Calling code should always call hasParent first
46 assert(parent_ != nullptr);
47
48 return *parent_;
49 }
50
51 std::set<std::string> getRoles() const;
52
53 const role& getRole(std::string name) const;
54
55 std::set<const frame*> getFrames() const;
56
57 private:
58
59 static int nextId_;
60
61 const int id_;
62
63 const group* parent_ = nullptr;
64 std::map<std::string, role> roles_;
65 std::set<const frame*> frames_;
66
67 // Caches
68
69 std::set<std::string> roleNames_;
70
71 };
72
73 // Serializer
74
75 database& operator<<(database& db, const group& arg);
76
77 };
78};
79
80#endif /* end of include guard: GROUP_H_EDAFB5DC */
diff --git a/generator/lemma.cpp b/generator/lemma.cpp new file mode 100644 index 0000000..e66b153 --- /dev/null +++ b/generator/lemma.cpp
@@ -0,0 +1,65 @@
1#include "lemma.h"
2#include <list>
3#include <cassert>
4#include "field.h"
5#include "database.h"
6#include "form.h"
7
8namespace verbly {
9 namespace generator {
10
11 int lemma::nextId_ = 0;
12
13 lemma::lemma(const form& baseForm) :
14 id_(nextId_++),
15 baseForm_(baseForm)
16 {
17 inflections_[inflection::base] = {&baseForm};
18 }
19
20 void lemma::addInflection(inflection type, const form& f)
21 {
22 // There can only be one base form.
23 assert(type != inflection::base);
24
25 inflections_[type].insert(&f);
26 }
27
28 std::set<const form*> lemma::getInflections(inflection type) const
29 {
30 if (inflections_.count(type))
31 {
32 return inflections_.at(type);
33 } else {
34 return {};
35 }
36 }
37
38 database& operator<<(database& db, const lemma& arg)
39 {
40 for (inflection type : {
41 inflection::base,
42 inflection::plural,
43 inflection::comparative,
44 inflection::superlative,
45 inflection::past_tense,
46 inflection::past_participle,
47 inflection::ing_form,
48 inflection::s_form})
49 {
50 for (const form* f : arg.getInflections(type))
51 {
52 std::list<field> fields;
53 fields.emplace_back("lemma_id", arg.getId());
54 fields.emplace_back("form_id", f->getId());
55 fields.emplace_back("category", static_cast<int>(type));
56
57 db.insertIntoTable("lemmas_forms", std::move(fields));
58 }
59 }
60
61 return db;
62 }
63
64 };
65};
diff --git a/generator/lemma.h b/generator/lemma.h new file mode 100644 index 0000000..6452e08 --- /dev/null +++ b/generator/lemma.h
@@ -0,0 +1,58 @@
#ifndef LEMMA_H_D73105A7
#define LEMMA_H_D73105A7

#include <string>
#include <map>
#include <set>
#include "enums.h"

namespace verbly {
  namespace generator {

    class database;
    class form;

    // A lemma groups together all of the inflected forms of one word: the
    // immutable base form given at construction, plus any plural,
    // comparative, superlative, past tense, past participle, -ing, and -s
    // forms registered later. Each lemma gets a process-wide sequential id.
    class lemma {
    public:

      // Constructors

      // Creates a lemma with the given base form; the base form is also
      // recorded under inflection::base in the inflection map.
      explicit lemma(const form& baseForm);

      // Mutators

      // Registers an inflected variant. type must not be inflection::base;
      // the stored pointer is non-owning, so f must outlive this lemma.
      void addInflection(inflection type, const form& f);

      // Accessors

      // Unique sequential id assigned at construction.
      int getId() const
      {
        return id_;
      }

      // The uninflected form this lemma was constructed with.
      const form& getBaseForm() const
      {
        return baseForm_;
      }

      // All forms registered for the given inflection type (empty if none).
      std::set<const form*> getInflections(inflection type) const;

    private:

      static int nextId_;  // next id to hand out; shared by all lemmas

      const int id_;
      const form& baseForm_;

      // Non-owning pointers to every registered form, keyed by inflection.
      std::map<inflection, std::set<const form*>> inflections_;

    };

    // Serializer

    database& operator<<(database& db, const lemma& arg);

  };
};

#endif /* end of include guard: LEMMA_H_D73105A7 */
diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..827c963 --- /dev/null +++ b/generator/main.cpp
@@ -0,0 +1,40 @@
1#include <iostream>
2#include <exception>
3#include "generator.h"
4
5void printUsage()
6{
7 std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output" << std::endl;
8 std::cout << "verbnet :: path to a VerbNet data directory" << std::endl;
9 std::cout << "agid :: path to an AGID infl.txt file" << std::endl;
10 std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl;
11 std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl;
12 std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl;
13 std::cout << "output :: datafile output path" << std::endl;
14}
15
16int main(int argc, char** argv)
17{
18 if (argc == 7)
19 {
20 try
21 {
22 verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]);
23
24 try
25 {
26 app.run();
27 } catch (const std::exception& e)
28 {
29 std::cout << e.what() << std::endl;
30 }
31 } catch (const std::exception& e)
32 {
33 std::cout << e.what() << std::endl;
34 printUsage();
35 }
36 } else {
37 std::cout << "verbly datafile generator" << std::endl;
38 printUsage();
39 }
40}
diff --git a/generator/notion.cpp b/generator/notion.cpp new file mode 100644 index 0000000..290d982 --- /dev/null +++ b/generator/notion.cpp
@@ -0,0 +1,85 @@
#include "notion.h"
#include <string>
#include <list>
#include "database.h"
#include "field.h"

namespace verbly {
  namespace generator {

    // Monotonic counter used to hand out a unique id to each notion.
    int notion::nextId_ = 0;

    // Creates a notion (word sense) that is not linked to a WordNet synset.
    notion::notion(
      part_of_speech partOfSpeech) :
      id_(nextId_++),
      partOfSpeech_(partOfSpeech)
    {
    }

    // Creates a notion backed by the WordNet synset identified by wnid.
    notion::notion(
      part_of_speech partOfSpeech,
      int wnid) :
      id_(nextId_++),
      partOfSpeech_(partOfSpeech),
      wnid_(wnid),
      hasWnid_(true)
    {
    }

    // Records one more ImageNet image for this notion. Only valid for
    // WordNet-backed noun notions, since images are keyed by noun wnid.
    void notion::incrementNumOfImages()
    {
      // Calling code should always call hasWnid and check that the notion is a noun first.
      assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun));

      numOfImages_++;
    }

    // Stores the preposition group names this preposition belongs to.
    // Only valid for preposition notions.
    void notion::setPrepositionGroups(std::list<std::string> groups)
    {
      // Calling code should always check that the notion is a preposition first.
      assert(partOfSpeech_ == part_of_speech::preposition);

      prepositionGroups_ = groups;
    }

    // Serializes the notion into the notions table, plus one is_a row per
    // preposition group when the notion is a preposition.
    database& operator<<(database& db, const notion& arg)
    {
      // First, serialize the notion
      {
        std::list<field> fields;

        fields.emplace_back("notion_id", arg.getId());
        fields.emplace_back("part_of_speech", static_cast<int>(arg.getPartOfSpeech()));

        if (arg.hasWnid())
        {
          fields.emplace_back("wnid", arg.getWnid());

          // Only nouns carry an image count (see incrementNumOfImages).
          if (arg.getPartOfSpeech() == part_of_speech::noun)
          {
            fields.emplace_back("images", arg.getNumOfImages());
          }
        }

        db.insertIntoTable("notions", std::move(fields));
      }

      // Next, serialize the is_a relationship if this is a preposition
      if (arg.getPartOfSpeech() == part_of_speech::preposition)
      {
        for (std::string group : arg.getPrepositionGroups())
        {
          std::list<field> fields;

          fields.emplace_back("notion_id", arg.getId());
          fields.emplace_back("groupname", group);

          db.insertIntoTable("is_a", std::move(fields));
        }
      }

      return db;
    }

  };
};
diff --git a/generator/notion.h b/generator/notion.h new file mode 100644 index 0000000..76210de --- /dev/null +++ b/generator/notion.h
@@ -0,0 +1,91 @@
#ifndef NOTION_H_221DE2BC
#define NOTION_H_221DE2BC

#include <cassert>
#include <list>
#include <string>
#include "enums.h"

namespace verbly {
  namespace generator {

    class database;

    // A notion is a word sense — roughly a WordNet synset — shared by one
    // or more words. Notions optionally carry a WordNet synset id (wnid),
    // an ImageNet image count (WordNet-backed nouns only), and preposition
    // group memberships (prepositions only). The asserts below document
    // which combinations are valid.
    class notion {
    public:

      // Constructors

      // Creates a notion with no WordNet synset attached.
      explicit notion(part_of_speech partOfSpeech);

      // Creates a notion backed by the WordNet synset identified by wnid.
      notion(part_of_speech partOfSpeech, int wnid);

      // Mutators

      // Records one more ImageNet image; only valid for WordNet-backed nouns.
      void incrementNumOfImages();

      // Stores preposition group memberships; only valid for prepositions.
      void setPrepositionGroups(std::list<std::string> groups);

      // Accessors

      // Unique sequential id assigned at construction.
      int getId() const
      {
        return id_;
      }

      part_of_speech getPartOfSpeech() const
      {
        return partOfSpeech_;
      }

      // Whether this notion is linked to a WordNet synset.
      bool hasWnid() const
      {
        return hasWnid_;
      }

      int getWnid() const
      {
        // Calling code should always call hasWnid first.
        assert(hasWnid_);

        return wnid_;
      }

      int getNumOfImages() const
      {
        // Calling code should always call hasWnid and check that the notion is a noun first.
        assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun));

        return numOfImages_;
      }

      std::list<std::string> getPrepositionGroups() const
      {
        // Calling code should always check that the notion is a preposition first.
        assert(partOfSpeech_ == part_of_speech::preposition);

        return prepositionGroups_;
      }

    private:

      static int nextId_;  // next id to hand out; shared by all notions

      const int id_;
      const part_of_speech partOfSpeech_;
      const int wnid_ = 0;         // meaningful only when hasWnid_ is true
      const bool hasWnid_ = false;

      int numOfImages_ = 0;        // ImageNet image count (nouns only)
      std::list<std::string> prepositionGroups_;  // prepositions only

    };

    // Serializer

    database& operator<<(database& db, const notion& arg);

  };
};

#endif /* end of include guard: NOTION_H_221DE2BC */
diff --git a/generator/part.cpp b/generator/part.cpp new file mode 100644 index 0000000..dbd4e11 --- /dev/null +++ b/generator/part.cpp
@@ -0,0 +1,336 @@
#include "part.h"
#include <stdexcept>
#include "selrestr.h"

namespace verbly {
  namespace generator {

    // part is a tagged union (see part.h). The static factories below are
    // the intended way to create parts: they set the type tag via the
    // private constructor and then begin the lifetime of the active union
    // members with placement new, matching the explicit destructor calls
    // in ~part().

    // Creates a noun phrase part carrying a thematic role, selectional
    // restrictions, and syntactic restrictions.
    part part::createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs)
    {
      part p(type::noun_phrase);

      new(&p.noun_phrase_.role) std::string(std::move(role));
      new(&p.noun_phrase_.selrestrs) selrestr(std::move(selrestrs));
      new(&p.noun_phrase_.synrestrs) std::set<std::string>(std::move(synrestrs));

      return p;
    }

    // Creates the verb slot of a frame (no associated data).
    part part::createVerb()
    {
      return part(type::verb);
    }

    // Creates a preposition part; choices lists the candidate prepositions
    // and literal records how the choices should be interpreted.
    part part::createPreposition(std::set<std::string> choices, bool literal)
    {
      part p(type::preposition);

      new(&p.preposition_.choices) std::set<std::string>(std::move(choices));
      p.preposition_.literal = literal;

      return p;
    }

    // Creates an adjective slot (no associated data).
    part part::createAdjective()
    {
      return part(type::adjective);
    }

    // Creates an adverb slot (no associated data).
    part part::createAdverb()
    {
      return part(type::adverb);
    }

    // Creates a part standing for a fixed literal string.
    part part::createLiteral(std::string value)
    {
      part p(type::literal);

      new(&p.literal_) std::string(std::move(value));

      return p;
    }

    // Copy constructor: copy-constructs only the union members that are
    // active for the source's type tag.
    part::part(const part& other)
    {
      type_ = other.type_;

      switch (type_)
      {
        case type::noun_phrase:
        {
          new(&noun_phrase_.role) std::string(other.noun_phrase_.role);
          new(&noun_phrase_.selrestrs) selrestr(other.noun_phrase_.selrestrs);
          new(&noun_phrase_.synrestrs) std::set<std::string>(other.noun_phrase_.synrestrs);

          break;
        }

        case type::preposition:
        {
          new(&preposition_.choices) std::set<std::string>(other.preposition_.choices);
          preposition_.literal = other.preposition_.literal;

          break;
        }

        case type::literal:
        {
          new(&literal_) std::string(other.literal_);

          break;
        }

        // These types have no union members to copy.
        case type::verb:
        case type::adjective:
        case type::adverb:
        case type::invalid:
        {
          break;
        }
      }
    }

    // Move constructor: delegate to the private default constructor (which
    // leaves the part invalid, with no union member alive) and swap.
    part::part(part&& other) : part()
    {
      swap(*this, other);
    }

    // Unified copy/move assignment via copy-and-swap (other is by value).
    part& part::operator=(part other)
    {
      swap(*this, other);

      return *this;
    }

    // Swaps two parts in three phases, because differently-typed union
    // members cannot be swapped generically: (1) move first's active
    // members into locals, (2) destroy first and rebuild it from second,
    // (3) destroy second and rebuild it from the locals. The temporaries
    // for a given type are only read in the branch that set them (e.g.
    // tempPrepLiteral is only used when tempType == type::preposition).
    void swap(part& first, part& second)
    {
      using type = part::type;

      type tempType = first.type_;
      std::string tempRole;
      selrestr tempSelrestrs;
      std::set<std::string> tempSynrestrs;
      std::set<std::string> tempChoices;
      bool tempPrepLiteral;
      std::string tempLiteralValue;

      switch (tempType)
      {
        case type::noun_phrase:
        {
          tempRole = std::move(first.noun_phrase_.role);
          tempSelrestrs = std::move(first.noun_phrase_.selrestrs);
          tempSynrestrs = std::move(first.noun_phrase_.synrestrs);

          break;
        }

        case type::preposition:
        {
          tempChoices = std::move(first.preposition_.choices);
          tempPrepLiteral = first.preposition_.literal;

          break;
        }

        case type::literal:
        {
          tempLiteralValue = std::move(first.literal_);

          break;
        }

        case type::verb:
        case type::adjective:
        case type::adverb:
        case type::invalid:
        {
          break;
        }
      }

      // Destroy first's members, then re-create them from second.
      first.~part();

      first.type_ = second.type_;

      switch (first.type_)
      {
        case type::noun_phrase:
        {
          new(&first.noun_phrase_.role) std::string(std::move(second.noun_phrase_.role));
          new(&first.noun_phrase_.selrestrs) selrestr(std::move(second.noun_phrase_.selrestrs));
          new(&first.noun_phrase_.synrestrs) std::set<std::string>(std::move(second.noun_phrase_.synrestrs));

          break;
        }

        case type::preposition:
        {
          new(&first.preposition_.choices) std::set<std::string>(std::move(second.preposition_.choices));
          first.preposition_.literal = second.preposition_.literal;

          break;
        }

        case type::literal:
        {
          new(&first.literal_) std::string(std::move(second.literal_));

          break;
        }

        case type::verb:
        case type::adjective:
        case type::adverb:
        case type::invalid:
        {
          break;
        }
      }

      // Destroy second's members, then re-create them from the saved locals.
      second.~part();

      second.type_ = tempType;

      switch (second.type_)
      {
        case type::noun_phrase:
        {
          new(&second.noun_phrase_.role) std::string(std::move(tempRole));
          new(&second.noun_phrase_.selrestrs) selrestr(std::move(tempSelrestrs));
          new(&second.noun_phrase_.synrestrs) std::set<std::string>(std::move(tempSynrestrs));

          break;
        }

        case type::preposition:
        {
          new(&second.preposition_.choices) std::set<std::string>(std::move(tempChoices));
          second.preposition_.literal = tempPrepLiteral;

          break;
        }

        case type::literal:
        {
          new(&second.literal_) std::string(std::move(tempLiteralValue));

          break;
        }

        case type::verb:
        case type::adjective:
        case type::adverb:
        case type::invalid:
        {
          break;
        }
      }
    }

    // Ends the lifetime of whichever union members are active. The local
    // type aliases are needed because an explicit destructor call cannot
    // name a templated type like std::set<std::string> directly.
    part::~part()
    {
      switch (type_)
      {
        case type::noun_phrase:
        {
          using string_type = std::string;
          using set_type = std::set<std::string>;

          noun_phrase_.role.~string_type();
          noun_phrase_.selrestrs.~selrestr();
          noun_phrase_.synrestrs.~set_type();

          break;
        }

        case type::preposition:
        {
          using set_type = std::set<std::string>;

          preposition_.choices.~set_type();

          break;
        }

        case type::literal:
        {
          using string_type = std::string;

          literal_.~string_type();

          break;
        }

        case type::verb:
        case type::adjective:
        case type::adverb:
        case type::invalid:
        {
          break;
        }
      }
    }

    // Returns the thematic role; throws unless this is a noun phrase part.
    std::string part::getNounRole() const
    {
      if (type_ == type::noun_phrase)
      {
        return noun_phrase_.role;
      } else {
        throw std::domain_error("part::getNounRole is only valid for noun phrase parts");
      }
    }

    // Returns the selectional restrictions; throws unless noun phrase.
    selrestr part::getNounSelrestrs() const
    {
      if (type_ == type::noun_phrase)
      {
        return noun_phrase_.selrestrs;
      } else {
        throw std::domain_error("part::getNounSelrestrs is only valid for noun phrase parts");
      }
    }

    // Returns the syntactic restrictions; throws unless noun phrase.
    std::set<std::string> part::getNounSynrestrs() const
    {
      if (type_ == type::noun_phrase)
      {
        return noun_phrase_.synrestrs;
      } else {
        throw std::domain_error("part::getNounSynrestrs is only valid for noun phrase parts");
      }
    }

    // Returns the candidate prepositions; throws unless preposition.
    std::set<std::string> part::getPrepositionChoices() const
    {
      if (type_ == type::preposition)
      {
        return preposition_.choices;
      } else {
        throw std::domain_error("part::getPrepositionChoices is only valid for preposition parts");
      }
    }

    // Returns the preposition literal flag; throws unless preposition.
    bool part::isPrepositionLiteral() const
    {
      if (type_ == type::preposition)
      {
        return preposition_.literal;
      } else {
        throw std::domain_error("part::isPrepositionLiteral is only valid for preposition parts");
      }
    }

    // Returns the literal text; throws unless this is a literal part.
    std::string part::getLiteralValue() const
    {
      if (type_ == type::literal)
      {
        return literal_;
      } else {
        throw std::domain_error("part::getLiteralValue is only valid for literal parts");
      }
    }

  };
};
diff --git a/generator/part.h b/generator/part.h new file mode 100644 index 0000000..d044630 --- /dev/null +++ b/generator/part.h
@@ -0,0 +1,114 @@
#ifndef PART_H_FB54F361
#define PART_H_FB54F361

#include <string>
#include <set>
#include "selrestr.h"

namespace verbly {
  namespace generator {

    // One slot of a syntactic frame: a noun phrase, the verb itself, a
    // preposition, an adjective, an adverb, or a literal string.
    // Implemented as a tagged union: type_ selects which members of the
    // anonymous union below are alive, and their lifetimes are managed
    // manually with placement new and explicit destructor calls (part.cpp).
    class part {
    public:
      enum class type {
        invalid = -1,
        noun_phrase = 0,
        verb = 1,
        preposition = 2,
        adjective = 3,
        adverb = 4,
        literal = 5
      };

      // Static factories
      // These are the only public way to construct a part.

      static part createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs);

      static part createVerb();

      static part createPreposition(std::set<std::string> choices, bool literal);

      static part createAdjective();

      static part createAdverb();

      static part createLiteral(std::string value);

      // Copy and move constructors

      part(const part& other);

      part(part&& other);

      // Assignment

      // Unified copy/move assignment via copy-and-swap.
      part& operator=(part other);

      // Swap

      friend void swap(part& first, part& second);

      // Destructor

      ~part();

      // General accessors

      type getType() const
      {
        return type_;
      }

      // Noun phrase accessors
      // Each throws std::domain_error unless getType() == type::noun_phrase.

      std::string getNounRole() const;

      selrestr getNounSelrestrs() const;

      std::set<std::string> getNounSynrestrs() const;

      // Preposition accessors
      // Each throws std::domain_error unless getType() == type::preposition.

      std::set<std::string> getPrepositionChoices() const;

      bool isPrepositionLiteral() const;

      // Literal accessors
      // Throws std::domain_error unless getType() == type::literal.

      std::string getLiteralValue() const;

    private:

      // Private constructors

      // Constructs an empty part (type::invalid); no union member is alive.
      part()
      {
      }

      // Sets only the tag; the factories placement-new the members after.
      part(type t) : type_(t)
      {
      }

      // Data

      // Anonymous union: the struct matching type_ holds the live members.
      union {
        struct {
          std::string role;
          selrestr selrestrs;
          std::set<std::string> synrestrs;
        } noun_phrase_;
        struct {
          std::set<std::string> choices;
          bool literal;
        } preposition_;
        std::string literal_;
      };

      type type_ = type::invalid;

    };

  };
};

#endif /* end of include guard: PART_H_FB54F361 */
diff --git a/generator/progress.h b/generator/progress.h index 81f07a3..fcb680d 100644 --- a/generator/progress.h +++ b/generator/progress.h
@@ -3,48 +3,54 @@
3 3
4#include <string> 4#include <string>
5 5
6class progress { 6namespace verbly {
7 private: 7 namespace generator {
8 std::string message;
9 int total;
10 int cur = 0;
11 int lprint = 0;
12 8
13 public: 9 class progress {
14 progress(std::string message, int total) : message(message), total(total) 10 private:
15 { 11 std::string message;
16 std::cout << message << " 0%" << std::flush; 12 int total;
17 } 13 int cur = 0;
14 int lprint = 0;
18 15
19 void update(int val) 16 public:
20 { 17 progress(std::string message, int total) : message(message), total(total)
21 if (val <= total) 18 {
22 { 19 std::cout << message << " 0%" << std::flush;
23 cur = val; 20 }
24 } else { 21
25 cur = total; 22 void update(int val)
26 } 23 {
24 if (val <= total)
25 {
26 cur = val;
27 } else {
28 cur = total;
29 }
27 30
28 int pp = cur * 100 / total; 31 int pp = cur * 100 / total;
29 if (pp != lprint) 32 if (pp != lprint)
30 { 33 {
31 lprint = pp; 34 lprint = pp;
32 35
33 std::cout << "\b\b\b\b" << std::right; 36 std::cout << "\b\b\b\b" << std::right;
34 std::cout.width(3); 37 std::cout.width(3);
35 std::cout << pp << "%" << std::flush; 38 std::cout << pp << "%" << std::flush;
36 } 39 }
37 } 40 }
41
42 void update()
43 {
44 update(cur+1);
45 }
38 46
39 void update() 47 ~progress()
40 { 48 {
41 update(cur+1); 49 std::cout << "\b\b\b\b100%" << std::endl;
42 } 50 }
51 };
43 52
44 ~progress() 53 };
45 {
46 std::cout << "\b\b\b\b100%" << std::endl;
47 }
48}; 54};
49 55
50#endif /* end of include guard: PROGRESS_H_A34EF856 */ 56#endif /* end of include guard: PROGRESS_H_A34EF856 */
diff --git a/generator/pronunciation.cpp b/generator/pronunciation.cpp new file mode 100644 index 0000000..eb07607 --- /dev/null +++ b/generator/pronunciation.cpp
@@ -0,0 +1,87 @@
1#include "pronunciation.h"
2#include <list>
3#include <algorithm>
4#include <cctype>
5#include <iterator>
6#include "database.h"
7#include "field.h"
8#include "../lib/util.h"
9
10namespace verbly {
11 namespace generator {
12
13 int pronunciation::nextId_ = 0;
14
15 pronunciation::pronunciation(std::string phonemes) :
16 id_(nextId_++),
17 phonemes_(phonemes)
18 {
19 auto phonemeList = split<std::list<std::string>>(phonemes, " ");
20
21 auto rhymeStart = std::find_if(std::begin(phonemeList), std::end(phonemeList), [] (std::string phoneme) {
22 return phoneme.find("1") != std::string::npos;
23 });
24
25 // Rhyme detection
26 if (rhymeStart != std::end(phonemeList))
27 {
28 std::list<std::string> rhymePhonemes;
29
30 std::transform(rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), [] (std::string phoneme) {
31 std::string naked;
32
33 std::remove_copy_if(std::begin(phoneme), std::end(phoneme), std::back_inserter(naked), [] (char ch) {
34 return std::isdigit(ch);
35 });
36
37 return naked;
38 });
39
40 rhyme_ = implode(std::begin(rhymePhonemes), std::end(rhymePhonemes), " ");
41
42 if (rhymeStart != std::begin(phonemeList))
43 {
44 prerhyme_ = *std::prev(rhymeStart);
45 }
46 }
47
48 // Syllable/stress
49 for (std::string phoneme : phonemeList)
50 {
51 if (std::isdigit(phoneme.back()))
52 {
53 // It's a vowel!
54 syllables_++;
55
56 if (phoneme.back() == '1')
57 {
58 stress_.push_back('1');
59 } else {
60 stress_.push_back('0');
61 }
62 }
63 }
64 }
65
66 database& operator<<(database& db, const pronunciation& arg)
67 {
68 std::list<field> fields;
69
70 fields.emplace_back("pronunciation_id", arg.getId());
71 fields.emplace_back("phonemes", arg.getPhonemes());
72 fields.emplace_back("syllables", arg.getSyllables());
73 fields.emplace_back("stress", arg.getStress());
74
75 if (arg.hasRhyme())
76 {
77 fields.emplace_back("rhyme", arg.getRhymePhonemes());
78 fields.emplace_back("prerhyme", arg.getPrerhyme());
79 }
80
81 db.insertIntoTable("pronunciations", std::move(fields));
82
83 return db;
84 }
85
86 };
87};
diff --git a/generator/pronunciation.h b/generator/pronunciation.h new file mode 100644 index 0000000..81be6c4 --- /dev/null +++ b/generator/pronunciation.h
@@ -0,0 +1,82 @@
#ifndef PRONUNCIATION_H_584A08DD
#define PRONUNCIATION_H_584A08DD

#include <string>
#include <cassert>

namespace verbly {
  namespace generator {

    class database;

    // A pronunciation: the raw phoneme string plus data derived from it
    // once at construction (rhyme tail, pre-rhyme phoneme, syllable count,
    // stress pattern) — see pronunciation.cpp for the parsing rules.
    class pronunciation {
    public:

      // Constructor

      // phonemes is a space-separated ARPABET-style phoneme string in
      // which vowels carry a trailing stress digit (e.g. "K AE1 T").
      explicit pronunciation(std::string phonemes);

      // Accessors

      // Unique sequential id assigned at construction.
      int getId() const
      {
        return id_;
      }

      // The raw phoneme string this object was constructed with.
      std::string getPhonemes() const
      {
        return phonemes_;
      }

      // True when the pronunciation contains a primary-stressed vowel and
      // therefore has a usable rhyme tail.
      bool hasRhyme() const
      {
        return !rhyme_.empty();
      }

      // Phonemes from the primary-stressed vowel to the end of the word,
      // with the stress digits stripped.
      std::string getRhymePhonemes() const
      {
        // Calling code should always call hasRhyme first.
        assert(!rhyme_.empty());

        return rhyme_;
      }

      // The phoneme immediately preceding the rhyme (empty string when the
      // rhyme starts the word).
      std::string getPrerhyme() const
      {
        // Calling code should always call hasRhyme first.
        assert(!rhyme_.empty());

        return prerhyme_;
      }

      // Number of vowel phonemes.
      int getSyllables() const
      {
        return syllables_;
      }

      // One character per syllable, in order: '1' for primary stress,
      // '0' otherwise.
      std::string getStress() const
      {
        return stress_;
      }

    private:

      static int nextId_;  // next id to hand out; shared by all instances

      const int id_;
      const std::string phonemes_;
      std::string rhyme_;     // empty when no primary-stressed vowel exists
      std::string prerhyme_;
      int syllables_ = 0;
      std::string stress_;

    };

    // Serializer

    database& operator<<(database& db, const pronunciation& arg);

  };
};

#endif /* end of include guard: PRONUNCIATION_H_584A08DD */
diff --git a/generator/role.h b/generator/role.h new file mode 100644 index 0000000..5fa68b8 --- /dev/null +++ b/generator/role.h
@@ -0,0 +1,35 @@
1#ifndef ROLE_H_249F9A9C
2#define ROLE_H_249F9A9C
3
4#include "selrestr.h"
5
6namespace verbly {
7 namespace generator {
8
9 class role {
10 public:
11
12 // Mutators
13
14 void setSelrestrs(selrestr selrestrs)
15 {
16 selrestrs_ = selrestrs;
17 }
18
19 // Accessors
20
21 const selrestr& getSelrestrs() const
22 {
23 return selrestrs_;
24 }
25
26 private:
27
28 selrestr selrestrs_;
29
30 };
31
32 };
33};
34
35#endif /* end of include guard: ROLE_H_249F9A9C */
diff --git a/generator/schema.sql b/generator/schema.sql index 410b536..c3e54d8 100644 --- a/generator/schema.sql +++ b/generator/schema.sql
@@ -1,286 +1,204 @@
1DROP TABLE IF EXISTS `verbs`; 1CREATE TABLE `notions` (
2CREATE TABLE `verbs` ( 2 `notion_id` INTEGER PRIMARY KEY,
3 `verb_id` INTEGER PRIMARY KEY, 3 `part_of_speech` SMALLINT NOT NULL,
4 `infinitive` VARCHAR(32) NOT NULL, 4 `wnid` INTEGER,
5 `past_tense` VARCHAR(32) NOT NULL, 5 `images` INTEGER
6 `past_participle` VARCHAR(32) NOT NULL,
7 `ing_form` VARCHAR(32) NOT NULL,
8 `s_form` VARCHAR(32) NOT NULL
9); 6);
10 7
11DROP TABLE IF EXISTS `groups`; 8CREATE UNIQUE INDEX `notion_by_wnid` ON `notions`(`wnid`);
12CREATE TABLE `groups` (
13 `group_id` INTEGER PRIMARY KEY,
14 `data` BLOB NOT NULL
15);
16
17DROP TABLE IF EXISTS `frames`;
18CREATE TABLE `frames` (
19 `frame_id` INTEGER PRIMARY KEY,
20 `group_id` INTEGER NOT NULL,
21 `data` BLOB NOT NULL,
22 FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`)
23);
24 9
25DROP TABLE IF EXISTS `verb_groups`;
26CREATE TABLE `verb_groups` (
27 `verb_id` INTEGER NOT NULL,
28 `group_id` INTEGER NOT NULL,
29 FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`),
30 FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`)
31);
32
33DROP TABLE IF EXISTS `adjectives`;
34CREATE TABLE `adjectives` (
35 `adjective_id` INTEGER PRIMARY KEY,
36 `base_form` VARCHAR(32) NOT NULL,
37 `comparative` VARCHAR(32),
38 `superlative` VARCHAR(32),
39 `position` CHAR(1),
40 `complexity` INTEGER NOT NULL
41);
42
43DROP TABLE IF EXISTS `adverbs`;
44CREATE TABLE `adverbs` (
45 `adverb_id` INTEGER PRIMARY KEY,
46 `base_form` VARCHAR(32) NOT NULL,
47 `comparative` VARCHAR(32),
48 `superlative` VARCHAR(32),
49 `complexity` INTEGER NOT NULL
50);
51
52DROP TABLE IF EXISTS `nouns`;
53CREATE TABLE `nouns` (
54 `noun_id` INTEGER PRIMARY KEY,
55 `singular` VARCHAR(32) NOT NULL,
56 `plural` VARCHAR(32),
57 `proper` INTEGER(1) NOT NULL,
58 `complexity` INTEGER NOT NULL,
59 `images` INTEGER NOT NULL,
60 `wnid` INTEGER NOT NULL
61);
62
63DROP TABLE IF EXISTS `hypernymy`;
64CREATE TABLE `hypernymy` ( 10CREATE TABLE `hypernymy` (
65 `hypernym_id` INTEGER NOT NULL, 11 `hypernym_id` INTEGER NOT NULL,
66 `hyponym_id` INTEGER NOT NULL, 12 `hyponym_id` INTEGER NOT NULL
67 FOREIGN KEY (`hypernym_id`) REFERENCES `nouns`(`noun_id`),
68 FOREIGN KEY (`hyponym_id`) REFERENCES `nouns`(`noun_id`)
69); 13);
70 14
71DROP TABLE IF EXISTS `instantiation`; 15CREATE INDEX `hyponym_of` ON `hypernymy`(`hypernym_id`);
16CREATE INDEX `hypernym_of` ON `hypernymy`(`hyponym_id`);
17
72CREATE TABLE `instantiation` ( 18CREATE TABLE `instantiation` (
73 `class_id` INTEGER NOT NULL, 19 `class_id` INTEGER NOT NULL,
74 `instance_id` INTEGER NOT NULL, 20 `instance_id` INTEGER NOT NULL
75 FOREIGN KEY (`class_id`) REFERENCES `nouns`(`noun_id`),
76 FOREIGN KEY (`instance_id`) REFERENCES `nouns`(`noun_id`)
77); 21);
78 22
79DROP TABLE IF EXISTS `member_meronymy`; 23CREATE INDEX `instance_of` ON `instantiation`(`class_id`);
24CREATE INDEX `class_of` ON `instantiation`(`instance_id`);
25
80CREATE TABLE `member_meronymy` ( 26CREATE TABLE `member_meronymy` (
81 `meronym_id` INTEGER NOT NULL, 27 `meronym_id` INTEGER NOT NULL,
82 `holonym_id` INTEGER NOT NULL, 28 `holonym_id` INTEGER NOT NULL
83 FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`),
84 FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`)
85); 29);
86 30
87DROP TABLE IF EXISTS `part_meronymy`; 31CREATE INDEX `member_holonym_of` ON `member_meronymy`(`meronym_id`);
32CREATE INDEX `member_meronym_of` ON `member_meronymy`(`holonym_id`);
33
88CREATE TABLE `part_meronymy` ( 34CREATE TABLE `part_meronymy` (
89 `meronym_id` INTEGER NOT NULL, 35 `meronym_id` INTEGER NOT NULL,
90 `holonym_id` INTEGER NOT NULL, 36 `holonym_id` INTEGER NOT NULL
91 FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`),
92 FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`)
93); 37);
94 38
95DROP TABLE IF EXISTS `substance_meronymy`; 39CREATE INDEX `part_holonym_of` ON `part_meronymy`(`meronym_id`);
40CREATE INDEX `part_meronym_of` ON `part_meronymy`(`holonym_id`);
41
96CREATE TABLE `substance_meronymy` ( 42CREATE TABLE `substance_meronymy` (
97 `meronym_id` INTEGER NOT NULL, 43 `meronym_id` INTEGER NOT NULL,
98 `holonym_id` INTEGER NOT NULL, 44 `holonym_id` INTEGER NOT NULL
99 FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`),
100 FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`)
101); 45);
102 46
103DROP TABLE IF EXISTS `variation`; 47CREATE INDEX `substance_holonym_of` ON `substance_meronymy`(`meronym_id`);
48CREATE INDEX `substance_meronym_of` ON `substance_meronymy`(`holonym_id`);
49
104CREATE TABLE `variation` ( 50CREATE TABLE `variation` (
105 `noun_id` INTEGER NOT NULL, 51 `noun_id` INTEGER NOT NULL,
106 `adjective_id` INTEGER NOT NULL, 52 `adjective_id` INTEGER NOT NULL
107 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`),
108 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`)
109); 53);
110 54
111DROP TABLE IF EXISTS `noun_antonymy`; 55CREATE INDEX `variant_of` ON `variation`(`noun_id`);
112CREATE TABLE `noun_antonymy` ( 56CREATE INDEX `attribute_of` ON `variation`(`adjective_id`);
113 `noun_1_id` INTEGER NOT NULL,
114 `noun_2_id` INTEGER NOT NULL,
115 FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`),
116 FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`)
117);
118 57
119DROP TABLE IF EXISTS `adjective_antonymy`; 58CREATE TABLE `similarity` (
120CREATE TABLE `adjective_antonymy` (
121 `adjective_1_id` INTEGER NOT NULL, 59 `adjective_1_id` INTEGER NOT NULL,
122 `adjective_2_id` INTEGER NOT NULL, 60 `adjective_2_id` INTEGER NOT NULL
123 FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), 61);
124 FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) 62
63CREATE INDEX `similar_to` ON `similarity`(`adjective_1_id`);
64
65CREATE TABLE `is_a` (
66 `notion_id` INTEGER NOT NULL,
67 `groupname` VARCHAR(32) NOT NULL
125); 68);
126 69
127DROP TABLE IF EXISTS `adverb_antonymy`; 70CREATE TABLE `entailment` (
128CREATE TABLE `adverb_antonymy` ( 71 `given_id` INTEGER NOT NULL,
129 `adverb_1_id` INTEGER NOT NULL, 72 `entailment_id` INTEGER NOT NULL
130 `adverb_2_id` INTEGER NOT NULL, 73);
131 FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), 74
132 FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) 75CREATE INDEX `entailment_of` ON `entailment`(`given_id`);
76CREATE INDEX `entailed_by` ON `entailment`(`entailment_id`);
77
78CREATE TABLE `causality` (
79 `cause_id` INTEGER NOT NULL,
80 `effect_id` INTEGER NOT NULL
81);
82
83CREATE INDEX `effect_of` ON `causality`(`cause_id`);
84CREATE INDEX `cause_of` ON `causality`(`effect_id`);
85
86CREATE TABLE `words` (
87 `word_id` INTEGER PRIMARY KEY,
88 `notion_id` INTEGER NOT NULL,
89 `lemma_id` INTEGER NOT NULL,
90 `tag_count` INTEGER,
91 `position` SMALLINT,
92 `group_id` INTEGER
93);
94
95CREATE INDEX `notion_words` ON `words`(`notion_id`);
96CREATE INDEX `lemma_words` ON `words`(`lemma_id`);
97CREATE INDEX `group_words` ON `words`(`group_id`);
98
99CREATE TABLE `antonymy` (
100 `antonym_1_id` INTEGER NOT NULL,
101 `antonym_2_id` INTEGER NOT NULL
133); 102);
134 103
135DROP TABLE IF EXISTS `specification`; 104CREATE INDEX `antonym_of` ON `antonymy`(`antonym_1_id`);
105
136CREATE TABLE `specification` ( 106CREATE TABLE `specification` (
137 `general_id` INTEGER NOT NULL, 107 `general_id` INTEGER NOT NULL,
138 `specific_id` INTEGER NOT NULL, 108 `specific_id` INTEGER NOT NULL
139 FOREIGN KEY (`general_id`) REFERENCES `adjectives`(`adjective_id`),
140 FOREIGN KEY (`specific_id`) REFERENCES `adjectives`(`adjective_id`)
141); 109);
142 110
143DROP TABLE IF EXISTS `pertainymy`; 111CREATE INDEX `specification_of` ON `specification`(`general_id`);
112CREATE INDEX `generalization_of` ON `specification`(`specific_id`);
113
144CREATE TABLE `pertainymy` ( 114CREATE TABLE `pertainymy` (
145 `noun_id` INTEGER NOT NULL, 115 `noun_id` INTEGER NOT NULL,
146 `pertainym_id` INTEGER NOT NULL, 116 `pertainym_id` INTEGER NOT NULL
147 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`),
148 FOREIGN KEY (`pertainym_id`) REFERENCES `adjectives`(`adjective_id`)
149); 117);
150 118
151DROP TABLE IF EXISTS `mannernymy`; 119CREATE INDEX `pertainym_of` ON `pertainymy`(`noun_id`);
120CREATE INDEX `anti_pertainym_of` ON `pertainymy`(`pertainym_id`);
121
152CREATE TABLE `mannernymy` ( 122CREATE TABLE `mannernymy` (
153 `adjective_id` INTEGER NOT NULL, 123 `adjective_id` INTEGER NOT NULL,
154 `mannernym_id` INTEGER NOT NULL, 124 `mannernym_id` INTEGER NOT NULL
155 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`),
156 FOREIGN KEY (`mannernym_id`) REFERENCES `adverbs`(`adverb_id`)
157); 125);
158 126
159DROP TABLE IF EXISTS `noun_synonymy`; 127CREATE INDEX `mannernym_of` ON `mannernymy`(`adjective_id`);
160CREATE TABLE `noun_synonymy` ( 128CREATE INDEX `anti_mannernym_of` ON `mannernymy`(`mannernym_id`);
161 `noun_1_id` INTEGER NOT NULL,
162 `noun_2_id` INTEGER NOT NULL,
163 FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`nouns_id`),
164 FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`nouns_id`)
165);
166 129
167DROP TABLE IF EXISTS `adjective_synonymy`; 130CREATE TABLE `usage` (
168CREATE TABLE `adjective_synonymy` ( 131 `domain_id` INTEGER NOT NULL,
169 `adjective_1_id` INTEGER NOT NULL, 132 `term_id` INTEGER NOT NULL
170 `adjective_2_id` INTEGER NOT NULL,
171 FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`),
172 FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`)
173); 133);
174 134
175DROP TABLE IF EXISTS `adverb_synonymy`; 135CREATE INDEX `usage_term_of` ON `usage`(`domain_id`);
176CREATE TABLE `adverb_synonymy` ( 136CREATE INDEX `usage_domain_of` ON `usage`(`term_id`);
177 `adverb_1_id` INTEGER NOT NULL,
178 `adverb_2_id` INTEGER NOT NULL,
179 FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`),
180 FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`)
181);
182 137
183DROP TABLE IF EXISTS `noun_pronunciations`; 138CREATE TABLE `topicality` (
184CREATE TABLE `noun_pronunciations` ( 139 `domain_id` INTEGER NOT NULL,
185 `noun_id` INTEGER NOT NULL, 140 `term_id` INTEGER NOT NULL
186 `pronunciation` VARCHAR(64) NOT NULL,
187 `prerhyme` VARCHAR(8),
188 `rhyme` VARCHAR(64),
189 `syllables` INT NOT NULL,
190 `stress` VARCHAR(64) NOT NULL,
191 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`)
192); 141);
193 142
194DROP TABLE IF EXISTS `verb_pronunciations`; 143CREATE INDEX `topical_term_of` ON `topicality`(`domain_id`);
195CREATE TABLE `verb_pronunciations` ( 144CREATE INDEX `topical_domain_of` ON `topicality`(`term_id`);
196 `verb_id` INTEGER NOT NULL,
197 `pronunciation` VARCHAR(64) NOT NULL,
198 `prerhyme` VARCHAR(8),
199 `rhyme` VARCHAR(64),
200 `syllables` INT NOT NULL,
201 `stress` VARCHAR(64) NOT NULL,
202 FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`)
203);
204 145
205DROP TABLE IF EXISTS `adjective_pronunciations`; 146CREATE TABLE `regionality` (
206CREATE TABLE `adjective_pronunciations` ( 147 `domain_id` INTEGER NOT NULL,
207 `adjective_id` INTEGER NOT NULL, 148 `term_id` INTEGER NOT NULL
208 `pronunciation` VARCHAR(64) NOT NULL,
209 `prerhyme` VARCHAR(8),
210 `rhyme` VARCHAR(64),
211 `syllables` INT NOT NULL,
212 `stress` VARCHAR(64) NOT NULL,
213 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`)
214); 149);
215 150
216DROP TABLE IF EXISTS `adverb_pronunciations`; 151CREATE INDEX `regional_term_of` ON `regionality`(`domain_id`);
217CREATE TABLE `adverb_pronunciations` ( 152CREATE INDEX `regional_domain_of` ON `regionality`(`term_id`);
218 `adverb_id` INTEGER NOT NULL,
219 `pronunciation` VARCHAR(64) NOT NULL,
220 `prerhyme` VARCHAR(8),
221 `rhyme` VARCHAR(64),
222 `syllables` INT NOT NULL,
223 `stress` VARCHAR(64) NOT NULL,
224 FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`)
225);
226 153
227DROP TABLE IF EXISTS `noun_noun_derivation`; 154CREATE TABLE `forms` (
228CREATE TABLE `noun_noun_derivation` ( 155 `form_id` INTEGER PRIMARY KEY,
229 `noun_1_id` INTEGER NOT NULL, 156 `form` VARCHAR(32) NOT NULL,
230 `noun_2_id` INTEGER NOT NULL, 157 `complexity` SMALLINT NOT NULL,
231 FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), 158 `proper` SMALLINT NOT NULL
232 FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`)
233); 159);
234 160
235DROP TABLE IF EXISTS `noun_adjective_derivation`; 161CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`);
236CREATE TABLE `noun_adjective_derivation` (
237 `noun_id` INTEGER NOT NULL,
238 `adjective_id` INTEGER NOT NULL,
239 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`),
240 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`)
241);
242 162
243DROP TABLE IF EXISTS `noun_adverb_derivation`; 163CREATE TABLE `lemmas_forms` (
244CREATE TABLE `noun_adverb_derivation` ( 164 `lemma_id` INTEGER NOT NULL,
245 `noun_id` INTEGER NOT NULL, 165 `form_id` INTEGER NOT NULL,
246 `adverb_id` INTEGER NOT NULL, 166 `category` SMALLINT NOT NULL
247 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`),
248 FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`)
249); 167);
250 168
251DROP TABLE IF EXISTS `adjective_adjective_derivation`; 169CREATE INDEX `form_of` ON `lemmas_forms`(`lemma_id`);
252CREATE TABLE `adjective_adjective_derivation` ( 170CREATE INDEX `lemma_of` ON `lemmas_forms`(`form_id`);
253 `adjective_1_id` INTEGER NOT NULL, 171
254 `adjective_2_id` INTEGER NOT NULL, 172CREATE TABLE `pronunciations` (
255 FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), 173 `pronunciation_id` INTEGER PRIMARY KEY,
256 FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) 174 `phonemes` VARCHAR(64) NOT NULL,
175 `prerhyme` VARCHAR(8),
176 `rhyme` VARCHAR(64),
177 `syllables` INTEGER NOT NULL,
178 `stress` VARCHAR(64) NOT NULL
257); 179);
258 180
259DROP TABLE IF EXISTS `adjective_adverb_derivation`; 181CREATE TABLE `forms_pronunciations` (
260CREATE TABLE `adjective_adverb_derivation` ( 182 `form_id` INTEGER NOT NULL,
261 `adjective_id` INTEGER NOT NULL, 183 `pronunciation_id` INTEGER NOT NULL
262 `adverb_id` INTEGER NOT NULL,
263 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`),
264	  FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`)
265); 184);
266 185
267DROP TABLE IF EXISTS `adverb_adverb_derivation`; 186CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`);
268CREATE TABLE `adverb_adverb_derivation` ( 187CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`);
269 `adverb_1_id` INTEGER NOT NULL, 188
270 `adverb_2_id` INTEGER NOT NULL, 189CREATE TABLE `groups` (
271 FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), 190 `group_id` INTEGER PRIMARY KEY,
272 FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) 191 `data` BLOB NOT NULL
273); 192);
274 193
275DROP TABLE IF EXISTS `prepositions`; 194CREATE TABLE `frames` (
276CREATE TABLE `prepositions` ( 195 `frame_id` INTEGER PRIMARY KEY,
277 `preposition_id` INTEGER PRIMARY KEY, 196 `data` BLOB NOT NULL
278 `form` VARCHAR(32) NOT NULL
279); 197);
280 198
281DROP TABLE IF EXISTS `preposition_groups`; 199CREATE TABLE `groups_frames` (
282CREATE TABLE `preposition_groups` ( 200 `group_id` INTEGER NOT NULL,
283 `preposition_id` INTEGER NOT NULL, 201 `frame_id` INTEGER NOT NULL
284 `groupname` VARCHAR(32) NOT NULL,
285 FOREIGN KEY (`preposition_id`) REFERENCES `prepositions`(`preposition_id`)
286); 202);
203
204CREATE INDEX `frames_in` ON `groups_frames`(`group_id`);
diff --git a/generator/selrestr.cpp b/generator/selrestr.cpp new file mode 100644 index 0000000..8bdd3f6 --- /dev/null +++ b/generator/selrestr.cpp
@@ -0,0 +1,288 @@
1#include "selrestr.h"
2
3namespace verbly {
4 namespace generator {
5
6 selrestr::selrestr(const selrestr& other)
7 {
8 type_ = other.type_;
9
10 switch (type_)
11 {
12 case type::singleton:
13 {
14 singleton_.pos = other.singleton_.pos;
15 new(&singleton_.restriction) std::string(other.singleton_.restriction);
16
17 break;
18 }
19
20 case type::group:
21 {
22 new(&group_.children) std::list<selrestr>(other.group_.children);
23 group_.orlogic = other.group_.orlogic;
24
25 break;
26 }
27
28 case type::empty:
29 {
30 break;
31 }
32 }
33 }
34
    // Move constructor: delegate to the default (empty) constructor, then
    // swap with the source. Leaves other in the empty state.
    selrestr::selrestr(selrestr&& other) : selrestr()
    {
      swap(*this, other);
    }
39
    // Unified assignment via copy-and-swap: the by-value parameter is
    // copy- or move-constructed as appropriate, then swapped into *this.
    selrestr& selrestr::operator=(selrestr other)
    {
      swap(*this, other);

      return *this;
    }
46
    // Swaps two selrestrs. Because the union holds non-trivial members
    // (std::string, std::list), this cannot be a memberwise swap; instead
    // it proceeds in three phases:
    //   1. move first's active member into local temporaries,
    //   2. destroy first, then rebuild it from second's state,
    //   3. destroy second, then rebuild it from the temporaries.
    void swap(selrestr& first, selrestr& second)
    {
      using type = selrestr::type;

      type tempType = first.type_;
      int tempPos;                      // meaningful only when tempType == singleton
      std::string tempRestriction;      // meaningful only when tempType == singleton
      std::list<selrestr> tempChildren; // meaningful only when tempType == group
      bool tempOrlogic;                 // meaningful only when tempType == group

      // Phase 1: save first's state.
      switch (tempType)
      {
        case type::singleton:
        {
          tempPos = first.singleton_.pos;
          tempRestriction = std::move(first.singleton_.restriction);

          break;
        }

        case type::group:
        {
          tempChildren = std::move(first.group_.children);
          tempOrlogic = first.group_.orlogic;

          break;
        }

        case type::empty:
        {
          break;
        }
      }

      // Phase 2: destroy first's active member, then placement-new
      // construct it from second's state.
      first.~selrestr();

      first.type_ = second.type_;

      switch (first.type_)
      {
        case type::singleton:
        {
          first.singleton_.pos = second.singleton_.pos;
          new(&first.singleton_.restriction) std::string(std::move(second.singleton_.restriction));

          break;
        }

        case type::group:
        {
          new(&first.group_.children) std::list<selrestr>(std::move(second.group_.children));
          first.group_.orlogic = second.group_.orlogic;

          break;
        }

        case type::empty:
        {
          break;
        }
      }

      // Phase 3: destroy second's active member, then placement-new
      // construct it from the saved temporaries.
      second.~selrestr();

      second.type_ = tempType;

      switch (second.type_)
      {
        case type::singleton:
        {
          second.singleton_.pos = tempPos;
          new(&second.singleton_.restriction) std::string(std::move(tempRestriction));

          break;
        }

        case type::group:
        {
          new(&second.group_.children) std::list<selrestr>(std::move(tempChildren));
          second.group_.orlogic = tempOrlogic;

          break;
        }

        case type::empty:
        {
          break;
        }
      }
    }
137
138 selrestr::~selrestr()
139 {
140 switch (type_)
141 {
142 case type::singleton:
143 {
144 using string_type = std::string;
145 singleton_.restriction.~string_type();
146
147 break;
148 }
149
150 case type::group:
151 {
152 using list_type = std::list<selrestr>;
153 group_.children.~list_type();
154
155 break;
156 }
157
158 case type::empty:
159 {
160 break;
161 }
162 }
163 }
164
    // Default constructor: an empty selrestr; no union member is active.
    selrestr::selrestr() : type_(type::empty)
    {
    }
168
    // Singleton constructor: a single named restriction plus a positivity
    // flag.
    selrestr::selrestr(
      std::string restriction,
      bool pos) :
      type_(type::singleton)
    {
      // Activate the string member of the union before assigning pos.
      new(&singleton_.restriction) std::string(std::move(restriction));
      singleton_.pos = pos;
    }
177
178 std::string selrestr::getRestriction() const
179 {
180 if (type_ == type::singleton)
181 {
182 return singleton_.restriction;
183 } else {
184 throw std::domain_error("Only singleton selrestrs have restrictions");
185 }
186 }
187
188 bool selrestr::getPos() const
189 {
190 if (type_ == type::singleton)
191 {
192 return singleton_.pos;
193 } else {
194 throw std::domain_error("Only singleton selrestrs have positivity flags");
195 }
196 }
197
    // Group constructor: a list of child selrestrs joined with OR when
    // orlogic is true, AND otherwise.
    selrestr::selrestr(
      std::list<selrestr> children,
      bool orlogic) :
      type_(type::group)
    {
      // Activate the list member of the union before assigning orlogic.
      new(&group_.children) std::list<selrestr>(std::move(children));
      group_.orlogic = orlogic;
    }
206
207 std::list<selrestr> selrestr::getChildren() const
208 {
209 if (type_ == type::group)
210 {
211 return group_.children;
212 } else {
213 throw std::domain_error("Only group selrestrs have children");
214 }
215 }
216
217 std::list<selrestr>::const_iterator selrestr::begin() const
218 {
219 if (type_ == type::group)
220 {
221 return std::begin(group_.children);
222 } else {
223 throw std::domain_error("Only group selrestrs have children");
224 }
225 }
226
227 std::list<selrestr>::const_iterator selrestr::end() const
228 {
229 if (type_ == type::group)
230 {
231 return std::end(group_.children);
232 } else {
233 throw std::domain_error("Only group selrestrs have children");
234 }
235 }
236
237 bool selrestr::getOrlogic() const
238 {
239 if (type_ == type::group)
240 {
241 return group_.orlogic;
242 } else {
243 throw std::domain_error("Only group selrestrs have logic");
244 }
245 }
246
247 nlohmann::json selrestr::toJson() const
248 {
249 switch (type_)
250 {
251 case type::empty:
252 {
253 return {};
254 }
255
256 case type::singleton:
257 {
258 return {
259 {"type", singleton_.restriction},
260 {"pos", singleton_.pos}
261 };
262 }
263
264 case type::group:
265 {
266 std::string logic;
267 if (group_.orlogic)
268 {
269 logic = "or";
270 } else {
271 logic = "and";
272 }
273
274 std::list<nlohmann::json> children;
275 std::transform(std::begin(group_.children), std::end(group_.children), std::back_inserter(children), [] (const selrestr& child) {
276 return child.toJson();
277 });
278
279 return {
280 {"logic", logic},
281 {"children", children}
282 };
283 }
284 }
285 }
286
287 };
288};
diff --git a/generator/selrestr.h b/generator/selrestr.h new file mode 100644 index 0000000..5000970 --- /dev/null +++ b/generator/selrestr.h
@@ -0,0 +1,88 @@
1#ifndef SELRESTR_H_50652FB7
2#define SELRESTR_H_50652FB7
3
4#include <list>
5#include <string>
6#include <json.hpp>
7
8namespace verbly {
9 namespace generator {
10
    // A selectional restriction: a (possibly nested) boolean formula over
    // named restrictions. Each selrestr is exactly one of:
    //   - empty:     no restriction
    //   - singleton: one named restriction plus a positivity flag
    //   - group:     child selrestrs joined with AND or OR
    //
    // State is held in a tagged union whose members are non-trivial
    // (std::string, std::list), so all special member functions are
    // user-provided and manage the active member manually (selrestr.cpp).
    class selrestr {
      public:
        // Discriminator for the union below.
        enum class type {
          empty,
          singleton,
          group
        };

        // Copy and move constructors

        selrestr(const selrestr& other);
        selrestr(selrestr&& other);

        // Assignment (unified: by-value parameter enables copy-and-swap)

        selrestr& operator=(selrestr other);

        // Swap

        friend void swap(selrestr& first, selrestr& second);

        // Destructor

        ~selrestr();

        // Generic accessors

        type getType() const
        {
          return type_;
        }

        // Empty

        selrestr();

        // Singleton

        selrestr(std::string restriction, bool pos);

        // Throws std::domain_error unless getType() == type::singleton.
        std::string getRestriction() const;

        // Throws std::domain_error unless getType() == type::singleton.
        bool getPos() const;

        // Group
        // The accessors below throw std::domain_error unless
        // getType() == type::group.

        selrestr(std::list<selrestr> children, bool orlogic);

        std::list<selrestr> getChildren() const;

        std::list<selrestr>::const_iterator begin() const;

        std::list<selrestr>::const_iterator end() const;

        // True when children are joined with OR; false for AND.
        bool getOrlogic() const;

        // Helpers

        // Serializes to JSON; see selrestr.cpp for the exact shape.
        nlohmann::json toJson() const;

      private:
        // Tagged union; type_ records which member (if either) is active.
        union {
          struct {
            bool pos;
            std::string restriction;
          } singleton_;
          struct {
            std::list<selrestr> children;
            bool orlogic;
          } group_;
        };
        type type_;
    };
84
85 };
86};
87
88#endif /* end of include guard: SELRESTR_H_50652FB7 */
diff --git a/generator/word.cpp b/generator/word.cpp new file mode 100644 index 0000000..8ba3ce2 --- /dev/null +++ b/generator/word.cpp
@@ -0,0 +1,77 @@
1#include "word.h"
2#include <list>
3#include <string>
4#include "database.h"
5#include "notion.h"
6#include "lemma.h"
7#include "field.h"
8#include "group.h"
9
10namespace verbly {
11 namespace generator {
12
13 int word::nextId_ = 0;
14
    // Constructs a word pairing the given notion and lemma, with no tag
    // count. References are stored directly, so both arguments must
    // outlive this word.
    word::word(
      notion& n,
      lemma& l) :
      id_(nextId_++),
      notion_(n),
      lemma_(l)
    {
    }
23
    // Constructs a word pairing the given notion and lemma, recording a
    // corpus tag count (hasTagCount() will return true). References are
    // stored directly, so both arguments must outlive this word.
    word::word(
      notion& n,
      lemma& l,
      int tagCount) :
      id_(nextId_++),
      notion_(n),
      lemma_(l),
      tagCount_(tagCount),
      hasTagCount_(true)
    {
    }
35
    // Sets the adjective positioning; only serialized for words whose
    // notion's part of speech is adjective (see operator<< below).
    void word::setAdjectivePosition(positioning adjectivePosition)
    {
      adjectivePosition_ = adjectivePosition;
    }
40
    // Associates this word with a verb group. Only the address is stored,
    // so the group must outlive this word.
    void word::setVerbGroup(const group& verbGroup)
    {
      verbGroup_ = &verbGroup;
    }
45
46 database& operator<<(database& db, const word& arg)
47 {
48 std::list<field> fields;
49
50 fields.emplace_back("word_id", arg.getId());
51 fields.emplace_back("notion_id", arg.getNotion().getId());
52 fields.emplace_back("lemma_id", arg.getLemma().getId());
53
54 if (arg.hasTagCount())
55 {
56 fields.emplace_back("tag_count", arg.getTagCount());
57 }
58
59 if ((arg.getNotion().getPartOfSpeech() == part_of_speech::adjective)
60 && (arg.getAdjectivePosition() != positioning::undefined))
61 {
62 fields.emplace_back("position", static_cast<int>(arg.getAdjectivePosition()));
63 }
64
65 if ((arg.getNotion().getPartOfSpeech() == part_of_speech::verb)
66 && (arg.hasVerbGroup()))
67 {
68 fields.emplace_back("group_id", arg.getVerbGroup().getId());
69 }
70
71 db.insertIntoTable("words", std::move(fields));
72
73 return db;
74 }
75
76 };
77};
diff --git a/generator/word.h b/generator/word.h new file mode 100644 index 0000000..bfed586 --- /dev/null +++ b/generator/word.h
@@ -0,0 +1,110 @@
1#ifndef WORD_H_91F99D46
2#define WORD_H_91F99D46
3
4#include <cassert>
5#include "enums.h"
6
7namespace verbly {
8 namespace generator {
9
10 class notion;
11 class lemma;
12 class database;
13 class group;
14
    // A word: the pairing of a notion (meaning) with a lemma (base form),
    // plus optional per-word data (tag count, adjective position, verb
    // group). Holds references to its notion and lemma, which must
    // outlive it. Ids are assigned sequentially at construction time.
    class word {
    public:

      // Constructors

      word(notion& n, lemma& l);

      word(notion& n, lemma& l, int tagCount);

      // Mutators

      void setAdjectivePosition(positioning adjectivePosition);

      // Stores only a pointer; the group must outlive this word.
      void setVerbGroup(const group& verbGroup);

      // Accessors

      int getId() const
      {
        return id_;
      }

      notion& getNotion()
      {
        return notion_;
      }

      const notion& getNotion() const
      {
        return notion_;
      }

      lemma& getLemma()
      {
        return lemma_;
      }

      const lemma& getLemma() const
      {
        return lemma_;
      }

      bool hasTagCount() const
      {
        return hasTagCount_;
      }

      int getTagCount() const
      {
        // Calling code should always call hasTagCount first.
        assert(hasTagCount_);

        return tagCount_;
      }

      // Returns positioning::undefined when never set.
      positioning getAdjectivePosition() const
      {
        return adjectivePosition_;
      }

      bool hasVerbGroup() const
      {
        return (verbGroup_ != nullptr);
      }

      const group& getVerbGroup() const
      {
        // Calling code should always call hasVerbGroup first.
        assert(verbGroup_ != nullptr);

        return *verbGroup_;
      }

    private:

      // Source of sequential ids, shared by both constructors.
      static int nextId_;

      const int id_;
      notion& notion_;
      lemma& lemma_;
      const int tagCount_ = 0;          // meaningful only when hasTagCount_
      const bool hasTagCount_ = false;

      positioning adjectivePosition_ = positioning::undefined;
      const group* verbGroup_ = nullptr; // non-owning; may be null

    };
102
103 // Serializer
104
105 database& operator<<(database& db, const word& arg);
106
107 };
108};
109
110#endif /* end of include guard: WORD_H_91F99D46 */