path: root/generator
Diffstat (limited to 'generator')
-rw-r--r--   generator/CMakeLists.txt        6
-rw-r--r--   generator/database.cpp        173
-rw-r--r--   generator/database.h           73
-rw-r--r--   generator/field.cpp           193
-rw-r--r--   generator/field.h              76
-rw-r--r--   generator/form.cpp             53
-rw-r--r--   generator/form.h               71
-rw-r--r--   generator/frame.cpp            83
-rw-r--r--   generator/frame.h              59
-rw-r--r--   generator/generator.cpp      3145
-rw-r--r--   generator/generator.h         151
-rw-r--r--   generator/group.cpp           119
-rw-r--r--   generator/group.h              80
-rw-r--r--   generator/lemma.cpp            65
-rw-r--r--   generator/lemma.h              58
-rw-r--r--   generator/main.cpp             40
-rw-r--r--   generator/notion.cpp           85
-rw-r--r--   generator/notion.h             91
-rw-r--r--   generator/part.cpp            336
-rw-r--r--   generator/part.h              114
-rw-r--r--   generator/progress.h           78
-rw-r--r--   generator/pronunciation.cpp    87
-rw-r--r--   generator/pronunciation.h      82
-rw-r--r--   generator/role.h               35
-rw-r--r--   generator/schema.sql          352
-rw-r--r--   generator/selrestr.cpp        288
-rw-r--r--   generator/selrestr.h           88
-rw-r--r--   generator/word.cpp             77
-rw-r--r--   generator/word.h              110
29 files changed, 4018 insertions, 2250 deletions
diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt
index 552526d..4f78eb8 100644
--- a/generator/CMakeLists.txt
+++ b/generator/CMakeLists.txt
@@ -1,12 +1,12 @@
-cmake_minimum_required (VERSION 2.6)
+cmake_minimum_required (VERSION 3.1)
 project (generator)
 
 find_package(PkgConfig)
 pkg_check_modules(sqlite3 sqlite3 REQUIRED)
 find_package(libxml2 REQUIRED)
 
-include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json/src)
-add_executable(generator generator.cpp)
+include_directories(${sqlite3_INCLUDE_DIR} ${LIBXML2_INCLUDE_DIR} ../vendor/json)
+add_executable(generator notion.cpp word.cpp lemma.cpp form.cpp pronunciation.cpp group.cpp frame.cpp part.cpp selrestr.cpp database.cpp field.cpp generator.cpp main.cpp)
 set_property(TARGET generator PROPERTY CXX_STANDARD 11)
 set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON)
 target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES})
diff --git a/generator/database.cpp b/generator/database.cpp
new file mode 100644
index 0000000..c7e4cfa
--- /dev/null
+++ b/generator/database.cpp
@@ -0,0 +1,173 @@
1#include "database.h"
2#include <sqlite3.h>
3#include <cassert>
4#include <fstream>
5#include <stdexcept>
6#include <cstdio>
7#include <sstream>
8#include "field.h"
9#include "../lib/util.h"
10
11namespace verbly {
12 namespace generator {
13
14 sqlite3_error::sqlite3_error(
15 const std::string& what,
16 const std::string& db_err) :
17 what_(what + " (" + db_err + ")"),
18 db_err_(db_err)
19 {
20 }
21
22 const char* sqlite3_error::what() const noexcept
23 {
24 return what_.c_str();
25 }
26
27 const char* sqlite3_error::db_err() const noexcept
28 {
29 return db_err_.c_str();
30 }
31
32 database::database(std::string path)
33 {
34 // If there is already a file at this path, overwrite it.
35 if (std::ifstream(path))
36 {
37 if (std::remove(path.c_str()))
38 {
39 throw std::logic_error("Could not overwrite file at path");
40 }
41 }
42
43 if (sqlite3_open_v2(path.c_str(), &ppdb_, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK)
44 {
45 // We still have to free the resources allocated. In the event that
46 // allocation failed, ppdb will be null and sqlite3_close_v2 will just
47 // ignore it.
48 std::string errmsg(sqlite3_errmsg(ppdb_));
49 sqlite3_close_v2(ppdb_);
50
51 throw sqlite3_error("Could not create output datafile", errmsg);
52 }
53 }
54
55 database::database(database&& other) : database()
56 {
57 swap(*this, other);
58 }
59
60 database& database::operator=(database&& other)
61 {
62 swap(*this, other);
63
64 return *this;
65 }
66
67 void swap(database& first, database& second)
68 {
69 std::swap(first.ppdb_, second.ppdb_);
70 }
71
72 database::~database()
73 {
74 sqlite3_close_v2(ppdb_);
75 }
76
77 void database::runQuery(std::string query)
78 {
79 // This can only happen when doing bad things with move semantics.
80 assert(ppdb_ != nullptr);
81
82 sqlite3_stmt* ppstmt;
83
84 if (sqlite3_prepare_v2(ppdb_, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
85 {
86 throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_));
87 }
88
89 int result = sqlite3_step(ppstmt);
90 sqlite3_finalize(ppstmt);
91
92 if (result != SQLITE_DONE)
93 {
94 throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_));
95 }
96 }
97
98 void database::insertIntoTable(std::string table, std::list<field> fields)
99 {
100 // This can only happen when doing bad things with move semantics.
101 assert(ppdb_ != nullptr);
102
103 // This shouldn't happen.
104 assert(!fields.empty());
105
106 std::list<std::string> fieldNames;
107 std::list<std::string> qs;
108 for (field& f : fields)
109 {
110 fieldNames.push_back(f.getName());
111 qs.push_back("?");
112 }
113
114 std::ostringstream query;
115 query << "INSERT INTO ";
116 query << table;
117 query << " (";
118 query << implode(std::begin(fieldNames), std::end(fieldNames), ", ");
119 query << ") VALUES (";
120 query << implode(std::begin(qs), std::end(qs), ", ");
121 query << ")";
122
123 std::string query_str = query.str();
124
125 sqlite3_stmt* ppstmt;
126
127 if (sqlite3_prepare_v2(ppdb_, query_str.c_str(), query_str.length(), &ppstmt, NULL) != SQLITE_OK)
128 {
129 throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_));
130 }
131
132 int i = 1;
133 for (field& f : fields)
134 {
135 switch (f.getType())
136 {
137 case field::type::integer:
138 {
139 sqlite3_bind_int(ppstmt, i, f.getInteger());
140
141 break;
142 }
143
144 case field::type::string:
145 {
146 sqlite3_bind_text(ppstmt, i, f.getString().c_str(), f.getString().length(), SQLITE_TRANSIENT);
147
148 break;
149 }
150
151 case field::type::invalid:
152 {
153 // Fields can only be invalid when doing bad things with move semantics.
154 assert(false);
155
156 break;
157 }
158 }
159
160 i++;
161 }
162
163 int result = sqlite3_step(ppstmt);
164 sqlite3_finalize(ppstmt);
165
166 if (result != SQLITE_DONE)
167 {
168 throw sqlite3_error("Error writing to database", sqlite3_errmsg(ppdb_));
169 }
170 }
171
172 };
173};
diff --git a/generator/database.h b/generator/database.h
new file mode 100644
index 0000000..15cdff5
--- /dev/null
+++ b/generator/database.h
@@ -0,0 +1,73 @@
1#ifndef DATABASE_H_0B0A47D2
2#define DATABASE_H_0B0A47D2
3
4#include <string>
5#include <exception>
6#include <list>
7
8struct sqlite3;
9
10namespace verbly {
11 namespace generator {
12
13 class field;
14
15 class sqlite3_error : public std::exception {
16 public:
17
18 sqlite3_error(const std::string& what, const std::string& db_err);
19
20 const char* what() const noexcept override;
21 const char* db_err() const noexcept;
22
23 private:
24 std::string what_;
25 std::string db_err_;
26
27 };
28
29 class database {
30 public:
31
32 // Constructor
33
34 explicit database(std::string path);
35
36 // Disable copying
37
38 database(const database& other) = delete;
39 database& operator=(const database& other) = delete;
40
41 // Move constructor and move assignment
42
43 database(database&& other);
44 database& operator=(database&& other);
45
46 // Swap
47
48 friend void swap(database& first, database& second);
49
50 // Destructor
51
52 ~database();
53
54 // Actions
55
56 void runQuery(std::string query);
57
58 void insertIntoTable(std::string table, std::list<field> fields);
59
60 private:
61
62 database()
63 {
64 }
65
66 sqlite3* ppdb_ = nullptr;
67
68 };
69
70 };
71};
72
73#endif /* end of include guard: DATABASE_H_0B0A47D2 */
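The database and field classes above wrap the raw sqlite3 C API behind RAII and exceptions. The following is an illustrative sketch only, not part of the commit: it assumes database.h and field.h are on the include path, and the datafile name and table definition are made up (the generator itself creates its tables from generator/schema.sql).

#include "database.h"
#include "field.h"
#include <list>

int main()
{
  using namespace verbly::generator;

  // Opens (and, if necessary, overwrites) the output datafile.
  database db("example.sqlite3");

  // runQuery executes a single self-contained statement.
  db.runQuery("CREATE TABLE forms (form_id INTEGER, form TEXT)");

  // insertIntoTable builds "INSERT INTO forms (form_id, form) VALUES (?, ?)"
  // and binds each field as a parameter according to its type.
  std::list<field> fields;
  fields.emplace_back("form_id", 1);
  fields.emplace_back("form", std::string("example"));
  db.insertIntoTable("forms", std::move(fields));

  return 0;
}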
diff --git a/generator/field.cpp b/generator/field.cpp
new file mode 100644
index 0000000..84b2f91
--- /dev/null
+++ b/generator/field.cpp
@@ -0,0 +1,193 @@
1#include "field.h"
2#include <stdexcept>
3#include <utility>
4
5namespace verbly {
6 namespace generator {
7
8 field::field(const field& other)
9 {
10 type_ = other.type_;
11 name_ = other.name_;
12
13 switch (type_)
14 {
15 case type::integer:
16 {
17 integer_ = other.integer_;
18
19 break;
20 }
21
22 case type::string:
23 {
24 new(&string_) std::string(other.string_);
25
26 break;
27 }
28
29 case type::invalid:
30 {
31 break;
32 }
33 }
34 }
35
36 field::field(field&& other) : field()
37 {
38 swap(*this, other);
39 }
40
41 field& field::operator=(field other)
42 {
43 swap(*this, other);
44
45 return *this;
46 }
47
48 void swap(field& first, field& second)
49 {
50 using type = field::type;
51
52 type tempType = first.type_;
53 std::string tempName = std::move(first.name_);
54 int tempInteger;
55 std::string tempString;
56
57 switch (first.type_)
58 {
59 case type::integer:
60 {
61 tempInteger = first.integer_;
62
63 break;
64 }
65
66 case type::string:
67 {
68 tempString = std::move(first.string_);
69
70 break;
71 }
72
73 case type::invalid:
74 {
75 break;
76 }
77 }
78
79 first.~field();
80
81 first.type_ = second.type_;
82 first.name_ = std::move(second.name_);
83
84 switch (second.type_)
85 {
86 case type::integer:
87 {
88 first.integer_ = second.integer_;
89
90 break;
91 }
92
93 case type::string:
94 {
95 new(&first.string_) std::string(std::move(second.string_));
96
97 break;
98 }
99
100 case type::invalid:
101 {
102 break;
103 }
104 }
105
106 second.~field();
107
108 second.type_ = tempType;
109 second.name_ = std::move(tempName);
110
111 switch (tempType)
112 {
113 case type::integer:
114 {
115 second.integer_ = tempInteger;
116
117 break;
118 }
119
120 case type::string:
121 {
122 new(&second.string_) std::string(std::move(tempString));
123
124 break;
125 }
126
127 case type::invalid:
128 {
129 break;
130 }
131 }
132 }
133
134 field::~field()
135 {
136 switch (type_)
137 {
138 case type::string:
139 {
140 using string_type = std::string;
141 string_.~string_type();
142
143 break;
144 }
145
146 case type::integer:
147 case type::invalid:
148 {
149 break;
150 }
151 }
152 }
153
154 field::field(
155 std::string name,
156 int arg) :
157 type_(type::integer),
158 name_(name),
159 integer_(arg)
160 {
161 }
162
163 int field::getInteger() const
164 {
165 if (type_ != type::integer)
166 {
167 throw std::domain_error("field::getInteger called on non-integer field");
168 }
169
170 return integer_;
171 }
172
173 field::field(
174 std::string name,
175 std::string arg) :
176 type_(type::string),
177 name_(name)
178 {
179 new(&string_) std::string(arg);
180 }
181
182 std::string field::getString() const
183 {
184 if (type_ != type::string)
185 {
186 throw std::domain_error("field::getString called on non-string field");
187 }
188
189 return string_;
190 }
191
192 };
193};
diff --git a/generator/field.h b/generator/field.h
new file mode 100644
index 0000000..1fbabfc
--- /dev/null
+++ b/generator/field.h
@@ -0,0 +1,76 @@
1#ifndef BINDING_H_CAE0B18E
2#define BINDING_H_CAE0B18E
3
4#include <string>
5
6namespace verbly {
7 namespace generator {
8
9 class field {
10 public:
11 enum class type {
12 invalid,
13 integer,
14 string
15 };
16
17 // Copy and move constructors
18
19 field(const field& other);
20 field(field&& other);
21
22 // Assignment
23
24 field& operator=(field other);
25
26 // Swap
27
28 friend void swap(field& first, field& second);
29
30 // Destructor
31
32 ~field();
33
34 // Generic accessors
35
36 type getType() const
37 {
38 return type_;
39 }
40
41 std::string getName() const
42 {
43 return name_;
44 }
45
46 // Integer
47
48 field(std::string name, int arg);
49
50 int getInteger() const;
51
52 // String
53
54 field(std::string name, std::string arg);
55
56 std::string getString() const;
57
58 private:
59
60 field()
61 {
62 }
63
64 union {
65 int integer_;
66 std::string string_;
67 };
68
69 type type_ = type::invalid;
70 std::string name_;
71 };
72
73 };
74};
75
76#endif /* end of include guard: BINDING_H_CAE0B18E */
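Because field keeps its value in a union with a non-trivial std::string member, copying, assignment, and destruction are all routed through the tagged copy-and-swap machinery shown above. A small sketch of how that behaves in practice (the field names and values here are hypothetical):

#include "field.h"
#include <cassert>

int main()
{
  using verbly::generator::field;

  field num("complexity", 2);            // integer-typed field
  field txt("form", std::string("cat")); // string-typed field

  // Copy assignment goes through operator=(field other), which copies the
  // right-hand side and swaps it into place.
  num = txt;

  assert(num.getType() == field::type::string);
  assert(num.getString() == "cat");
  assert(txt.getName() == "form"); // the source field is unchanged

  return 0;
}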
diff --git a/generator/form.cpp b/generator/form.cpp
new file mode 100644
index 0000000..6be9d47
--- /dev/null
+++ b/generator/form.cpp
@@ -0,0 +1,53 @@
1#include "form.h"
2#include <algorithm>
3#include <list>
4#include "database.h"
5#include "field.h"
6#include "pronunciation.h"
7
8namespace verbly {
9 namespace generator {
10
11 int form::nextId_ = 0;
12
13 form::form(std::string text) :
14 id_(nextId_++),
15 text_(text),
16 complexity_(std::count(std::begin(text), std::end(text), ' ') + 1),
17 proper_(std::any_of(std::begin(text), std::end(text), std::isupper))
18 {
19 }
20
21 void form::addPronunciation(const pronunciation& p)
22 {
23 pronunciations_.insert(&p);
24 }
25
26 database& operator<<(database& db, const form& arg)
27 {
28 // Serialize the form first.
29 {
30 std::list<field> fields;
31 fields.emplace_back("form_id", arg.getId());
32 fields.emplace_back("form", arg.getText());
33 fields.emplace_back("complexity", arg.getComplexity());
34 fields.emplace_back("proper", arg.isProper());
35
36 db.insertIntoTable("forms", std::move(fields));
37 }
38
39 // Then, serialize the form/pronunciation relationship.
40 for (const pronunciation* p : arg.getPronunciations())
41 {
42 std::list<field> fields;
43 fields.emplace_back("form_id", arg.getId());
44 fields.emplace_back("pronunciation_id", p->getId());
45
46 db.insertIntoTable("forms_pronunciations", std::move(fields));
47 }
48
49 return db;
50 }
51
52 };
53};
diff --git a/generator/form.h b/generator/form.h
new file mode 100644
index 0000000..5576035
--- /dev/null
+++ b/generator/form.h
@@ -0,0 +1,71 @@
1#ifndef FORM_H_7EFBC970
2#define FORM_H_7EFBC970
3
4#include <string>
5#include <set>
6
7namespace verbly {
8 namespace generator {
9
10 class pronunciation;
11 class database;
12
13 class form {
14 public:
15
16 // Constructor
17
18 explicit form(std::string text);
19
20 // Mutators
21
22 void addPronunciation(const pronunciation& p);
23
24 // Accessors
25
26 int getId() const
27 {
28 return id_;
29 }
30
31 std::string getText() const
32 {
33 return text_;
34 }
35
36 int getComplexity() const
37 {
38 return complexity_;
39 }
40
41 bool isProper() const
42 {
43 return proper_;
44 }
45
46 std::set<const pronunciation*> getPronunciations() const
47 {
48 return pronunciations_;
49 }
50
51 private:
52
53 static int nextId_;
54
55 const int id_;
56 const std::string text_;
57 const int complexity_;
58 const bool proper_;
59
60 std::set<const pronunciation*> pronunciations_;
61
62 };
63
64 // Serializer
65
66 database& operator<<(database& db, const form& arg);
67
68 };
69};
70
71#endif /* end of include guard: FORM_H_7EFBC970 */
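A form is written out through the operator<< serializer declared above, which inserts one row into forms plus one row per linked pronunciation into forms_pronunciations. A hedged usage sketch follows; the CREATE TABLE statement is a stand-in for the real definition in generator/schema.sql, and the words used are arbitrary examples.

#include "database.h"
#include "form.h"

int main()
{
  using namespace verbly::generator;

  database db("example.sqlite3");
  db.runQuery(
    "CREATE TABLE forms ("
    "form_id INTEGER, form TEXT, complexity INTEGER, proper INTEGER)");

  form cat("cat");          // complexity 1, not proper
  form newYork("New York"); // complexity 2, proper

  // operator<< returns the database, so serialization can be chained.
  db << cat << newYork;

  return 0;
}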
diff --git a/generator/frame.cpp b/generator/frame.cpp
new file mode 100644
index 0000000..9f0653f
--- /dev/null
+++ b/generator/frame.cpp
@@ -0,0 +1,83 @@
1#include "frame.h"
2#include "database.h"
3#include "field.h"
4
5namespace verbly {
6 namespace generator {
7
8 int frame::nextId_ = 0;
9
10 frame::frame() : id_(nextId_++)
11 {
12 }
13
14 void frame::push_back(part fp)
15 {
16 parts_.push_back(std::move(fp));
17 }
18
19 database& operator<<(database& db, const frame& arg)
20 {
21 std::list<field> fields;
22 fields.emplace_back("frame_id", arg.getId());
23
24 nlohmann::json jsonParts;
25 for (const part& p : arg)
26 {
27 nlohmann::json jsonPart;
28 jsonPart["type"] = static_cast<int>(p.getType());
29
30 switch (p.getType())
31 {
32 case part::type::noun_phrase:
33 {
34 jsonPart["role"] = p.getNounRole();
35 jsonPart["selrestrs"] = p.getNounSelrestrs().toJson();
36 jsonPart["synrestrs"] = p.getNounSynrestrs();
37
38 break;
39 }
40
41 case part::type::preposition:
42 {
43 jsonPart["choices"] = p.getPrepositionChoices();
44 jsonPart["literal"] = p.isPrepositionLiteral();
45
46 break;
47 }
48
49 case part::type::literal:
50 {
51 jsonPart["value"] = p.getLiteralValue();
52
53 break;
54 }
55
56 case part::type::verb:
57 case part::type::adjective:
58 case part::type::adverb:
59 {
60 break;
61 }
62
63 case part::type::invalid:
64 {
65 // Invalid parts should not be serialized.
66 assert(false);
67
68 break;
69 }
70 }
71
72 jsonParts.emplace_back(std::move(jsonPart));
73 }
74
75 fields.emplace_back("data", jsonParts.dump());
76
77 db.insertIntoTable("frames", std::move(fields));
78
79 return db;
80 }
81
82 };
83};
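The frame serializer above flattens each part into a JSON object and stores the resulting array in the frames table's data column. Below is a standalone sketch of that shape built with nlohmann::json directly; the type number, role, and restriction names are hypothetical examples rather than values taken from this commit.

#include <json.hpp> // nlohmann::json, vendored under ../vendor/json
#include <iostream>

int main()
{
  nlohmann::json jsonPart;
  jsonPart["type"] = 1; // hypothetical numeric value of part::type::noun_phrase
  jsonPart["role"] = "Agent";
  jsonPart["selrestrs"] = {{"type", "animate"}, {"pos", true}};
  jsonPart["synrestrs"] = {"np_ing", "that_comp"};

  nlohmann::json jsonParts;
  jsonParts.emplace_back(std::move(jsonPart));

  // This string is roughly what would be stored in the data column.
  std::cout << jsonParts.dump() << std::endl;

  return 0;
}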
diff --git a/generator/frame.h b/generator/frame.h
new file mode 100644
index 0000000..411ce6c
--- /dev/null
+++ b/generator/frame.h
@@ -0,0 +1,59 @@
1#ifndef FRAME_H_26770FF1
2#define FRAME_H_26770FF1
3
4#include <list>
5#include "part.h"
6
7namespace verbly {
8 namespace generator {
9
10 class database;
11
12 class frame {
13 public:
14
15 // Aliases
16
17 using const_iterator = std::list<part>::const_iterator;
18
19 // Constructor
20
21 frame();
22
23 // Mutators
24
25 void push_back(part fp);
26
27 // Accessors
28
29 int getId() const
30 {
31 return id_;
32 }
33
34 const_iterator begin() const
35 {
36 return std::begin(parts_);
37 }
38
39 const_iterator end() const
40 {
41 return std::end(parts_);
42 }
43
44 private:
45
46 static int nextId_;
47
48 const int id_;
49
50 std::list<part> parts_;
51
52 };
53
54 database& operator<<(database& db, const frame& arg);
55
56 };
57};
58
59#endif /* end of include guard: FRAME_H_26770FF1 */
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 6a16467..d88cb31 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -1,2320 +1,1477 @@
1#include <libxml/parser.h> 1#include "generator.h"
2#include <cassert>
3#include <stdexcept>
2#include <iostream> 4#include <iostream>
5#include <regex>
3#include <dirent.h> 6#include <dirent.h>
4#include <set>
5#include <map>
6#include <string>
7#include <vector>
8#include <fstream> 7#include <fstream>
9#include <sqlite3.h> 8#include "enums.h"
10#include <sstream>
11#include <regex>
12#include <list>
13#include <algorithm>
14#include <json.hpp>
15#include "progress.h" 9#include "progress.h"
10#include "selrestr.h"
11#include "role.h"
12#include "part.h"
13#include "field.h"
16#include "../lib/util.h" 14#include "../lib/util.h"
17 15
18using json = nlohmann::json; 16namespace verbly {
19 17 namespace generator {
20struct verb_t {
21 std::string infinitive;
22 std::string past_tense;
23 std::string past_participle;
24 std::string ing_form;
25 std::string s_form;
26 int id;
27};
28
29struct adjective_t {
30 std::string base;
31 std::string comparative;
32 std::string superlative;
33};
34
35struct noun_t {
36 std::string singular;
37 std::string plural;
38};
39
40struct selrestr_t {
41 enum class type_t {
42 singleton,
43 andlogic,
44 orlogic,
45 empty
46 };
47 type_t type;
48 std::string restriction;
49 bool pos;
50 std::list<selrestr_t> subordinates;
51};
52
53struct framepart_t {
54 enum class type_t {
55 np,
56 v,
57 pp,
58 adj,
59 adv,
60 lex
61 };
62 type_t type;
63 std::string role;
64 selrestr_t selrestrs;
65 std::set<std::string> preprestrs;
66 std::set<std::string> synrestrs;
67 std::list<std::string> choices;
68 std::string lexval;
69};
70
71struct group_t {
72 std::string id;
73 std::string parent;
74 std::set<std::string> members;
75 std::map<std::string, selrestr_t> roles;
76 std::list<std::list<framepart_t>> frames;
77};
78
79struct pronunciation_t {
80 std::string phonemes;
81 std::string prerhyme;
82 std::string rhyme;
83 int syllables = 0;
84 std::string stress;
85
86 bool operator<(const pronunciation_t& other) const
87 {
88 return phonemes < other.phonemes;
89 }
90};
91
92std::map<std::string, group_t> groups;
93std::map<std::string, verb_t> verbs;
94std::map<std::string, adjective_t> adjectives;
95std::map<std::string, noun_t> nouns;
96std::map<int, std::map<int, int>> wn;
97std::map<int, int> images;
98std::map<std::string, std::set<pronunciation_t>> pronunciations;
99
100void print_usage()
101{
102 std::cout << "Verbly Datafile Generator" << std::endl;
103 std::cout << "-------------------------" << std::endl;
104 std::cout << "Requires exactly six arguments." << std::endl;
105 std::cout << "1. The path to a VerbNet data directory." << std::endl;
106 std::cout << "2. The path to an AGID infl.txt file." << std::endl;
107 std::cout << "3. The path to a WordNet prolog data directory." << std::endl;
108 std::cout << "4. The path to a CMUDICT pronunciation file." << std::endl;
109 std::cout << "5. The path to an ImageNet urls.txt file." << std::endl;
110 std::cout << "6. Datafile output path." << std::endl;
111
112 exit(1);
113}
114
115void db_error(sqlite3* ppdb, std::string query)
116{
117 std::cout << "Error writing to output database: " << sqlite3_errmsg(ppdb) << std::endl;
118 std::cout << query << std::endl;
119 sqlite3_close_v2(ppdb);
120 print_usage();
121}
122
123json export_selrestrs(selrestr_t r)
124{
125 if (r.type == selrestr_t::type_t::empty)
126 {
127 return {};
128 } else if (r.type == selrestr_t::type_t::singleton)
129 {
130 json result;
131 result["type"] = r.restriction;
132 result["pos"] = r.pos;
133 return result;
134 } else {
135 json result;
136 if (r.type == selrestr_t::type_t::andlogic)
137 {
138 result["logic"] = "and";
139 } else {
140 result["logic"] = "or";
141 }
142
143 std::list<json> outlist;
144 std::transform(std::begin(r.subordinates), std::end(r.subordinates), std::back_inserter(outlist), &export_selrestrs);
145 result["children"] = outlist;
146 18
147 return result; 19 generator::generator(
148 } 20 std::string verbNetPath,
149} 21 std::string agidPath,
150 22 std::string wordNetPath,
151selrestr_t parse_selrestrs(xmlNodePtr top, std::string filename) 23 std::string cmudictPath,
152{ 24 std::string imageNetPath,
153 selrestr_t r; 25 std::string outputPath) :
154 xmlChar* key; 26 verbNetPath_(verbNetPath),
155 27 agidPath_(agidPath),
156 if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTRS")) 28 wordNetPath_(wordNetPath),
157 { 29 cmudictPath_(cmudictPath),
158 if (xmlChildElementCount(top) == 0) 30 imageNetPath_(imageNetPath),
31 db_(outputPath)
159 { 32 {
160 r.type = selrestr_t::type_t::empty; 33 // Ensure VerbNet directory exists
161 } else if (xmlChildElementCount(top) == 1) 34 DIR* dir;
162 { 35 if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
163 r = parse_selrestrs(xmlFirstElementChild(top), filename);
164 } else {
165 r.type = selrestr_t::type_t::andlogic;
166
167 if (xmlHasProp(top, (const xmlChar*) "logic"))
168 { 36 {
169 key = xmlGetProp(top, (const xmlChar*) "logic"); 37 throw std::invalid_argument("Invalid VerbNet data directory");
170 if (!xmlStrcmp(key, (const xmlChar*) "or"))
171 {
172 r.type = selrestr_t::type_t::orlogic;
173 }
174 xmlFree(key);
175 } 38 }
176 39
177 for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next) 40 closedir(dir);
41
42 // Ensure AGID infl.txt exists
43 if (!std::ifstream(agidPath_))
178 { 44 {
179 if (!xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTRS") || !xmlStrcmp(selrestr->name, (const xmlChar*) "SELRESTR")) 45 throw std::invalid_argument("AGID infl.txt file not found");
180 {
181 r.subordinates.push_back(parse_selrestrs(selrestr, filename));
182 }
183 } 46 }
184 } 47
185 } else if (!xmlStrcmp(top->name, (const xmlChar*) "SELRESTR")) 48 // Add directory separator to WordNet path
186 { 49 if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\'))
187 r.type = selrestr_t::type_t::singleton;
188
189 key = xmlGetProp(top, (xmlChar*) "Value");
190 r.pos = (std::string((const char*)key) == "+");
191 xmlFree(key);
192
193 key = xmlGetProp(top, (xmlChar*) "type");
194 r.restriction = (const char*) key;
195 xmlFree(key);
196 } else {
197 // Invalid
198 std::cout << "Bad VerbNet file format: " << filename << std::endl;
199 print_usage();
200 }
201
202 return r;
203}
204
205group_t& parse_group(xmlNodePtr top, std::string filename)
206{
207 xmlChar* key = xmlGetProp(top, (xmlChar*) "ID");
208 if (key == 0)
209 {
210 std::cout << "Bad VerbNet file format: " << filename << std::endl;
211 print_usage();
212 }
213 std::string vnid = (const char*)key;
214 vnid = vnid.substr(vnid.find_first_of("-")+1);
215 xmlFree(key);
216
217 group_t g;
218 g.id = vnid;
219
220 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
221 {
222 if (!xmlStrcmp(node->name, (const xmlChar*) "SUBCLASSES"))
223 {
224 for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next)
225 { 50 {
226 if (!xmlStrcmp(subclass->name, (const xmlChar*) "VNSUBCLASS")) 51 wordNetPath_ += '/';
227 {
228 auto& sg = parse_group(subclass, filename);
229 sg.parent = vnid;
230
231 for (auto member : sg.members)
232 {
233 g.members.insert(member);
234 }
235
236 // The schema requires that subclasses appear after role definitions, so we can do this now
237 for (auto role : g.roles)
238 {
239 if (sg.roles.count(role.first) == 0)
240 {
241 sg.roles[role.first] = role.second;
242 }
243 }
244 }
245 } 52 }
246 } else if (!xmlStrcmp(node->name, (const xmlChar*) "MEMBERS")) 53
247 { 54 // Ensure WordNet tables exist
248 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next) 55 for (std::string table : {
56 "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax"
57 })
249 { 58 {
250 if (!xmlStrcmp(member->name, (const xmlChar*) "MEMBER")) 59 if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl"))
251 { 60 {
252 key = xmlGetProp(member, (xmlChar*) "name"); 61 throw std::invalid_argument("WordNet " + table + " table not found");
253 g.members.insert((const char*)key);
254 xmlFree(key);
255 } 62 }
256 } 63 }
257 } else if (!xmlStrcmp(node->name, (const xmlChar*) "THEMROLES")) 64
258 { 65 // Ensure CMUDICT file exists
259 for (xmlNodePtr role = node->xmlChildrenNode; role != nullptr; role = role->next) 66 if (!std::ifstream(cmudictPath_))
260 { 67 {
261 if (!xmlStrcmp(role->name, (const xmlChar*) "THEMROLE")) 68 throw std::invalid_argument("CMUDICT file not found");
262 {
263 selrestr_t r;
264 r.type = selrestr_t::type_t::empty;
265
266 key = xmlGetProp(role, (const xmlChar*) "type");
267 std::string type = (const char*)key;
268 xmlFree(key);
269
270 for (xmlNodePtr rolenode = role->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
271 {
272 if (!xmlStrcmp(rolenode->name, (const xmlChar*) "SELRESTRS"))
273 {
274 r = parse_selrestrs(rolenode, filename);
275 }
276 }
277
278 g.roles[type] = r;
279 }
280 } 69 }
281 } else if (!xmlStrcmp(node->name, (const xmlChar*) "FRAMES")) 70
282 { 71 // Ensure ImageNet urls.txt exists
283 for (xmlNodePtr frame = node->xmlChildrenNode; frame != nullptr; frame = frame->next) 72 if (!std::ifstream(imageNetPath_))
284 { 73 {
285 if (!xmlStrcmp(frame->name, (const xmlChar*) "FRAME")) 74 throw std::invalid_argument("ImageNet urls.txt file not found");
286 {
287 std::list<framepart_t> f;
288
289 for (xmlNodePtr framenode = frame->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
290 {
291 if (!xmlStrcmp(framenode->name, (const xmlChar*) "SYNTAX"))
292 {
293 for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
294 {
295 framepart_t fp;
296
297 if (!xmlStrcmp(syntaxnode->name, (const xmlChar*) "NP"))
298 {
299 fp.type = framepart_t::type_t::np;
300
301 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
302 fp.role = (const char*)key;
303 xmlFree(key);
304
305 fp.selrestrs.type = selrestr_t::type_t::empty;
306
307 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
308 {
309 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SYNRESTRS"))
310 {
311 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
312 {
313 if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SYNRESTR"))
314 {
315 key = xmlGetProp(synrestr, (xmlChar*) "type");
316 fp.synrestrs.insert(std::string((const char*)key));
317 xmlFree(key);
318 }
319 }
320 }
321
322 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS"))
323 {
324 fp.selrestrs = parse_selrestrs(npnode, filename);
325 }
326 }
327 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "VERB"))
328 {
329 fp.type = framepart_t::type_t::v;
330 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "PREP"))
331 {
332 fp.type = framepart_t::type_t::pp;
333
334 if (xmlHasProp(syntaxnode, (xmlChar*) "value"))
335 {
336 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
337 std::string choices = (const char*)key;
338 xmlFree(key);
339
340 fp.choices = verbly::split<std::list<std::string>>(choices, " ");
341 }
342
343 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
344 {
345 if (!xmlStrcmp(npnode->name, (const xmlChar*) "SELRESTRS"))
346 {
347 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
348 {
349 if (!xmlStrcmp(synrestr->name, (const xmlChar*) "SELRESTR"))
350 {
351 key = xmlGetProp(synrestr, (xmlChar*) "type");
352 fp.preprestrs.insert(std::string((const char*)key));
353 xmlFree(key);
354 }
355 }
356 }
357 }
358 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADJ"))
359 {
360 fp.type = framepart_t::type_t::adj;
361 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "ADV"))
362 {
363 fp.type = framepart_t::type_t::adv;
364 } else if (!xmlStrcmp(syntaxnode->name, (xmlChar*) "LEX"))
365 {
366 fp.type = framepart_t::type_t::lex;
367
368 key = xmlGetProp(syntaxnode, (xmlChar*) "value");
369 fp.lexval = (const char*)key;
370 xmlFree(key);
371 } else {
372 continue;
373 }
374
375 f.push_back(fp);
376 }
377
378 g.frames.push_back(f);
379 }
380 }
381 }
382 } 75 }
383 } 76 }
384 }
385
386 groups[vnid] = g;
387
388 return groups[vnid];
389}
390
391int main(int argc, char** argv)
392{
393 if (argc != 7)
394 {
395 print_usage();
396 }
397
398 // VerbNet data
399 std::cout << "Reading verb frames..." << std::endl;
400
401 DIR* dir;
402 if ((dir = opendir(argv[1])) == nullptr)
403 {
404 std::cout << "Invalid VerbNet data directory." << std::endl;
405
406 print_usage();
407 }
408
409 struct dirent* ent;
410 while ((ent = readdir(dir)) != nullptr)
411 {
412 std::string filename(argv[1]);
413 if (filename.back() != '/')
414 {
415 filename += '/';
416 }
417 77
418 filename += ent->d_name; 78 void generator::run()
419 //std::cout << ent->d_name << std::endl;
420
421 if (filename.rfind(".xml") != filename.size() - 4)
422 {
423 continue;
424 }
425
426 xmlDocPtr doc = xmlParseFile(filename.c_str());
427 if (doc == nullptr)
428 {
429 std::cout << "Error opening " << filename << std::endl;
430 print_usage();
431 }
432
433 xmlNodePtr top = xmlDocGetRootElement(doc);
434 if ((top == nullptr) || (xmlStrcmp(top->name, (xmlChar*) "VNCLASS")))
435 {
436 std::cout << "Bad VerbNet file format: " << filename << std::endl;
437 print_usage();
438 }
439
440 parse_group(top, filename);
441 }
442
443 closedir(dir);
444
445 // Get verbs from AGID
446 std::cout << "Reading inflections..." << std::endl;
447
448 std::ifstream agidfile(argv[2]);
449 if (!agidfile.is_open())
450 {
451 std::cout << "Could not open AGID file: " << argv[2] << std::endl;
452 print_usage();
453 }
454
455 for (;;)
456 {
457 std::string line;
458 if (!getline(agidfile, line))
459 {
460 break;
461 }
462
463 if (line.back() == '\r')
464 { 79 {
465 line.pop_back(); 80 // Create notions, words, lemmas, and forms from WordNet synsets
466 } 81 readWordNetSynsets();
467 82
468 int divider = line.find_first_of(" "); 83 // Reads adjective positioning WordNet data
469 std::string word = line.substr(0, divider); 84 readAdjectivePositioning();
470 line = line.substr(divider+1); 85
471 char type = line[0]; 86 // Counts the number of URLs ImageNet has per notion
472 87 readImageNetUrls();
473 if (line[1] == '?') 88
474 { 89 // Creates a word by WordNet sense key lookup table
475 line.erase(0, 4); 90 readWordNetSenseKeys();
476 } else { 91
477 line.erase(0, 3); 92 // Creates groups and frames from VerbNet data
478 } 93 readVerbNet();
479 94
480 std::vector<std::string> forms; 95 // Creates forms and inflections from AGID. To reduce the amount of forms
481 while (!line.empty()) 96 // created, we do this after most lemmas that need inflecting have been
482 { 97 // created through other means, and then only generate forms for
483 std::string inflection; 98 // inflections of already-existing lemmas. The exception to this regards
484 if ((divider = line.find(" | ")) != std::string::npos) 99 // verb lemmas. If a verb lemma in AGID either does not exist yet, or does
485 { 100 // exist but is not related to any words that are related to verb notions,
486 inflection = line.substr(0, divider); 101 // then a notion and a word is generated and the form generation proceeds
487 line = line.substr(divider + 3); 102 // as usual.
488 } else { 103 readAgidInflections();
489 inflection = line; 104
490 line = ""; 105 // Reads in prepositions and the is_a relationship
491 } 106 readPrepositions();
492 107
493 if ((divider = inflection.find_first_of(",?")) != std::string::npos) 108 // Creates pronunciations from CMUDICT. To reduce the amount of
494 { 109 // pronunciations created, we do this after all forms have been created,
495 inflection = inflection.substr(0, divider); 110 // and then only generate pronunciations for already-exisiting forms.
496 } 111 readCmudictPronunciations();
497 112
498 forms.push_back(inflection); 113 // Writes the database schema
114 writeSchema();
115
116 // Dumps data to the database
117 dumpObjects();
118
119 // Populates the antonymy relationship from WordNet
120 readWordNetAntonymy();
121
122 // Populates the variation relationship from WordNet
123 readWordNetVariation();
124
125 // Populates the usage, topicality, and regionality relationships from
126 // WordNet
127 readWordNetClasses();
128
129 // Populates the causality relationship from WordNet
130 readWordNetCausality();
131
132 // Populates the entailment relationship from WordNet
133 readWordNetEntailment();
134
135 // Populates the hypernymy relationship from WordNet
136 readWordNetHypernymy();
137
138 // Populates the instantiation relationship from WordNet
139 readWordNetInstantiation();
140
141 // Populates the member meronymy relationship from WordNet
142 readWordNetMemberMeronymy();
143
144 // Populates the part meronymy relationship from WordNet
145 readWordNetPartMeronymy();
146
147 // Populates the substance meronymy relationship from WordNet
148 readWordNetSubstanceMeronymy();
149
150 // Populates the pertainymy and mannernymy relationships from WordNet
151 readWordNetPertainymy();
152
153 // Populates the specification relationship from WordNet
154 readWordNetSpecification();
155
156 // Populates the adjective similarity relationship from WordNet
157 readWordNetSimilarity();
158
159
160
161
162
163
164
165
499 } 166 }
500 167
501 switch (type) 168 void generator::readWordNetSynsets()
502 { 169 {
503 case 'V': 170 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
171 progress ppgs("Reading synsets from WordNet...", lines.size());
172
173 for (std::string line : lines)
504 { 174 {
505 verb_t v; 175 ppgs.update();
506 v.infinitive = word; 176
507 if (forms.size() == 4) 177 std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");
508 { 178 std::smatch relation_data;
509 v.past_tense = forms[0]; 179 if (!std::regex_search(line, relation_data, relation))
510 v.past_participle = forms[1]; 180 {
511 v.ing_form = forms[2]; 181 continue;
512 v.s_form = forms[3];
513 } else if (forms.size() == 3)
514 {
515 v.past_tense = forms[0];
516 v.past_participle = forms[0];
517 v.ing_form = forms[1];
518 v.s_form = forms[2];
519 } else if (forms.size() == 8)
520 {
521 // As of AGID 2014.08.11, this is only "to be"
522 v.past_tense = forms[0];
523 v.past_participle = forms[2];
524 v.ing_form = forms[3];
525 v.s_form = forms[4];
526 } else {
527 // Words that don't fit the cases above as of AGID 2014.08.11:
528 // - may and shall do not conjugate the way we want them to
529 // - methinks only has a past tense and is an outlier
530 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
531 std::cout << "Ignoring verb \"" << word << "\" due to non-standard number of forms." << std::endl;
532 } 182 }
533 183
534 verbs[word] = v; 184 int synset_id = std::stoi(relation_data[1]);
535 185 int wnum = std::stoi(relation_data[2]);
536 break; 186 std::string text = relation_data[3];
537 } 187 int tag_count = std::stoi(relation_data[4]);
538 188 size_t word_it;
539 case 'A': 189 while ((word_it = text.find("''")) != std::string::npos)
540 {
541 adjective_t adj;
542 adj.base = word;
543 if (forms.size() == 2)
544 { 190 {
545 adj.comparative = forms[0]; 191 text.erase(word_it, 1);
546 adj.superlative = forms[1];
547 } else {
548 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
549 std::cout << "Ignoring adjective/adverb \"" << word << "\" due to non-standard number of forms." << std::endl;
550 } 192 }
551 193
552 adjectives[word] = adj; 194 // The WordNet data does contain duplicates, so we need to check that we
553 195 // haven't already created this word.
554 break; 196 std::pair<int, int> lookup(synset_id, wnum);
555 } 197 if (!wordByWnidAndWnum_.count(lookup))
556
557 case 'N':
558 {
559 noun_t n;
560 n.singular = word;
561 if (forms.size() == 1)
562 { 198 {
563 n.plural = forms[0]; 199 notion& synset = lookupOrCreateNotion(synset_id);
564 } else { 200 lemma& lex = lookupOrCreateLemma(text);
565 // As of AGID 2014.08.11, this is non-existent. 201 word& entry = createWord(synset, lex, tag_count);
566 std::cout << "Ignoring noun \"" << word << "\" due to non-standard number of forms." << std::endl; 202
203 wordByWnidAndWnum_[lookup] = &entry;
567 } 204 }
568
569 nouns[word] = n;
570
571 break;
572 } 205 }
573 } 206 }
574 }
575
576 // Pronounciations
577 std::cout << "Reading pronunciations..." << std::endl;
578
579 std::ifstream pronfile(argv[4]);
580 if (!pronfile.is_open())
581 {
582 std::cout << "Could not open CMUDICT file: " << argv[4] << std::endl;
583 print_usage();
584 }
585
586 for (;;)
587 {
588 std::string line;
589 if (!getline(pronfile, line))
590 {
591 break;
592 }
593
594 if (line.back() == '\r')
595 {
596 line.pop_back();
597 }
598 207
599 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); 208 void generator::readAdjectivePositioning()
600 std::smatch phoneme_data;
601 if (std::regex_search(line, phoneme_data, phoneme))
602 { 209 {
603 std::string canonical(phoneme_data[1]); 210 std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl"));
604 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower); 211 progress ppgs("Reading adjective positionings from WordNet...", lines.size());
605
606 std::string phonemes = phoneme_data[2];
607 auto phoneme_set = verbly::split<std::list<std::string>>(phonemes, " ");
608 auto phemstrt = std::find_if(std::begin(phoneme_set), std::end(phoneme_set), [] (std::string phoneme) {
609 return phoneme.find("1") != std::string::npos;
610 });
611 212
612 pronunciation_t p; 213 for (std::string line : lines)
613 p.phonemes = phonemes;
614
615 // Rhyme detection
616 if (phemstrt != std::end(phoneme_set))
617 { 214 {
618 std::stringstream rhymer; 215 ppgs.update();
619 for (auto it = phemstrt; it != std::end(phoneme_set); it++)
620 {
621 std::string naked;
622 std::remove_copy_if(std::begin(*it), std::end(*it), std::back_inserter(naked), [] (char ch) {
623 return isdigit(ch);
624 });
625
626 if (it != phemstrt)
627 {
628 rhymer << " ";
629 }
630
631 rhymer << naked;
632 }
633 216
634 p.rhyme = rhymer.str(); 217 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\.");
635 218 std::smatch relation_data;
636 if (phemstrt != std::begin(phoneme_set)) 219 if (!std::regex_search(line, relation_data, relation))
637 { 220 {
638 phemstrt--; 221 continue;
639 p.prerhyme = *phemstrt;
640 } else {
641 p.prerhyme = "";
642 } 222 }
643 } else {
644 p.prerhyme = "";
645 p.rhyme = "";
646 }
647 223
648 // Syllable/stress 224 int synset_id = stoi(relation_data[1]);
649 for (auto phm : phoneme_set) 225 int wnum = stoi(relation_data[2]);
650 { 226 std::string adjpos_str = relation_data[3];
651 if (isdigit(phm.back()))
652 {
653 // It's a vowel!
654 p.syllables++;
655 227
656 if (phm.back() == '1') 228 std::pair<int, int> lookup(synset_id, wnum);
229 if (wordByWnidAndWnum_.count(lookup))
230 {
231 word& adj = *wordByWnidAndWnum_.at(lookup);
232
233 if (adjpos_str == "p")
234 {
235 adj.setAdjectivePosition(positioning::predicate);
236 } else if (adjpos_str == "a")
237 {
238 adj.setAdjectivePosition(positioning::attributive);
239 } else if (adjpos_str == "i")
657 { 240 {
658 p.stress.push_back('1'); 241 adj.setAdjectivePosition(positioning::postnominal);
659 } else { 242 } else {
660 p.stress.push_back('0'); 243 // Can't happen because of how we specified the regex.
244 assert(false);
661 } 245 }
662 } 246 }
663 } 247 }
664
665 pronunciations[canonical].insert(p);
666 }
667 }
668
669 // Images
670 std::cout << "Reading images..." << std::endl;
671
672 std::ifstream imagefile(argv[5]);
673 if (!imagefile.is_open())
674 {
675 std::cout << "Could not open ImageNet file: " << argv[5] << std::endl;
676 print_usage();
677 }
678
679 for (;;)
680 {
681 std::string line;
682 if (!getline(imagefile, line))
683 {
684 break;
685 }
686
687 if (line.back() == '\r')
688 {
689 line.pop_back();
690 }
691
692 std::string wnid_s = line.substr(1, 8);
693 int wnid = stoi(wnid_s) + 100000000;
694 images[wnid]++;
695 }
696
697 imagefile.close();
698
699 // Start writing output
700 std::cout << "Writing schema..." << std::endl;
701
702 sqlite3* ppdb;
703 if (sqlite3_open_v2(argv[6], &ppdb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, NULL) != SQLITE_OK)
704 {
705 std::cout << "Error opening output datafile: " << sqlite3_errmsg(ppdb) << std::endl;
706 print_usage();
707 }
708
709 std::ifstream schemafile("schema.sql");
710 if (!schemafile.is_open())
711 {
712 std::cout << "Could not find schema file" << std::endl;
713 print_usage();
714 }
715
716 std::stringstream schemabuilder;
717 for (;;)
718 {
719 std::string line;
720 if (!getline(schemafile, line))
721 {
722 break;
723 }
724
725 if (line.back() == '\r')
726 {
727 line.pop_back();
728 }
729
730 schemabuilder << line << std::endl;
731 }
732
733 std::string schema = schemabuilder.str();
734 while (!schema.empty())
735 {
736 std::string query;
737 int divider = schema.find(";");
738 if (divider != std::string::npos)
739 {
740 query = schema.substr(0, divider+1);
741 schema = schema.substr(divider+2);
742 } else {
743 break;
744 } 248 }
745 249
746 sqlite3_stmt* schmstmt; 250 void generator::readImageNetUrls()
747 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &schmstmt, NULL) != SQLITE_OK)
748 { 251 {
749 db_error(ppdb, query); 252 // The ImageNet datafile is so large that it is unreasonable and
750 } 253 // unnecessary to read it into memory; instead, we will parse each line as
751 254 // we read it. This has the caveat that we cannot display a progress bar.
752 if (sqlite3_step(schmstmt) != SQLITE_DONE) 255 std::cout << "Reading image counts from ImageNet..." << std::endl;
753 {
754 db_error(ppdb, query);
755 }
756
757 sqlite3_finalize(schmstmt);
758 }
759
760 std::cout << "Writing prepositions..." << std::endl;
761 std::ifstream prepfile("prepositions.txt");
762 if (!prepfile.is_open())
763 {
764 std::cout << "Could not find prepositions file" << std::endl;
765 print_usage();
766 }
767
768 for (;;)
769 {
770 std::string line;
771 if (!getline(prepfile, line))
772 {
773 break;
774 }
775
776 if (line.back() == '\r')
777 {
778 line.pop_back();
779 }
780
781 std::regex relation("^([^:]+): (.+)");
782 std::smatch relation_data;
783 std::regex_search(line, relation_data, relation);
784 std::string prep = relation_data[1];
785 std::list<std::string> groups = verbly::split<std::list<std::string>>(relation_data[2], ", ");
786
787 std::string query("INSERT INTO prepositions (form) VALUES (?)");
788 sqlite3_stmt* ppstmt;
789
790 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
791 {
792 db_error(ppdb, query);
793 }
794
795 sqlite3_bind_text(ppstmt, 1, prep.c_str(), prep.length(), SQLITE_TRANSIENT);
796
797 if (sqlite3_step(ppstmt) != SQLITE_DONE)
798 {
799 db_error(ppdb, query);
800 }
801
802 sqlite3_finalize(ppstmt);
803
804 query = "SELECT last_insert_rowid()";
805 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
806 {
807 db_error(ppdb, query);
808 }
809
810 if (sqlite3_step(ppstmt) != SQLITE_ROW)
811 {
812 db_error(ppdb, query);
813 }
814
815 int rowid = sqlite3_column_int(ppstmt, 0);
816 sqlite3_finalize(ppstmt);
817
818 for (auto group : groups)
819 {
820 query = "INSERT INTO preposition_groups (preposition_id, groupname) VALUES (?, ?)";
821 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
822 {
823 db_error(ppdb, query);
824 }
825 256
826 sqlite3_bind_int(ppstmt, 1, rowid); 257 std::ifstream file(imageNetPath_);
827 sqlite3_bind_text(ppstmt, 2, group.c_str(), group.length(), SQLITE_TRANSIENT); 258 if (!file)
828
829 if (sqlite3_step(ppstmt) != SQLITE_DONE)
830 { 259 {
831 db_error(ppdb, query); 260 throw std::invalid_argument("Could not find file " + imageNetPath_);
832 } 261 }
833
834 sqlite3_finalize(ppstmt);
835 }
836 }
837
838 262
839 { 263 std::string line;
840 progress ppgs("Writing verbs...", verbs.size()); 264 while (std::getline(file, line))
841 for (auto& mapping : verbs)
842 {
843 sqlite3_stmt* ppstmt;
844 std::string query("INSERT INTO verbs (infinitive, past_tense, past_participle, ing_form, s_form) VALUES (?, ?, ?, ?, ?)");
845 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
846 {
847 db_error(ppdb, query);
848 }
849
850 sqlite3_bind_text(ppstmt, 1, mapping.second.infinitive.c_str(), mapping.second.infinitive.length(), SQLITE_TRANSIENT);
851 sqlite3_bind_text(ppstmt, 2, mapping.second.past_tense.c_str(), mapping.second.past_tense.length(), SQLITE_TRANSIENT);
852 sqlite3_bind_text(ppstmt, 3, mapping.second.past_participle.c_str(), mapping.second.past_participle.length(), SQLITE_TRANSIENT);
853 sqlite3_bind_text(ppstmt, 4, mapping.second.ing_form.c_str(), mapping.second.ing_form.length(), SQLITE_TRANSIENT);
854 sqlite3_bind_text(ppstmt, 5, mapping.second.s_form.c_str(), mapping.second.s_form.length(), SQLITE_TRANSIENT);
855
856 if (sqlite3_step(ppstmt) != SQLITE_DONE)
857 {
858 db_error(ppdb, query);
859 }
860
861 sqlite3_finalize(ppstmt);
862
863 std::string canonical(mapping.second.infinitive);
864 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
865 if (pronunciations.count(canonical) == 1)
866 { 265 {
867 query = "SELECT last_insert_rowid()"; 266 if (line.back() == '\r')
868 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
869 { 267 {
870 db_error(ppdb, query); 268 line.pop_back();
871 } 269 }
872 270
873 if (sqlite3_step(ppstmt) != SQLITE_ROW) 271 std::string wnid_s = line.substr(1, 8);
272 int wnid = stoi(wnid_s) + 100000000;
273 if (notionByWnid_.count(wnid))
874 { 274 {
875 db_error(ppdb, query); 275 // We know that this notion has a wnid and is a noun.
876 } 276 notionByWnid_.at(wnid)->incrementNumOfImages();
877
878 int rowid = sqlite3_column_int(ppstmt, 0);
879
880 sqlite3_finalize(ppstmt);
881
882 mapping.second.id = rowid;
883
884 for (auto pronunciation : pronunciations[canonical])
885 {
886 if (!pronunciation.rhyme.empty())
887 {
888 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)";
889 } else {
890 query = "INSERT INTO verb_pronunciations (verb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)";
891 }
892
893 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
894 {
895 db_error(ppdb, query);
896 }
897
898 sqlite3_bind_int(ppstmt, 1, rowid);
899 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
900 sqlite3_bind_int(ppstmt, 3, pronunciation.syllables);
901 sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT);
902
903 if (!pronunciation.rhyme.empty())
904 {
905 sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
906 sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
907 }
908
909 if (sqlite3_step(ppstmt) != SQLITE_DONE)
910 {
911 db_error(ppdb, query);
912 }
913
914 sqlite3_finalize(ppstmt);
915 } 277 }
916 } 278 }
917
918 ppgs.update();
919 } 279 }
920 } 280
921 281 void generator::readWordNetSenseKeys()
922 {
923 progress ppgs("Writing verb frames...", groups.size());
924 for (auto& mapping : groups)
925 { 282 {
926 std::list<json> roledatal; 283 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl"));
927 std::transform(std::begin(mapping.second.roles), std::end(mapping.second.roles), std::back_inserter(roledatal), [] (std::pair<std::string, selrestr_t> r) { 284 progress ppgs("Reading sense keys from WordNet...", lines.size());
928 json role;
929 role["type"] = r.first;
930 role["selrestrs"] = export_selrestrs(r.second);
931
932 return role;
933 });
934
935 json roledata(roledatal);
936 std::string rdm = roledata.dump();
937
938 sqlite3_stmt* ppstmt;
939 std::string query("INSERT INTO groups (data) VALUES (?)");
940 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
941 {
942 db_error(ppdb, query);
943 }
944
945 sqlite3_bind_blob(ppstmt, 1, rdm.c_str(), rdm.size(), SQLITE_TRANSIENT);
946
947 if (sqlite3_step(ppstmt) != SQLITE_DONE)
948 {
949 db_error(ppdb, query);
950 }
951 285
952 sqlite3_finalize(ppstmt); 286 for (std::string line : lines)
953
954 query = "SELECT last_insert_rowid()";
955 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
956 {
957 db_error(ppdb, query);
958 }
959
960 if (sqlite3_step(ppstmt) != SQLITE_ROW)
961 {
962 db_error(ppdb, query);
963 }
964
965 int gid = sqlite3_column_int(ppstmt, 0);
966 sqlite3_finalize(ppstmt);
967
968 for (auto frame : mapping.second.frames)
969 { 287 {
970 std::list<json> fdatap; 288 ppgs.update();
971 std::transform(std::begin(frame), std::end(frame), std::back_inserter(fdatap), [] (framepart_t& fp) {
972 json part;
973
974 switch (fp.type)
975 {
976 case framepart_t::type_t::np:
977 {
978 part["type"] = "np";
979 part["role"] = fp.role;
980 part["selrestrs"] = export_selrestrs(fp.selrestrs);
981 part["synrestrs"] = fp.synrestrs;
982
983 break;
984 }
985
986 case framepart_t::type_t::pp:
987 {
988 part["type"] = "pp";
989 part["values"] = fp.choices;
990 part["preprestrs"] = fp.preprestrs;
991
992 break;
993 }
994
995 case framepart_t::type_t::v:
996 {
997 part["type"] = "v";
998
999 break;
1000 }
1001
1002 case framepart_t::type_t::adj:
1003 {
1004 part["type"] = "adj";
1005
1006 break;
1007 }
1008
1009 case framepart_t::type_t::adv:
1010 {
1011 part["type"] = "adv";
1012
1013 break;
1014 }
1015
1016 case framepart_t::type_t::lex:
1017 {
1018 part["type"] = "lex";
1019 part["value"] = fp.lexval;
1020
1021 break;
1022 }
1023 }
1024
1025 return part;
1026 });
1027
1028 json fdata(fdatap);
1029 std::string marshall = fdata.dump();
1030
1031 query = "INSERT INTO frames (group_id, data) VALUES (?, ?)";
1032 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1033 {
1034 db_error(ppdb, query);
1035 }
1036
1037 sqlite3_bind_int(ppstmt, 1, gid);
1038 sqlite3_bind_blob(ppstmt, 2, marshall.c_str(), marshall.length(), SQLITE_TRANSIENT);
1039 289
1040 if (sqlite3_step(ppstmt) != SQLITE_DONE) 290 // We only actually need to lookup verbs by sense key so we'll just
291 // ignore everything that isn't a verb.
292 std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$");
293 std::smatch relation_data;
294 if (!std::regex_search(line, relation_data, relation))
1041 { 295 {
1042 db_error(ppdb, query); 296 continue;
1043 } 297 }
298
299 int synset_id = stoi(relation_data[1]);
300 int wnum = stoi(relation_data[2]);
301 std::string sense_key = relation_data[3];
1044 302
1045 sqlite3_finalize(ppstmt); 303 // We are treating this mapping as injective, which is not entirely
1046 } 304 // accurate. First, the WordNet table contains duplicate rows, so those
1047 305 // need to be ignored. More importantly, a small number of sense keys
1048 for (auto member : mapping.second.members) 306 // (one for each letter of the Latin alphabet, plus 9 other words) each
1049 { 307 // map to two different words in the same synset which differ only by
1050 if (verbs.count(member) == 1) 308 // capitalization. Luckily, none of these exceptions are verbs, so we
309 // can pretend that the mapping is injective.
310 if (!wnSenseKeys_.count(sense_key))
1051 { 311 {
1052 auto& v = verbs[member]; 312 std::pair<int, int> lookup(synset_id, wnum);
1053 313 if (wordByWnidAndWnum_.count(lookup))
1054 query = "INSERT INTO verb_groups (verb_id, group_id) VALUES (?, ?)";
1055 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1056 {
1057 db_error(ppdb, query);
1058 }
1059
1060 sqlite3_bind_int(ppstmt, 1, v.id);
1061 sqlite3_bind_int(ppstmt, 2, gid);
1062
1063 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1064 { 314 {
1065 db_error(ppdb, query); 315 wnSenseKeys_[sense_key] = wordByWnidAndWnum_.at(lookup);
1066 } 316 }
1067
1068 sqlite3_finalize(ppstmt);
1069 } 317 }
1070 } 318 }
1071
1072 ppgs.update();
1073 } 319 }
1074 } 320
1075 321 void generator::readVerbNet()
1076 // Get nouns/adjectives/adverbs from WordNet
1077 // Useful relations:
1078 // - s: master list
1079 // - ant: antonymy (e.g. happy/sad, sad/happy, happiness/sadness)
1080 // - at: variation (e.g. a measurement can be standard or nonstandard)
1081 // - der: derivation (e.g. happy/happily, happily/happy)
1082 // - hyp: hypernymy/hyponymy (e.g. color/red, color/blue)
1083 // - ins: instantiation (do we need this? let's see)
1084 // - mm: member meronymy/holonymy (e.g. family/mother, family/child)
1085 // - mp: part meronymy/holonymy (e.g. wheel/spoke, wheel/tire)
1086 // - ms: substance meronymy/holonymy (e.g. tire/rubber, doorstop/rubber)
1087 // - per: pertainymy (e.g. something that is Alaskan pertains to Alaska)
1088 // mannernymy (e.g. something done quickly is done in a manner that is quick)
1089 // - sa: specification (e.g. inaccurate (general) can mean imprecise or incorrect (specific))
1090 // - sim: synonymy (e.g. cheerful/happy, happy/cheerful)
1091 // - syntax: positioning flags for some adjectives
1092 std::string wnpref {argv[3]};
1093 if (wnpref.back() != '/')
1094 {
1095 wnpref += '/';
1096 }
1097
1098 // s table
1099 {
1100 std::ifstream wnsfile(wnpref + "wn_s.pl");
1101 if (!wnsfile.is_open())
1102 { 322 {
1103 std::cout << "Invalid WordNet data directory." << std::endl; 323 std::cout << "Reading frames from VerbNet..." << std::endl;
1104 print_usage();
1105 }
1106 324
1107 std::list<std::string> lines; 325 DIR* dir;
1108 for (;;) 326 if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
1109 {
1110 std::string line;
1111 if (!getline(wnsfile, line))
1112 { 327 {
1113 break; 328 throw std::invalid_argument("Invalid VerbNet data directory");
1114 } 329 }
1115 330
1116 if (line.back() == '\r') 331 struct dirent* ent;
1117 { 332 while ((ent = readdir(dir)) != nullptr)
1118 line.pop_back();
1119 }
1120
1121 lines.push_back(line);
1122 }
1123
1124 progress ppgs("Writing nouns, adjectives, and adverbs...", lines.size());
1125 for (auto line : lines)
1126 {
1127 ppgs.update();
1128
1129 std::regex relation("^s\\(([134]\\d{8}),(\\d+),'(.+)',\\w,\\d+,\\d+\\)\\.$");
1130 std::smatch relation_data;
1131 if (!std::regex_search(line, relation_data, relation))
1132 { 333 {
1133 continue; 334 std::string filename(verbNetPath_);
1134 } 335
336 if (filename.back() != '/')
337 {
338 filename += '/';
339 }
1135 340
1136 int synset_id = stoi(relation_data[1]); 341 filename += ent->d_name;
1137 int wnum = stoi(relation_data[2]);
1138 std::string word = relation_data[3];
1139 size_t word_it;
1140 while ((word_it = word.find("''")) != std::string::npos)
1141 {
1142 word.erase(word_it, 1);
1143 }
1144 342
1145 std::string query; 343 if (filename.rfind(".xml") != filename.size() - 4)
1146 switch (synset_id / 100000000)
1147 {
1148 case 1: // Noun
1149 { 344 {
1150 if (nouns.count(word) == 1) 345 continue;
1151 {
1152 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid, plural) VALUES (?, ?, ?, ?, ?, ?)";
1153 } else {
1154 query = "INSERT INTO nouns (singular, proper, complexity, images, wnid) VALUES (?, ?, ?, ?, ?)";
1155 }
1156
1157 break;
1158 } 346 }
1159 347
1160 case 2: // Verb 348 xmlDocPtr doc = xmlParseFile(filename.c_str());
349 if (doc == nullptr)
1161 { 350 {
1162 // Ignore 351 throw std::logic_error("Error opening " + filename);
1163
1164 break;
1165 } 352 }
1166 353
1167 case 3: // Adjective 354 xmlNodePtr top = xmlDocGetRootElement(doc);
355 if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS"))))
1168 { 356 {
1169 if (adjectives.count(word) == 1) 357 throw std::logic_error("Bad VerbNet file format: " + filename);
1170 {
1171 query = "INSERT INTO adjectives (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)";
1172 } else {
1173 query = "INSERT INTO adjectives (base_form, complexity) VALUES (?, ?)";
1174 }
1175
1176 break;
1177 } 358 }
1178 359
1179 case 4: // Adverb 360 try
1180 { 361 {
1181 if (adjectives.count(word) == 1) 362 createGroup(top);
1182 { 363 } catch (const std::exception& e)
1183 query = "INSERT INTO adverbs (base_form, complexity, comparative, superlative) VALUES (?, ?, ?, ?)"; 364 {
1184 } else { 365 std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename));
1185 query = "INSERT INTO adverbs (base_form, complexity) VALUES (?, ?)";
1186 }
1187
1188 break;
1189 } 366 }
1190 } 367 }
368
369 closedir(dir);
370 }
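
readVerbNet() and createGroup() both report parse failures with std::throw_with_nested, so the original cause survives alongside the name of the file or subgroup being processed. As a minimal sketch (the printNested helper below is hypothetical, not part of the generator), a caller could unwind such a chain like this:

#include <exception>
#include <iostream>
#include <string>

// Recursively print an exception together with every exception that was
// attached to it via std::throw_with_nested.
void printNested(const std::exception& e, int depth = 0)
{
  std::cerr << std::string(depth * 2, ' ') << e.what() << std::endl;

  try
  {
    std::rethrow_if_nested(e);
  } catch (const std::exception& inner)
  {
    printNested(inner, depth + 1);
  }
}

Run against a failure deep inside a VerbNet file, this prints the outer "Error parsing VerbNet file: ..." message first, followed by each nested cause, indented one level further per layer.
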
1191 371
1192 sqlite3_stmt* ppstmt; 372 void generator::readAgidInflections()
1193 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK) 373 {
374 std::list<std::string> lines(readFile(agidPath_));
375 progress ppgs("Reading inflections from AGID...", lines.size());
376
377 for (std::string line : lines)
1194 { 378 {
1195 db_error(ppdb, query); 379 ppgs.update();
1196 } 380
381 int divider = line.find_first_of(" ");
382 std::string infinitive = line.substr(0, divider);
383 line = line.substr(divider+1);
384 char type = line[0];
1197 385
1198 sqlite3_bind_text(ppstmt, 1, word.c_str(), word.length(), SQLITE_TRANSIENT); 386 if (line[1] == '?')
1199 switch (synset_id / 100000000)
1200 {
1201 case 1: // Noun
1202 { 387 {
1203 sqlite3_bind_int(ppstmt, 2, (std::any_of(std::begin(word), std::end(word), [] (char ch) { 388 line.erase(0, 4);
1204 return isupper(ch); 389 } else {
1205 }) ? 1 : 0)); 390 line.erase(0, 3);
1206
1207 sqlite3_bind_int(ppstmt, 3, verbly::split<std::list<std::string>>(word, " ").size());
1208 sqlite3_bind_int(ppstmt, 4, images[synset_id]);
1209 sqlite3_bind_int(ppstmt, 5, synset_id);
1210
1211 if (nouns.count(word) == 1)
1212 {
1213 sqlite3_bind_text(ppstmt, 6, nouns[word].plural.c_str(), nouns[word].plural.length(), SQLITE_TRANSIENT);
1214 }
1215
1216 break;
1217 } 391 }
1218 392
1219 case 3: // Adjective 393 if (!lemmaByBaseForm_.count(infinitive) && (type != 'V'))
1220 case 4: // Adverb
1221 { 394 {
1222 sqlite3_bind_int(ppstmt, 2, verbly::split<std::list<std::string>>(word, " ").size()); 395 continue;
1223 396 }
1224 if (adjectives.count(word) == 1) 397
398 lemma& curLemma = lookupOrCreateLemma(infinitive);
399
400 auto forms = split<std::vector<std::string>>(line, " | ");
401 for (std::string& inflForm : forms)
402 {
403 int sympos = inflForm.find_first_of(",?");
404 if (sympos != std::string::npos)
1225 { 405 {
1226 sqlite3_bind_text(ppstmt, 3, adjectives[word].comparative.c_str(), adjectives[word].comparative.length(), SQLITE_TRANSIENT); 406 inflForm = inflForm.substr(0, sympos);
1227 sqlite3_bind_text(ppstmt, 4, adjectives[word].superlative.c_str(), adjectives[word].superlative.length(), SQLITE_TRANSIENT);
1228 } 407 }
1229
1230 break;
1231 } 408 }
1232 }
1233 409
1234 if (sqlite3_step(ppstmt) != SQLITE_DONE) 410 switch (type)
1235 {
1236 db_error(ppdb, query);
1237 }
1238
1239 sqlite3_finalize(ppstmt);
1240
1241 query = "SELECT last_insert_rowid()";
1242 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1243 {
1244 db_error(ppdb, query);
1245 }
1246
1247 if (sqlite3_step(ppstmt) != SQLITE_ROW)
1248 {
1249 db_error(ppdb, query);
1250 }
1251
1252 int rowid = sqlite3_column_int(ppstmt, 0);
1253 wn[synset_id][wnum] = rowid;
1254
1255 sqlite3_finalize(ppstmt);
1256
1257 std::string canonical(word);
1258 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
1259 if (pronunciations.count(canonical) == 1)
1260 {
1261 for (auto pronunciation : pronunciations[canonical])
1262 { 411 {
1263 switch (synset_id / 100000000) 412 case 'V':
1264 { 413 {
1265 case 1: // Noun 414 if (forms.size() == 4)
1266 { 415 {
1267 if (!pronunciation.rhyme.empty()) 416 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
1268 { 417 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1]));
1269 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; 418 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2]));
1270 } else { 419 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3]));
1271 query = "INSERT INTO noun_pronunciations (noun_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; 420 } else if (forms.size() == 3)
1272 }
1273
1274 break;
1275 }
1276
1277 case 3: // Adjective
1278 { 421 {
1279 if (!pronunciation.rhyme.empty()) 422 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
1280 { 423 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0]));
1281 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)"; 424 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1]));
1282 } else { 425 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2]));
1283 query = "INSERT INTO adjective_pronunciations (adjective_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)"; 426 } else if (forms.size() == 8)
1284 } 427 {
1285 428 // As of AGID 2014.08.11, this is only "to be"
1286 break; 429 curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
430 curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2]));
431 curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3]));
432 curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4]));
433 } else {
434 // Words that don't fit the cases above as of AGID 2014.08.11:
435 // - may and shall do not conjugate the way we want them to
436 // - methinks only has a past tense and is an outlier
437 // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
438 std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1287 } 439 }
1288 440
1289 case 4: // Adverb 441 // For verbs in particular, we sometimes create a notion and a word
442 // from inflection data. Specifically, if there are not yet any
443 // verbs existing that have the same infinitive form. "Yet" means
444 // that this verb appears in the AGID data but not in either WordNet
445 // or VerbNet.
446 if (!wordsByBaseForm_.count(infinitive)
447 || !std::any_of(std::begin(wordsByBaseForm_.at(infinitive)), std::end(wordsByBaseForm_.at(infinitive)), [] (word* w) {
448 return w->getNotion().getPartOfSpeech() == part_of_speech::verb;
449 }))
1290 { 450 {
1291 if (!pronunciation.rhyme.empty()) 451 notion& n = createNotion(part_of_speech::verb);
1292 { 452 createWord(n, curLemma);
1293 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress, prerhyme, rhyme) VALUES (?, ?, ?, ?, ?, ?)";
1294 } else {
1295 query = "INSERT INTO adverb_pronunciations (adverb_id, pronunciation, syllables, stress) VALUES (?, ?, ?, ?)";
1296 }
1297
1298 break;
1299 } 453 }
1300 }
1301
1302 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1303 {
1304 db_error(ppdb, query);
1305 }
1306
1307 sqlite3_bind_int(ppstmt, 1, rowid);
1308 sqlite3_bind_text(ppstmt, 2, pronunciation.phonemes.c_str(), pronunciation.phonemes.length(), SQLITE_TRANSIENT);
1309 sqlite3_bind_int(ppstmt, 3, pronunciation.syllables);
1310 sqlite3_bind_text(ppstmt, 4, pronunciation.stress.c_str(), pronunciation.stress.length(), SQLITE_TRANSIENT);
1311
1312 if (!pronunciation.rhyme.empty())
1313 {
1314 sqlite3_bind_text(ppstmt, 5, pronunciation.prerhyme.c_str(), pronunciation.prerhyme.length(), SQLITE_TRANSIENT);
1315 sqlite3_bind_text(ppstmt, 6, pronunciation.rhyme.c_str(), pronunciation.rhyme.length(), SQLITE_TRANSIENT);
1316 }
1317 454
1318 if (sqlite3_step(ppstmt) != SQLITE_DONE) 455 break;
1319 {
1320 db_error(ppdb, query);
1321 } 456 }
1322
1323 sqlite3_finalize(ppstmt);
1324 }
1325 }
1326 }
1327 }
1328
1329 // While we're working on s
1330 {
1331 progress ppgs("Writing word synonyms...", wn.size());
1332 for (auto sense : wn)
1333 {
1334 ppgs.update();
1335 457
1336 for (auto word1 : sense.second) 458 case 'A':
1337 {
1338 for (auto word2 : sense.second)
1339 {
1340 if (word1 != word2)
1341 { 459 {
1342 std::string query; 460 if (forms.size() == 2)
1343 switch (sense.first / 100000000)
1344 { 461 {
1345 case 1: // Noun 462 curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0]));
1346 { 463 curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1]));
1347 query = "INSERT INTO noun_synonymy (noun_1_id, noun_2_id) VALUES (?, ?)"; 464 } else {
1348 465 // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
1349 break; 466 std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1350 } 467 }
1351
1352 case 2: // Verb
1353 {
1354 // Ignore
1355
1356 break;
1357 }
1358
1359 case 3: // Adjective
1360 {
1361 query = "INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
1362 468
1363 break; 469 break;
1364 } 470 }
1365 471
1366 case 4: // Adverb 472 case 'N':
1367 { 473 {
1368 query = "INSERT INTO adverb_synonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)"; 474 if (forms.size() == 1)
1369
1370 break;
1371 }
1372 }
1373
1374 sqlite3_stmt* ppstmt;
1375 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1376 {
1377 db_error(ppdb, query);
1378 }
1379
1380 sqlite3_bind_int(ppstmt, 1, word1.second);
1381 sqlite3_bind_int(ppstmt, 2, word2.second);
1382
1383 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1384 { 475 {
1385 db_error(ppdb, query); 476 curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0]));
477 } else {
478 // As of AGID 2014.08.11, this is non-existent.
479 std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
1386 } 480 }
1387 481
1388 sqlite3_finalize(ppstmt); 482 break;
1389 } 483 }
1390 } 484 }
1391 } 485 }
1392 } 486 }
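
AGID is line-oriented: a base form, a part-of-speech letter (V, A, or N, optionally followed by '?'), and the inflected forms separated by " | ", where each slot may carry ',' alternatives or '?' uncertainty markers that the generator trims away. A rough, self-contained sketch of that parse, using a made-up example line rather than a real AGID entry:

#include <iostream>
#include <string>
#include <vector>

int main()
{
  // Hypothetical AGID-style entry; real entries follow the same layout.
  std::string line = "abide V: abode, abided | abode, abided | abiding | abides";

  size_t divider = line.find_first_of(" ");
  std::string infinitive = line.substr(0, divider);   // "abide"
  line = line.substr(divider + 1);
  char type = line[0];                                // 'V', 'A', or 'N'
  line.erase(0, (line[1] == '?') ? 4 : 3);            // drop "V: " or "V?: "

  // Split the remainder on " | "; everything after ',' or '?' in a slot
  // is an alternative or annotation and gets trimmed, as above.
  std::vector<std::string> forms;
  size_t start = 0, end;
  while ((end = line.find(" | ", start)) != std::string::npos)
  {
    forms.push_back(line.substr(start, end - start));
    start = end + 3;
  }
  forms.push_back(line.substr(start));

  for (std::string& f : forms)
  {
    size_t sym = f.find_first_of(",?");
    if (sym != std::string::npos)
    {
      f = f.substr(0, sym);
    }
  }

  std::cout << infinitive << " (" << type << "): "
            << forms[0] << ", " << forms[2] << ", " << forms[3] << std::endl;
}

With four slots this corresponds to the past tense / past participle / -ing form / -s form case handled above.
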
1393 }
1394
1395 // ant table
1396 {
1397 std::ifstream wnantfile(wnpref + "wn_ant.pl");
1398 if (!wnantfile.is_open())
1399 {
1400 std::cout << "Invalid WordNet data directory." << std::endl;
1401 print_usage();
1402 }
1403
1404 std::list<std::string> lines;
1405 for (;;)
1406 {
1407 std::string line;
1408 if (!getline(wnantfile, line))
1409 {
1410 break;
1411 }
1412 487
1413 if (line.back() == '\r') 488 void generator::readPrepositions()
1414 {
1415 line.pop_back();
1416 }
1417
1418 lines.push_back(line);
1419 }
1420
1421 progress ppgs("Writing antonyms...", lines.size());
1422 for (auto line : lines)
1423 { 489 {
1424 ppgs.update(); 490 std::list<std::string> lines(readFile("prepositions.txt"));
491 progress ppgs("Reading prepositions...", lines.size());
1425 492
1426 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); 493 for (std::string line : lines)
1427 std::smatch relation_data;
1428 if (!std::regex_search(line, relation_data, relation))
1429 {
1430 continue;
1431 }
1432
1433 int synset_id_1 = stoi(relation_data[1]);
1434 int wnum_1 = stoi(relation_data[2]);
1435 int synset_id_2 = stoi(relation_data[3]);
1436 int wnum_2 = stoi(relation_data[4]);
1437
1438 std::string query;
1439 switch (synset_id_1 / 100000000)
1440 { 494 {
1441 case 1: // Noun 495 ppgs.update();
1442 {
1443 query = "INSERT INTO noun_antonymy (noun_1_id, noun_2_id) VALUES (?, ?)";
1444 496
1445 break; 497 std::regex relation("^([^:]+): (.+)");
1446 } 498 std::smatch relation_data;
1447 499 std::regex_search(line, relation_data, relation);
1448 case 2: // Verb 500 std::string prep = relation_data[1];
1449 { 501 auto groups = split<std::list<std::string>>(relation_data[2], ", ");
1450 // Ignore
1451 502
1452 break; 503 notion& n = createNotion(part_of_speech::preposition);
1453 } 504 lemma& l = lookupOrCreateLemma(prep);
1454 505 word& w = createWord(n, l);
1455 case 3: // Adjective
1456 {
1457 query = "INSERT INTO adjective_antonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)";
1458 506
1459 break; 507 n.setPrepositionGroups(groups);
1460 }
1461
1462 case 4: // Adverb
1463 {
1464 query = "INSERT INTO adverb_antonymy (adverb_1_id, adverb_2_id) VALUES (?, ?)";
1465
1466 break;
1467 }
1468 }
1469
1470 sqlite3_stmt* ppstmt;
1471 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
1472 {
1473 db_error(ppdb, query);
1474 }
1475
1476 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1477 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1478
1479 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1480 {
1481 db_error(ppdb, query);
1482 }
1483
1484 sqlite3_finalize(ppstmt);
1485 }
1486 }
1487
1488 // at table
1489 {
1490 std::ifstream wnatfile(wnpref + "wn_at.pl");
1491 if (!wnatfile.is_open())
1492 {
1493 std::cout << "Invalid WordNet data directory." << std::endl;
1494 print_usage();
1495 }
1496
1497 std::list<std::string> lines;
1498 for (;;)
1499 {
1500 std::string line;
1501 if (!getline(wnatfile, line))
1502 {
1503 break;
1504 } 508 }
1505
1506 if (line.back() == '\r')
1507 {
1508 line.pop_back();
1509 }
1510
1511 lines.push_back(line);
1512 } 509 }
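
prepositions.txt is expected to hold one preposition per line followed by the comma-separated preposition groups it belongs to. A small illustration of the capture performed by the regex above; the line and the group names are only illustrative, not taken from the shipped data file:

#include <iostream>
#include <regex>
#include <string>

int main()
{
  // Illustrative only; the real file ships alongside the generator.
  std::string line = "towards: dir, dest_dir";

  std::regex relation("^([^:]+): (.+)");
  std::smatch relation_data;
  if (std::regex_search(line, relation_data, relation))
  {
    std::cout << "preposition: " << relation_data[1] << std::endl; // "towards"
    std::cout << "groups:      " << relation_data[2] << std::endl; // "dir, dest_dir"
  }
}
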
1513 510
1514 progress ppgs("Writing variations...", lines.size()); 511 void generator::readCmudictPronunciations()
1515 for (auto line : lines)
1516 { 512 {
1517 ppgs.update(); 513 std::list<std::string> lines(readFile(cmudictPath_));
514 progress ppgs("Reading pronunciations from CMUDICT...", lines.size());
1518 515
1519 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\."); 516 for (std::string line : lines)
1520 std::smatch relation_data;
1521 if (!std::regex_search(line, relation_data, relation))
1522 { 517 {
1523 continue; 518 ppgs.update();
1524 } 519
1525 520 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)");
1526 int synset_id_1 = stoi(relation_data[1]); 521 std::smatch phoneme_data;
1527 int synset_id_2 = stoi(relation_data[2]); 522 if (std::regex_search(line, phoneme_data, phoneme))
1528 std::string query("INSERT INTO variation (noun_id, adjective_id) VALUES (?, ?)");
1529
1530 for (auto mapping1 : wn[synset_id_1])
1531 {
1532 for (auto mapping2 : wn[synset_id_2])
1533 { 523 {
1534 sqlite3_stmt* ppstmt; 524 std::string canonical(phoneme_data[1]);
1535 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 525 std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
1536 {
1537 db_error(ppdb, query);
1538 }
1539
1540 sqlite3_bind_int(ppstmt, 1, mapping1.second);
1541 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1542 526
1543 if (sqlite3_step(ppstmt) != SQLITE_DONE) 527 if (!formByText_.count(canonical))
1544 { 528 {
1545 db_error(ppdb, query); 529 continue;
1546 } 530 }
1547 531
1548 sqlite3_finalize(ppstmt); 532 std::string phonemes = phoneme_data[2];
533 pronunciations_.emplace_back(phonemes);
534 pronunciation& p = pronunciations_.back();
535 formByText_.at(canonical)->addPronunciation(p);
1549 } 536 }
1550 } 537 }
1551 } 538 }
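
CMUdict pairs an uppercase headword, optionally suffixed with "(n)" for alternate pronunciations, with a space-separated ARPABET phoneme string. A short sketch of how the regex above splits such a line; the entries are typical CMUdict-style examples with the separator simplified to a single space:

#include <iostream>
#include <regex>
#include <string>

int main()
{
  // "(1)" marks an alternate pronunciation of the same headword.
  std::string lines[] = {
    "TOMATO T AH0 M EY1 T OW2",
    "TOMATO(1) T AH0 M AA1 T OW2"
  };

  std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)");

  for (const std::string& line : lines)
  {
    std::smatch phoneme_data;
    if (std::regex_search(line, phoneme_data, phoneme))
    {
      // Both variants resolve to the headword "TOMATO", so both
      // pronunciations get attached to the same form.
      std::cout << phoneme_data[1] << " -> " << phoneme_data[2] << std::endl;
    }
  }
}
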
1552 }
1553
1554 // der table
1555 {
1556 std::ifstream wnderfile(wnpref + "wn_der.pl");
1557 if (!wnderfile.is_open())
1558 {
1559 std::cout << "Invalid WordNet data directory." << std::endl;
1560 print_usage();
1561 }
1562 539
1563 std::list<std::string> lines; 540 void generator::writeSchema()
1564 for (;;)
1565 { 541 {
1566 std::string line; 542 std::ifstream file("schema.sql");
1567 if (!getline(wnderfile, line)) 543 if (!file)
1568 { 544 {
1569 break; 545 throw std::invalid_argument("Could not find database schema");
1570 } 546 }
1571 547
1572 if (line.back() == '\r') 548 std::ostringstream schemaBuilder;
549 std::string line;
550 while (std::getline(file, line))
1573 { 551 {
1574 line.pop_back(); 552 if (line.back() == '\r')
553 {
554 line.pop_back();
555 }
556
557 schemaBuilder << line;
1575 } 558 }
1576 559
1577 lines.push_back(line); 560 std::string schema = schemaBuilder.str();
561 auto queries = split<std::list<std::string>>(schema, ";");
562 progress ppgs("Writing database schema...", queries.size());
563 for (std::string query : queries)
564 {
565 if (!queries.empty())
566 {
567 db_.runQuery(query);
568 }
569
570 ppgs.update();
571 }
1578 } 572 }
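
database::runQuery itself is defined in database.cpp rather than here. Assuming it is a thin wrapper over the usual prepare/step/finalize cycle, executing one schema statement looks roughly like the sketch below (hypothetical helper, error handling reduced to a single throw):

#include <sqlite3.h>
#include <stdexcept>
#include <string>

// Sketch of a runQuery-style helper; the real implementation lives in
// database.cpp and reports failures through its own exception type.
void runStatement(sqlite3* ppdb, const std::string& query)
{
  sqlite3_stmt* ppstmt;
  if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
  {
    throw std::runtime_error(sqlite3_errmsg(ppdb));
  }

  if (sqlite3_step(ppstmt) != SQLITE_DONE)
  {
    sqlite3_finalize(ppstmt);
    throw std::runtime_error(sqlite3_errmsg(ppdb));
  }

  sqlite3_finalize(ppstmt);
}
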
1579 573
1580 progress ppgs("Writing morphological derivation...", lines.size()); 574 void generator::dumpObjects()
1581 for (auto line : lines)
1582 { 575 {
1583 ppgs.update();
1584
1585 std::regex relation("^der\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
1586 std::smatch relation_data;
1587 if (!std::regex_search(line, relation_data, relation))
1588 { 576 {
1589 continue; 577 progress ppgs("Writing notions...", notions_.size());
578
579 for (notion& n : notions_)
580 {
581 db_ << n;
582
583 ppgs.update();
584 }
1590 } 585 }
1591 586
1592 int synset_id_1 = stoi(relation_data[1]);
1593 int wnum_1 = stoi(relation_data[2]);
1594 int synset_id_2 = stoi(relation_data[3]);
1595 int wnum_2 = stoi(relation_data[4]);
1596 std::string query;
1597 switch (synset_id_1 / 100000000)
1598 { 587 {
1599 case 1: // Noun 588 progress ppgs("Writing words...", words_.size());
589
590 for (word& w : words_)
1600 { 591 {
1601 switch (synset_id_2 / 100000000) 592 db_ << w;
1602 {
1603 case 1: // Noun
1604 {
1605 query = "INSERT INTO noun_noun_derivation (noun_1_id, noun_2_id) VALUES (?, ?)";
1606 break;
1607 }
1608
1609 case 3: // Adjective
1610 {
1611 query = "INSERT INTO noun_adjective_derivation (noun_id, adjective_id) VALUES (?, ?)";
1612 break;
1613 }
1614
1615 case 4: // Adverb
1616 {
1617 query = "INSERT INTO noun_adverb_derivation (noun_id, adverb_id) VALUES (?, ?)";
1618 break;
1619 }
1620 }
1621 593
1622 break; 594 ppgs.update();
1623 } 595 }
596 }
597
598 {
599 progress ppgs("Writing lemmas...", lemmas_.size());
1624 600
1625 case 3: // Adjective 601 for (lemma& l : lemmas_)
1626 { 602 {
1627 switch (synset_id_2 / 100000000) 603 db_ << l;
1628 {
1629 case 1: // Noun
1630 {
1631 query = "INSERT INTO noun_adjective_derivation (adjective_id, noun_id) VALUES (?, ?)";
1632 break;
1633 }
1634
1635 case 3: // Adjective
1636 {
1637 query = "INSERT INTO adjective_adjective_derivation (adjective_id, adjective_id) VALUES (?, ?)";
1638 break;
1639 }
1640
1641 case 4: // Adverb
1642 {
1643 query = "INSERT INTO adjective_adverb_derivation (adjective_id, adverb_id) VALUES (?, ?)";
1644 break;
1645 }
1646 }
1647 604
1648 break; 605 ppgs.update();
1649 } 606 }
607 }
608
609 {
610 progress ppgs("Writing forms...", forms_.size());
1650 611
1651 case 4: // Adverb 612 for (form& f : forms_)
1652 { 613 {
1653 switch (synset_id_2 / 100000000) 614 db_ << f;
1654 {
1655 case 1: // Noun
1656 {
1657 query = "INSERT INTO noun_adverb_derivation (adverb_id, noun_id) VALUES (?, ?)";
1658 break;
1659 }
1660
1661 case 3: // Adjective
1662 {
1663 query = "INSERT INTO adjective_adverb_derivation (adverb_id, adjective_id) VALUES (?, ?)";
1664 break;
1665 }
1666
1667 case 4: // Adverb
1668 {
1669 query = "INSERT INTO adverb_adverb_derivation (adverb_1_id, adverb_2_id) VALUES (?, ?)";
1670 break;
1671 }
1672 }
1673 615
1674 break; 616 ppgs.update();
1675 } 617 }
1676 } 618 }
1677 619
1678 sqlite3_stmt* ppstmt;
1679 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
1680 { 620 {
1681 db_error(ppdb, query); 621 progress ppgs("Writing pronunciations...", pronunciations_.size());
622
623 for (pronunciation& p : pronunciations_)
624 {
625 db_ << p;
626
627 ppgs.update();
628 }
1682 } 629 }
1683 630
1684 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
1685 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
1686
1687 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1688 { 631 {
1689 db_error(ppdb, query); 632 progress ppgs("Writing verb groups...", groups_.size());
633
634 for (group& g : groups_)
635 {
636 db_ << g;
637
638 ppgs.update();
639 }
1690 } 640 }
1691 641
1692 sqlite3_finalize(ppstmt);
1693 }
1694 }
1695
1696 // hyp table
1697 {
1698 std::ifstream wnhypfile(wnpref + "wn_hyp.pl");
1699 if (!wnhypfile.is_open())
1700 {
1701 std::cout << "Invalid WordNet data directory." << std::endl;
1702 print_usage();
1703 }
1704
1705 std::list<std::string> lines;
1706 for (;;)
1707 {
1708 std::string line;
1709 if (!getline(wnhypfile, line))
1710 {
1711 break;
1712 }
1713
1714 if (line.back() == '\r')
1715 { 642 {
1716 line.pop_back(); 643 progress ppgs("Writing verb frames...", frames_.size());
644
645 for (frame& f : frames_)
646 {
647 db_ << f;
648
649 ppgs.update();
650 }
1717 } 651 }
1718
1719 lines.push_back(line);
1720 } 652 }
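
dumpObjects() streams each model object into the database with operator<<. A plausible shape for one of those overloads, assuming it just collects the object's columns into field values and defers to insertIntoTable; the table name, column names, and the reduction to two columns are illustrative assumptions, not the generator's actual serialization:

#include <list>
#include <utility>
#include "database.h"
#include "field.h"
#include "notion.h"

// Sketch only: the real overloads are declared alongside each model class
// and write the full set of columns.
database& operator<<(database& db, const notion& arg)
{
  std::list<field> fields;
  fields.emplace_back("notion_id", arg.getId());
  fields.emplace_back("part_of_speech", static_cast<int>(arg.getPartOfSpeech()));

  db.insertIntoTable("notions", std::move(fields));

  return db;
}
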
1721 653
1722 progress ppgs("Writing hypernyms...", lines.size()); 654 void generator::readWordNetAntonymy()
1723 for (auto line : lines)
1724 { 655 {
1725 ppgs.update(); 656 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl"));
1726 657 progress ppgs("Writing antonyms...", lines.size());
1727 std::regex relation("^hyp\\((1\\d{8}),(1\\d{8})\\)\\."); 658 for (auto line : lines)
1728 std::smatch relation_data;
1729 if (!std::regex_search(line, relation_data, relation))
1730 { 659 {
1731 continue; 660 ppgs.update();
1732 }
1733
1734 int synset_id_1 = stoi(relation_data[1]);
1735 int synset_id_2 = stoi(relation_data[2]);
1736 std::string query("INSERT INTO hypernymy (hyponym_id, hypernym_id) VALUES (?, ?)");
1737 661
1738 for (auto mapping1 : wn[synset_id_1]) 662 std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
1739 { 663 std::smatch relation_data;
1740 for (auto mapping2 : wn[synset_id_2]) 664 if (!std::regex_search(line, relation_data, relation))
1741 { 665 {
1742 sqlite3_stmt* ppstmt; 666 continue;
1743 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 667 }
1744 { 668
1745 db_error(ppdb, query); 669 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
1746 } 670 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1747 671
1748 sqlite3_bind_int(ppstmt, 1, mapping1.second); 672 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1749 sqlite3_bind_int(ppstmt, 2, mapping2.second); 673 {
674 word& word1 = *wordByWnidAndWnum_.at(lookup1);
675 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1750 676
1751 if (sqlite3_step(ppstmt) != SQLITE_DONE) 677 std::list<field> fields;
1752 { 678 fields.emplace_back("antonym_1_id", word1.getId());
1753 db_error(ppdb, query); 679 fields.emplace_back("antonym_2_id", word2.getId());
1754 }
1755 680
1756 sqlite3_finalize(ppstmt); 681 db_.insertIntoTable("antonymy", std::move(fields));
1757 } 682 }
1758 } 683 }
1759 } 684 }
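
The WordNet prolog files encode each relation as a Prolog fact; for antonymy the arguments are two (synset id, word number) pairs, and the leading digit of a synset id encodes its part of speech (1 noun, 2 verb, 3 adjective, 4 adverb). A small demonstration of what the regex above captures, with made-up ids of the right shape:

#include <iostream>
#include <regex>
#include <string>

int main()
{
  // Made-up adjective synset ids; only the format matters here.
  std::string line = "ant(301234567,2,301234568,1).";

  std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
  std::smatch relation_data;
  if (std::regex_search(line, relation_data, relation))
  {
    std::cout << "synset " << relation_data[1] << " word " << relation_data[2]
              << " <-> synset " << relation_data[3] << " word " << relation_data[4]
              << std::endl;
  }
}
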
1760 }
1761
1762 // ins table
1763 {
1764 std::ifstream wninsfile(wnpref + "wn_ins.pl");
1765 if (!wninsfile.is_open())
1766 {
1767 std::cout << "Invalid WordNet data directory." << std::endl;
1768 print_usage();
1769 }
1770
1771 std::list<std::string> lines;
1772 for (;;)
1773 {
1774 std::string line;
1775 if (!getline(wninsfile, line))
1776 {
1777 break;
1778 }
1779 685
1780 if (line.back() == '\r') 686 void generator::readWordNetVariation()
687 {
688 std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl"));
689 progress ppgs("Writing variation...", lines.size());
690 for (auto line : lines)
1781 { 691 {
1782 line.pop_back(); 692 ppgs.update();
1783 }
1784 693
1785 lines.push_back(line); 694 std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\.");
695 std::smatch relation_data;
696 if (!std::regex_search(line, relation_data, relation))
697 {
698 continue;
699 }
700
701 int lookup1 = std::stoi(relation_data[1]);
702 int lookup2 = std::stoi(relation_data[2]);
703
704 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
705 {
706 notion& notion1 = *notionByWnid_.at(lookup1);
707 notion& notion2 = *notionByWnid_.at(lookup2);
708
709 std::list<field> fields;
710 fields.emplace_back("noun_id", notion1.getId());
711 fields.emplace_back("adjective_id", notion2.getId());
712
713 db_.insertIntoTable("variation", std::move(fields));
714 }
715 }
1786 } 716 }
1787 717
1788 progress ppgs("Writing instantiations...", lines.size()); 718 void generator::readWordNetClasses()
1789 for (auto line : lines)
1790 { 719 {
1791 ppgs.update(); 720 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl"));
1792 721 progress ppgs("Writing usage, topicality, and regionality...", lines.size());
1793 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\."); 722 for (auto line : lines)
1794 std::smatch relation_data;
1795 if (!std::regex_search(line, relation_data, relation))
1796 { 723 {
1797 continue; 724 ppgs.update();
1798 }
1799
1800 int synset_id_1 = stoi(relation_data[1]);
1801 int synset_id_2 = stoi(relation_data[2]);
1802 std::string query("INSERT INTO instantiation (instance_id, class_id) VALUES (?, ?)");
1803 725
1804 for (auto mapping1 : wn[synset_id_1]) 726 std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\.");
1805 { 727 std::smatch relation_data;
1806 for (auto mapping2 : wn[synset_id_2]) 728 if (!std::regex_search(line, relation_data, relation))
729 {
730 continue;
731 }
732
733 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
734 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
735 std::string class_type = relation_data[5];
736
737 std::string table_name;
738 if (class_type == "t")
739 {
740 table_name += "topicality";
741 } else if (class_type == "u")
742 {
743 table_name += "usage";
744 } else if (class_type == "r")
745 {
746 table_name += "regionality";
747 }
748
749 std::list<int> leftJoin;
750 std::list<int> rightJoin;
751
752 if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first)))
1807 { 753 {
1808 sqlite3_stmt* ppstmt; 754 std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) {
1809 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 755 return w->getId();
756 });
757 } else if (wordByWnidAndWnum_.count(lookup1)) {
758 leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId());
759 }
760
761 if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first)))
762 {
763 std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) {
764 return w->getId();
765 });
766 } else if (wordByWnidAndWnum_.count(lookup2)) {
767 rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId());
768 }
769
770 for (int word1 : leftJoin)
771 {
772 for (int word2 : rightJoin)
1810 { 773 {
1811 db_error(ppdb, query); 774 std::list<field> fields;
1812 } 775 fields.emplace_back("term_id", word1);
776 fields.emplace_back("domain_id", word2);
1813 777
1814 sqlite3_bind_int(ppstmt, 1, mapping1.second); 778 db_.insertIntoTable(table_name, std::move(fields));
1815 sqlite3_bind_int(ppstmt, 2, mapping2.second);
1816
1817 if (sqlite3_step(ppstmt) != SQLITE_DONE)
1818 {
1819 db_error(ppdb, query);
1820 } 779 }
1821
1822 sqlite3_finalize(ppstmt);
1823 } 780 }
1824 } 781 }
1825 } 782 }
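
Unlike the other pairwise relations, cls entries may carry a word number of 0, meaning the domain label applies to every word in the synset; that is why the code above fans out through wordsByWnid_ in the 0 case and falls back to the single wordByWnidAndWnum_ entry otherwise. An illustration of the captures, with made-up ids:

#include <iostream>
#include <regex>
#include <string>

int main()
{
  // Made-up ids; the trailing letter selects the target table:
  // topicality (t), usage (u), or regionality (r).
  std::string line = "cls(102345678,0,106789012,1,t).";

  std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\.");
  std::smatch relation_data;
  if (std::regex_search(line, relation_data, relation))
  {
    // Word number 0 in the first pair: the whole synset is classified.
    std::cout << relation_data[1] << " (word " << relation_data[2] << ")"
              << " -> domain " << relation_data[3]
              << " [" << relation_data[5] << "]" << std::endl;
  }
}
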
1826 }
1827
1828 // mm table
1829 {
1830 std::ifstream wnmmfile(wnpref + "wn_mm.pl");
1831 if (!wnmmfile.is_open())
1832 {
1833 std::cout << "Invalid WordNet data directory." << std::endl;
1834 print_usage();
1835 }
1836
1837 std::list<std::string> lines;
1838 for (;;)
1839 {
1840 std::string line;
1841 if (!getline(wnmmfile, line))
1842 {
1843 break;
1844 }
1845 783
1846 if (line.back() == '\r') 784 void generator::readWordNetCausality()
785 {
786 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl"));
787 progress ppgs("Writing causality...", lines.size());
788 for (auto line : lines)
1847 { 789 {
1848 line.pop_back(); 790 ppgs.update();
1849 }
1850 791
1851 lines.push_back(line); 792 std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\.");
793 std::smatch relation_data;
794 if (!std::regex_search(line, relation_data, relation))
795 {
796 continue;
797 }
798
799 int lookup1 = std::stoi(relation_data[1]);
800 int lookup2 = std::stoi(relation_data[2]);
801
802 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
803 {
804 notion& notion1 = *notionByWnid_.at(lookup1);
805 notion& notion2 = *notionByWnid_.at(lookup2);
806
807 std::list<field> fields;
808 fields.emplace_back("effect_id", notion1.getId());
809 fields.emplace_back("cause_id", notion2.getId());
810
811 db_.insertIntoTable("causality", std::move(fields));
812 }
813 }
1852 } 814 }
1853 815
1854 progress ppgs("Writing member meronyms...", lines.size()); 816 void generator::readWordNetEntailment()
1855 for (auto line : lines)
1856 { 817 {
1857 ppgs.update(); 818 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl"));
1858 819 progress ppgs("Writing entailment...", lines.size());
1859 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); 820 for (auto line : lines)
1860 std::smatch relation_data;
1861 if (!std::regex_search(line, relation_data, relation))
1862 { 821 {
1863 continue; 822 ppgs.update();
1864 }
1865 823
1866 int synset_id_1 = stoi(relation_data[1]); 824 std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\.");
1867 int synset_id_2 = stoi(relation_data[2]); 825 std::smatch relation_data;
1868 std::string query("INSERT INTO member_meronymy (holonym_id, meronym_id) VALUES (?, ?)"); 826 if (!std::regex_search(line, relation_data, relation))
1869
1870 for (auto mapping1 : wn[synset_id_1])
1871 {
1872 for (auto mapping2 : wn[synset_id_2])
1873 { 827 {
1874 sqlite3_stmt* ppstmt; 828 continue;
1875 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 829 }
1876 { 830
1877 db_error(ppdb, query); 831 int lookup1 = std::stoi(relation_data[1]);
1878 } 832 int lookup2 = std::stoi(relation_data[2]);
1879 833
1880 sqlite3_bind_int(ppstmt, 1, mapping1.second); 834 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
1881 sqlite3_bind_int(ppstmt, 2, mapping2.second); 835 {
836 notion& notion1 = *notionByWnid_.at(lookup1);
837 notion& notion2 = *notionByWnid_.at(lookup2);
1882 838
1883 if (sqlite3_step(ppstmt) != SQLITE_DONE) 839 std::list<field> fields;
1884 { 840 fields.emplace_back("given_id", notion1.getId());
1885 db_error(ppdb, query); 841 fields.emplace_back("entailment_id", notion2.getId());
1886 }
1887 842
1888 sqlite3_finalize(ppstmt); 843 db_.insertIntoTable("entailment", std::move(fields));
1889 } 844 }
1890 } 845 }
1891 } 846 }
1892 } 847
1893 848 void generator::readWordNetHypernymy()
1894 // ms table
1895 {
1896 std::ifstream wnmsfile(wnpref + "wn_ms.pl");
1897 if (!wnmsfile.is_open())
1898 {
1899 std::cout << "Invalid WordNet data directory." << std::endl;
1900 print_usage();
1901 }
1902
1903 std::list<std::string> lines;
1904 for (;;)
1905 { 849 {
1906 std::string line; 850 std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl"));
1907 if (!getline(wnmsfile, line)) 851 progress ppgs("Writing hypernymy...", lines.size());
852 for (auto line : lines)
1908 { 853 {
1909 break; 854 ppgs.update();
855
856 std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\.");
857 std::smatch relation_data;
858 if (!std::regex_search(line, relation_data, relation))
859 {
860 continue;
861 }
862
863 int lookup1 = std::stoi(relation_data[1]);
864 int lookup2 = std::stoi(relation_data[2]);
865
866 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
867 {
868 notion& notion1 = *notionByWnid_.at(lookup1);
869 notion& notion2 = *notionByWnid_.at(lookup2);
870
871 std::list<field> fields;
872 fields.emplace_back("hyponym_id", notion1.getId());
873 fields.emplace_back("hypernym_id", notion2.getId());
874
875 db_.insertIntoTable("hypernymy", std::move(fields));
876 }
1910 } 877 }
878 }
1911 879
1912 if (line.back() == '\r') 880 void generator::readWordNetInstantiation()
881 {
882 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl"));
883 progress ppgs("Writing instantiation...", lines.size());
884 for (auto line : lines)
1913 { 885 {
1914 line.pop_back(); 886 ppgs.update();
1915 }
1916 887
1917 lines.push_back(line); 888 std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\.");
889 std::smatch relation_data;
890 if (!std::regex_search(line, relation_data, relation))
891 {
892 continue;
893 }
894
895 int lookup1 = std::stoi(relation_data[1]);
896 int lookup2 = std::stoi(relation_data[2]);
897
898 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
899 {
900 notion& notion1 = *notionByWnid_.at(lookup1);
901 notion& notion2 = *notionByWnid_.at(lookup2);
902
903 std::list<field> fields;
904 fields.emplace_back("instance_id", notion1.getId());
905 fields.emplace_back("class_id", notion2.getId());
906
907 db_.insertIntoTable("instantiation", std::move(fields));
908 }
909 }
1918 } 910 }
1919 911
1920 progress ppgs("Writing substance meronyms...", lines.size()); 912 void generator::readWordNetMemberMeronymy()
1921 for (auto line : lines)
1922 { 913 {
1923 ppgs.update(); 914 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl"));
1924 915 progress ppgs("Writing member meronymy...", lines.size());
1925 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); 916 for (auto line : lines)
1926 std::smatch relation_data;
1927 if (!std::regex_search(line, relation_data, relation))
1928 { 917 {
1929 continue; 918 ppgs.update();
1930 }
1931
1932 int synset_id_1 = stoi(relation_data[1]);
1933 int synset_id_2 = stoi(relation_data[2]);
1934 std::string query("INSERT INTO substance_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
1935 919
1936 for (auto mapping1 : wn[synset_id_1]) 920 std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\.");
1937 { 921 std::smatch relation_data;
1938 for (auto mapping2 : wn[synset_id_2]) 922 if (!std::regex_search(line, relation_data, relation))
1939 { 923 {
1940 sqlite3_stmt* ppstmt; 924 continue;
1941 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 925 }
1942 { 926
1943 db_error(ppdb, query); 927 int lookup1 = std::stoi(relation_data[1]);
1944 } 928 int lookup2 = std::stoi(relation_data[2]);
929
930 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
931 {
932 notion& notion1 = *notionByWnid_.at(lookup1);
933 notion& notion2 = *notionByWnid_.at(lookup2);
1945 934
1946 sqlite3_bind_int(ppstmt, 1, mapping1.second); 935 std::list<field> fields;
1947 sqlite3_bind_int(ppstmt, 2, mapping2.second); 936 fields.emplace_back("holonym_id", notion1.getId());
937 fields.emplace_back("meronym_id", notion2.getId());
1948 938
1949 if (sqlite3_step(ppstmt) != SQLITE_DONE) 939 db_.insertIntoTable("member_meronymy", std::move(fields));
1950 {
1951 db_error(ppdb, query);
1952 }
1953
1954 sqlite3_finalize(ppstmt);
1955 } 940 }
1956 } 941 }
1957 } 942 }
1958 } 943
1959 944 void generator::readWordNetPartMeronymy()
1960 // mm table
1961 {
1962 std::ifstream wnmpfile(wnpref + "wn_mp.pl");
1963 if (!wnmpfile.is_open())
1964 {
1965 std::cout << "Invalid WordNet data directory." << std::endl;
1966 print_usage();
1967 }
1968
1969 std::list<std::string> lines;
1970 for (;;)
1971 { 945 {
1972 std::string line; 946 std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl"));
1973 if (!getline(wnmpfile, line)) 947 progress ppgs("Writing part meronymy...", lines.size());
948 for (auto line : lines)
1974 { 949 {
1975 break; 950 ppgs.update();
951
952 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\.");
953 std::smatch relation_data;
954 if (!std::regex_search(line, relation_data, relation))
955 {
956 continue;
957 }
958
959 int lookup1 = std::stoi(relation_data[1]);
960 int lookup2 = std::stoi(relation_data[2]);
961
962 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
963 {
964 notion& notion1 = *notionByWnid_.at(lookup1);
965 notion& notion2 = *notionByWnid_.at(lookup2);
966
967 std::list<field> fields;
968 fields.emplace_back("holonym_id", notion1.getId());
969 fields.emplace_back("meronym_id", notion2.getId());
970
971 db_.insertIntoTable("part_meronymy", std::move(fields));
972 }
1976 } 973 }
974 }
1977 975
1978 if (line.back() == '\r') 976 void generator::readWordNetSubstanceMeronymy()
977 {
978 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl"));
979 progress ppgs("Writing substance meronymy...", lines.size());
980 for (auto line : lines)
1979 { 981 {
1980 line.pop_back(); 982 ppgs.update();
1981 }
1982 983
1983 lines.push_back(line); 984 std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\.");
985 std::smatch relation_data;
986 if (!std::regex_search(line, relation_data, relation))
987 {
988 continue;
989 }
990
991 int lookup1 = std::stoi(relation_data[1]);
992 int lookup2 = std::stoi(relation_data[2]);
993
994 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
995 {
996 notion& notion1 = *notionByWnid_.at(lookup1);
997 notion& notion2 = *notionByWnid_.at(lookup2);
998
999 std::list<field> fields;
1000 fields.emplace_back("holonym_id", notion1.getId());
1001 fields.emplace_back("meronym_id", notion2.getId());
1002
1003 db_.insertIntoTable("substance_meronymy", std::move(fields));
1004 }
1005 }
1984 } 1006 }
1985 1007
1986 progress ppgs("Writing part meronyms...", lines.size()); 1008 void generator::readWordNetPertainymy()
1987 for (auto line : lines)
1988 { 1009 {
1989 ppgs.update(); 1010 std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl"));
1990 1011 progress ppgs("Writing pertainymy and mannernymy...", lines.size());
1991 std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); 1012 for (auto line : lines)
1992 std::smatch relation_data;
1993 if (!std::regex_search(line, relation_data, relation))
1994 { 1013 {
1995 continue; 1014 ppgs.update();
1996 }
1997
1998 int synset_id_1 = stoi(relation_data[1]);
1999 int synset_id_2 = stoi(relation_data[2]);
2000 std::string query("INSERT INTO part_meronymy (holonym_id, meronym_id) VALUES (?, ?)");
2001 1015
2002 for (auto mapping1 : wn[synset_id_1]) 1016 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\.");
2003 { 1017 std::smatch relation_data;
2004 for (auto mapping2 : wn[synset_id_2]) 1018 if (!std::regex_search(line, relation_data, relation))
2005 { 1019 {
2006 sqlite3_stmt* ppstmt; 1020 continue;
2007 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK) 1021 }
2008 { 1022
2009 db_error(ppdb, query); 1023 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
2010 } 1024 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1025
1026 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1027 {
1028 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1029 word& word2 = *wordByWnidAndWnum_.at(lookup2);
2011 1030
2012 sqlite3_bind_int(ppstmt, 1, mapping1.second); 1031 if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective)
2013 sqlite3_bind_int(ppstmt, 2, mapping2.second); 1032 {
1033 std::list<field> fields;
1034 fields.emplace_back("pertainym_id", word1.getId());
1035 fields.emplace_back("noun_id", word2.getId());
2014 1036
2015 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1037 db_.insertIntoTable("pertainymy", std::move(fields));
1038 } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb)
2016 { 1039 {
2017 db_error(ppdb, query); 1040 std::list<field> fields;
2018 } 1041 fields.emplace_back("mannernym_id", word1.getId());
1042 fields.emplace_back("adjective_id", word2.getId());
2019 1043
2020 sqlite3_finalize(ppstmt); 1044 db_.insertIntoTable("mannernymy", std::move(fields));
1045 }
2021 } 1046 }
2022 } 1047 }
2023 } 1048 }
2024 }
2025
2026 // per table
2027 {
2028 std::ifstream wnperfile(wnpref + "wn_per.pl");
2029 if (!wnperfile.is_open())
2030 {
2031 std::cout << "Invalid WordNet data directory." << std::endl;
2032 print_usage();
2033 }
2034
2035 std::list<std::string> lines;
2036 for (;;)
2037 {
2038 std::string line;
2039 if (!getline(wnperfile, line))
2040 {
2041 break;
2042 }
2043 1049
2044 if (line.back() == '\r') 1050 void generator::readWordNetSpecification()
1051 {
1052 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl"));
1053 progress ppgs("Writing specifications...", lines.size());
1054 for (auto line : lines)
2045 { 1055 {
2046 line.pop_back(); 1056 ppgs.update();
1057
1058 std::regex relation("^sa\\((23\\d{8}),(\\d+),(23\\d{8}),(\\d+)\\)\\.");
1059 std::smatch relation_data;
1060 if (!std::regex_search(line, relation_data, relation))
1061 {
1062 continue;
1063 }
1064
1065 std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
1066 std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
1067
1068 if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
1069 {
1070 word& word1 = *wordByWnidAndWnum_.at(lookup1);
1071 word& word2 = *wordByWnidAndWnum_.at(lookup2);
1072
1073 std::list<field> fields;
1074 fields.emplace_back("general_id", word1.getId());
1075 fields.emplace_back("specific_id", word2.getId());
1076
1077 db_.insertIntoTable("specification", std::move(fields));
1078 }
2047 } 1079 }
2048
2049 lines.push_back(line);
2050 } 1080 }
2051 1081
2052 progress ppgs("Writing pertainyms and mannernyms...", lines.size()); 1082 void generator::readWordNetSimilarity()
2053 for (auto line : lines)
2054 { 1083 {
2055 ppgs.update(); 1084 std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl"));
2056 1085 progress ppgs("Writing adjective similarity...", lines.size());
2057 std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\."); 1086 for (auto line : lines)
2058 std::smatch relation_data;
2059 if (!std::regex_search(line, relation_data, relation))
2060 { 1087 {
2061 continue; 1088 ppgs.update();
2062 }
2063 1089
2064 int synset_id_1 = stoi(relation_data[1]); 1090 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\.");
2065 int wnum_1 = stoi(relation_data[2]); 1091 std::smatch relation_data;
2066 int synset_id_2 = stoi(relation_data[3]); 1092 if (!std::regex_search(line, relation_data, relation))
2067 int wnum_2 = stoi(relation_data[4]);
2068 std::string query;
2069 switch (synset_id_1 / 100000000)
2070 {
2071 case 3: // Adjective
2072 { 1093 {
2073 // This is a pertainym, the second word should be a noun 1094 continue;
2074 // Technically it can be an adjective but we're ignoring that
2075 if (synset_id_2 / 100000000 != 1)
2076 {
2077 continue;
2078 }
2079
2080 query = "INSERT INTO pertainymy (pertainym_id, noun_id) VALUES (?, ?)";
2081
2082 break;
2083 } 1095 }
1096
1097 int lookup1 = std::stoi(relation_data[1]);
1098 int lookup2 = std::stoi(relation_data[2]);
2084 1099
2085 case 4: // Adverb 1100 if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
2086 { 1101 {
2087 // This is a mannernym, the second word should be an adjective 1102 notion& notion1 = *notionByWnid_.at(lookup1);
2088 if (synset_id_2 / 100000000 != 3) 1103 notion& notion2 = *notionByWnid_.at(lookup2);
2089 {
2090 continue;
2091 }
2092 1104
2093 query = "INSERT INTO mannernymy (mannernym_id, adjective_id) VALUES (?, ?)"; 1105 std::list<field> fields;
1106 fields.emplace_back("adjective_1_id", notion1.getId());
1107 fields.emplace_back("adjective_2_id", notion2.getId());
2094 1108
2095 break; 1109 db_.insertIntoTable("similarity", std::move(fields));
2096 } 1110 }
2097 } 1111 }
2098 1112 }
2099 sqlite3_stmt* ppstmt;
2100 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
2101 {
2102 db_error(ppdb, query);
2103 }
2104
2105 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]);
2106 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]);
2107 1113
2108 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1114 std::list<std::string> generator::readFile(std::string path)
1115 {
1116 std::ifstream file(path);
1117 if (!file)
2109 { 1118 {
2110 db_error(ppdb, query); 1119 throw std::invalid_argument("Could not find file " + path);
2111 } 1120 }
2112
2113 sqlite3_finalize(ppstmt);
2114 }
2115 }
2116 1121
2117 // sa table 1122 std::list<std::string> lines;
2118 {
2119 std::ifstream wnsafile(wnpref + "wn_sa.pl");
2120 if (!wnsafile.is_open())
2121 {
2122 std::cout << "Invalid WordNet data directory." << std::endl;
2123 print_usage();
2124 }
2125
2126 std::list<std::string> lines;
2127 for (;;)
2128 {
2129 std::string line; 1123 std::string line;
2130 if (!getline(wnsafile, line)) 1124 while (std::getline(file, line))
2131 {
2132 break;
2133 }
2134
2135 if (line.back() == '\r')
2136 { 1125 {
2137 line.pop_back(); 1126 if (line.back() == '\r')
1127 {
1128 line.pop_back();
1129 }
1130
1131 lines.push_back(line);
2138 } 1132 }
2139 1133
2140 lines.push_back(line); 1134 return lines;
2141 } 1135 }
2142 1136
2143 progress ppgs("Writing specifications...", lines.size()); 1137 part_of_speech generator::partOfSpeechByWnid(int wnid)
2144 for (auto line : lines)
2145 { 1138 {
2146 ppgs.update(); 1139 switch (wnid / 100000000)
2147
2148 std::regex relation("^per\\((3\\d{8}),(\\d+),(3\\d{8}),(\\d+)\\)\\.");
2149 std::smatch relation_data;
2150 if (!std::regex_search(line, relation_data, relation))
2151 {
2152 continue;
2153 }
2154
2155 int synset_id_1 = stoi(relation_data[1]);
2156 int wnum_1 = stoi(relation_data[2]);
2157 int synset_id_2 = stoi(relation_data[3]);
2158 int wnum_2 = stoi(relation_data[4]);
2159 std::string query("INSERT INTO specification (general_id, specific_id) VALUES (?, ?)");
2160
2161 sqlite3_stmt* ppstmt;
2162 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &ppstmt, NULL) != SQLITE_OK)
2163 { 1140 {
2164 db_error(ppdb, query); 1141 case 1: return part_of_speech::noun;
1142 case 2: return part_of_speech::verb;
1143 case 3: return part_of_speech::adjective;
1144 case 4: return part_of_speech::adverb;
1145 default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid));
2165 } 1146 }
1147 }
2166 1148
2167 sqlite3_bind_int(ppstmt, 1, wn[synset_id_1][wnum_1]); 1149 notion& generator::createNotion(part_of_speech partOfSpeech)
2168 sqlite3_bind_int(ppstmt, 2, wn[synset_id_2][wnum_2]); 1150 {
1151 notions_.emplace_back(partOfSpeech);
1152
1153 return notions_.back();
1154 }
2169 1155
2170 if (sqlite3_step(ppstmt) != SQLITE_DONE) 1156 notion& generator::lookupOrCreateNotion(int wnid)
1157 {
1158 if (!notionByWnid_.count(wnid))
2171 { 1159 {
2172 db_error(ppdb, query); 1160 notions_.emplace_back(partOfSpeechByWnid(wnid), wnid);
1161 notionByWnid_[wnid] = &notions_.back();
2173 } 1162 }
2174 1163
2175 sqlite3_finalize(ppstmt); 1164 return *notionByWnid_.at(wnid);
2176 }
2177 }
2178
2179 // sim table
2180 {
2181 std::ifstream wnsimfile(wnpref + "wn_sim.pl");
2182 if (!wnsimfile.is_open())
2183 {
2184 std::cout << "Invalid WordNet data directory." << std::endl;
2185 print_usage();
2186 } 1165 }
2187 1166
2188 std::list<std::string> lines; 1167 lemma& generator::lookupOrCreateLemma(std::string base_form)
2189 for (;;)
2190 { 1168 {
2191 std::string line; 1169 if (!lemmaByBaseForm_.count(base_form))
2192 if (!getline(wnsimfile, line))
2193 { 1170 {
2194 break; 1171 lemmas_.emplace_back(lookupOrCreateForm(base_form));
1172 lemmaByBaseForm_[base_form] = &lemmas_.back();
2195 } 1173 }
1174
1175 return *lemmaByBaseForm_.at(base_form);
1176 }
2196 1177
2197 if (line.back() == '\r') 1178 form& generator::lookupOrCreateForm(std::string text)
1179 {
1180 if (!formByText_.count(text))
2198 { 1181 {
2199 line.pop_back(); 1182 forms_.emplace_back(text);
1183 formByText_[text] = &forms_.back();
2200 } 1184 }
2201 1185
2202 lines.push_back(line); 1186 return *formByText_[text];
2203 } 1187 }
2204 1188
2205 progress ppgs("Writing sense synonyms...", lines.size()); 1189 template <typename... Args> word& generator::createWord(Args&&... args)
2206 for (auto line : lines)
2207 { 1190 {
2208 ppgs.update(); 1191 words_.emplace_back(std::forward<Args>(args)...);
1192 word& w = words_.back();
2209 1193
2210 std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\."); 1194 wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w);
2211 std::smatch relation_data; 1195
2212 if (!std::regex_search(line, relation_data, relation)) 1196 if (w.getNotion().hasWnid())
2213 { 1197 {
2214 continue; 1198 wordsByWnid_[w.getNotion().getWnid()].insert(&w);
2215 } 1199 }
2216 1200
2217 int synset_id_1 = stoi(relation_data[1]); 1201 return w;
2218 int synset_id_2 = stoi(relation_data[2]); 1202 }
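
lookupOrCreateNotion, lookupOrCreateLemma, lookupOrCreateForm, and createWord all hand out references and store raw pointers in the side indexes (notionByWnid_, lemmaByBaseForm_, formByText_, wordsByBaseForm_, wordsByWnid_). That only stays valid if the backing containers never relocate their elements, so they are presumably node-based containers such as std::list rather than std::vector. A minimal sketch of the pattern with hypothetical names:

#include <list>
#include <map>
#include <string>

struct item
{
  explicit item(std::string t) : text(std::move(t)) {}
  std::string text;
};

std::list<item> items;                    // std::list never invalidates
std::map<std::string, item*> itemByText;  // pointers to its elements

item& lookupOrCreateItem(const std::string& text)
{
  if (!itemByText.count(text))
  {
    items.emplace_back(text);             // construct the node in place...
    itemByText[text] = &items.back();     // ...then index a stable pointer
  }

  return *itemByText.at(text);
}
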
2219 std::string query("INSERT INTO adjective_synonymy (adjective_1_id, adjective_2_id) VALUES (?, ?)"); 1203
1204 group& generator::createGroup(xmlNodePtr top)
1205 {
1206 groups_.emplace_back();
1207 group& grp = groups_.back();
2220 1208
2221 for (auto mapping1 : wn[synset_id_1]) 1209 xmlChar* key;
1210
1211 for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
2222 { 1212 {
2223 for (auto mapping2 : wn[synset_id_2]) 1213 if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES")))
2224 { 1214 {
2225 sqlite3_stmt* ppstmt; 1215 for (xmlNodePtr subclass = node->xmlChildrenNode; subclass != nullptr; subclass = subclass->next)
2226 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
2227 { 1216 {
2228 db_error(ppdb, query); 1217 if (!xmlStrcmp(subclass->name, reinterpret_cast<const xmlChar*>("VNSUBCLASS")))
1218 {
1219 try
1220 {
1221 group& subgrp = createGroup(subclass);
1222 subgrp.setParent(grp);
1223 } catch (const std::exception& e)
1224 {
1225 key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID"));
1226
1227 if (key == nullptr)
1228 {
1229 std::throw_with_nested(std::logic_error("Error parsing IDless subgroup"));
1230 } else {
1231 std::string subgroupId(reinterpret_cast<const char*>(key));
1232 xmlFree(key);
1233
1234 std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId));
1235 }
1236 }
1237 }
2229 } 1238 }
2230 1239 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("MEMBERS")))
2231 sqlite3_bind_int(ppstmt, 1, mapping1.second); 1240 {
2232 sqlite3_bind_int(ppstmt, 2, mapping2.second); 1241 for (xmlNodePtr member = node->xmlChildrenNode; member != nullptr; member = member->next)
2233
2234 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2235 { 1242 {
2236 db_error(ppdb, query); 1243 if (!xmlStrcmp(member->name, reinterpret_cast<const xmlChar*>("MEMBER")))
1244 {
1245 key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn"));
1246 std::string wnSenses(reinterpret_cast<const char*>(key));
1247 xmlFree(key);
1248
1249 auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " ");
1250 if (!wnSenseKeys.empty())
1251 {
1252 std::list<std::string> tempKeys;
1253
1254 std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) {
1255 return sense + "::";
1256 });
1257
1258 std::list<std::string> filteredKeys;
1259
1260 std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) {
1261 return !wnSenseKeys_.count(sense);
1262 });
1263
1264 wnSenseKeys = std::move(filteredKeys);
1265 }
1266
1267 if (!wnSenseKeys.empty())
1268 {
1269 for (std::string sense : wnSenseKeys)
1270 {
1271 word& wordSense = *wnSenseKeys_[sense];
1272 wordSense.setVerbGroup(grp);
1273 }
1274 } else {
1275 key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name"));
1276 std::string memberName(reinterpret_cast<const char*>(key));
1277 xmlFree(key);
1278
1279 notion& n = createNotion(part_of_speech::verb);
1280 lemma& l = lookupOrCreateLemma(memberName);
1281 word& w = createWord(n, l);
1282
1283 w.setVerbGroup(grp);
1284 }
1285 }
2237 } 1286 }
2238 1287 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("THEMROLES")))
2239 sqlite3_reset(ppstmt); 1288 {
2240 sqlite3_clear_bindings(ppstmt); 1289 for (xmlNodePtr roletopnode = node->xmlChildrenNode; roletopnode != nullptr; roletopnode = roletopnode->next)
2241
2242 sqlite3_bind_int(ppstmt, 1, mapping2.second);
2243 sqlite3_bind_int(ppstmt, 2, mapping1.second);
2244
2245 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2246 { 1290 {
2247 db_error(ppdb, query); 1291 if (!xmlStrcmp(roletopnode->name, reinterpret_cast<const xmlChar*>("THEMROLE")))
1292 {
1293 role r;
1294
1295 key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type"));
1296 std::string roleName = reinterpret_cast<const char*>(key);
1297 xmlFree(key);
1298
1299 for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
1300 {
1301 if (!xmlStrcmp(rolenode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1302 {
1303 r.setSelrestrs(parseSelrestr(rolenode));
1304 }
1305 }
1306
1307 grp.addRole(roleName, std::move(r));
1308 }
2248 } 1309 }
1310 } else if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("FRAMES")))
1311 {
1312 for (xmlNodePtr frametopnode = node->xmlChildrenNode; frametopnode != nullptr; frametopnode = frametopnode->next)
1313 {
1314 if (!xmlStrcmp(frametopnode->name, reinterpret_cast<const xmlChar*>("FRAME")))
1315 {
1316 frames_.emplace_back();
1317 frame& fr = frames_.back();
2249 1318
2250 sqlite3_finalize(ppstmt); 1319 for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
1320 {
1321 if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX")))
1322 {
1323 for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
1324 {
1325 if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP")))
1326 {
1327 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1328 std::string partRole = reinterpret_cast<const char*>(key);
1329 xmlFree(key);
1330
1331 selrestr partSelrestrs;
1332 std::set<std::string> partSynrestrs;
1333
1334 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
1335 {
1336 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SYNRESTRS")))
1337 {
1338 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
1339 {
1340 if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SYNRESTR")))
1341 {
1342 key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type"));
1343 partSynrestrs.insert(reinterpret_cast<const char*>(key));
1344 xmlFree(key);
1345 }
1346 }
1347 }
1348
1349 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1350 {
1351 partSelrestrs = parseSelrestr(npnode);
1352 }
1353 }
1354
1355 fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs)));
1356 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB")))
1357 {
1358 fr.push_back(part::createVerb());
1359 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("PREP")))
1360 {
1361 std::set<std::string> partChoices;
1362 bool partLiteral;
1363
1364 if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")))
1365 {
1366 partLiteral = true;
1367
1368 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1369 std::string choicesStr = reinterpret_cast<const char*>(key);
1370 xmlFree(key);
1371
1372 split(choicesStr, " ", std::inserter(partChoices, std::end(partChoices)));
1373 } else {
1374 partLiteral = false;
1375
1376 for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
1377 {
1378 if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1379 {
1380 for (xmlNodePtr synrestr = npnode->xmlChildrenNode; synrestr != nullptr; synrestr = synrestr->next)
1381 {
1382 if (!xmlStrcmp(synrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1383 {
1384 key = xmlGetProp(synrestr, reinterpret_cast<const xmlChar*>("type"));
1385 partChoices.insert(reinterpret_cast<const char*>(key));
1386 xmlFree(key);
1387 }
1388 }
1389 }
1390 }
1391 }
1392
1393 fr.push_back(part::createPreposition(std::move(partChoices), partLiteral));
1394 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ")))
1395 {
1396 fr.push_back(part::createAdjective());
1397 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADV")))
1398 {
1399 fr.push_back(part::createAdverb());
1400 } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("LEX")))
1401 {
1402 key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
1403 std::string literalValue = reinterpret_cast<const char*>(key);
1404 xmlFree(key);
1405
1406 fr.push_back(part::createLiteral(literalValue));
1407 } else {
1408 continue;
1409 }
1410 }
1411
1412 grp.addFrame(fr);
1413 }
1414 }
1415 }
1416 }
1417 }
1418 }
1419
1420 return grp;
1421 }
1422
1423 selrestr generator::parseSelrestr(xmlNodePtr top)
1424 {
1425 xmlChar* key;
1426
1427 if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
1428 {
1429 if (xmlChildElementCount(top) == 0)
1430 {
1431 return {};
1432 } else if (xmlChildElementCount(top) == 1)
1433 {
1434 return parseSelrestr(xmlFirstElementChild(top));
1435 } else {
1436 bool orlogic = false;
1437 if (xmlHasProp(top, reinterpret_cast<const xmlChar*>("logic")))
1438 {
1439 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("logic"));
1440 if (!xmlStrcmp(key, reinterpret_cast<const xmlChar*>("or")))
1441 {
1442 orlogic = true;
1443 }
1444
1445 xmlFree(key);
1446 }
1447
1448 std::list<selrestr> children;
1449 for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next)
1450 {
1451 if (!xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTRS"))
1452 || !xmlStrcmp(selrestr->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1453 {
1454 children.push_back(parseSelrestr(selrestr));
1455 }
1456 }
1457
1458 return selrestr(children, orlogic);
1459 }
1460 } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
1461 {
1462 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("Value"));
1463 bool selPos = (std::string(reinterpret_cast<const char*>(key)) == "+");
1464 xmlFree(key);
1465
1466 key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type"));
1467 std::string selRestriction = reinterpret_cast<const char*>(key);
1468 xmlFree(key);
1469
1470 return selrestr(selRestriction, selPos);
1471 } else {
1472 throw std::logic_error("Badly formatted selrestr");
1473 }
1474 }
1475
1476 };
1477};

Removed from the old generator.cpp (old lines 2249-2320):

2249
2250 sqlite3_finalize(ppstmt);
2251 }
2252 }
2253 }
2254 }
2255
2256 // syntax table
2257 {
2258 std::ifstream wnsyntaxfile(wnpref + "wn_syntax.pl");
2259 if (!wnsyntaxfile.is_open())
2260 {
2261 std::cout << "Invalid WordNet data directory." << std::endl;
2262 print_usage();
2263 }
2264
2265 std::list<std::string> lines;
2266 for (;;)
2267 {
2268 std::string line;
2269 if (!getline(wnsyntaxfile, line))
2270 {
2271 break;
2272 }
2273
2274 if (line.back() == '\r')
2275 {
2276 line.pop_back();
2277 }
2278
2279 lines.push_back(line);
2280 }
2281
2282 progress ppgs("Writing adjective syntax markers...", lines.size());
2283 for (auto line : lines)
2284 {
2285 ppgs.update();
2286
2287 std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\.");
2288 std::smatch relation_data;
2289 if (!std::regex_search(line, relation_data, relation))
2290 {
2291 continue;
2292 }
2293
2294 int synset_id = stoi(relation_data[1]);
2295 int wnum = stoi(relation_data[2]);
2296 std::string syn = relation_data[3];
2297 std::string query("UPDATE adjectives SET position = ? WHERE adjective_id = ?");
2298
2299 sqlite3_stmt* ppstmt;
2300 if (sqlite3_prepare_v2(ppdb, query.c_str(), query.size(), &ppstmt, NULL) != SQLITE_OK)
2301 {
2302 db_error(ppdb, query);
2303 }
2304
2305 sqlite3_bind_text(ppstmt, 1, syn.c_str(), 1, SQLITE_TRANSIENT);
2306 sqlite3_bind_int(ppstmt, 2, wn[synset_id][wnum]);
2307
2308 if (sqlite3_step(ppstmt) != SQLITE_DONE)
2309 {
2310 db_error(ppdb, query);
2311 }
2312
2313 sqlite3_finalize(ppstmt);
2314 }
2315 }
2316
2317 sqlite3_close_v2(ppdb);
2318
2319 std::cout << "Done." << std::endl;
2320}
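
For reference, the parseSelrestr routine above maps VerbNet's SELRESTRS/SELRESTR markup onto the selrestr tree type declared in selrestr.h. A minimal sketch of the correspondence; the element shape follows the code above, but the specific restriction names are only illustrative:

  // Illustrative only: given a node of the shape
  //   <SELRESTRS logic="or">
  //     <SELRESTR Value="+" type="animate"/>
  //     <SELRESTR Value="-" type="machine"/>
  //   </SELRESTRS>
  // parseSelrestr returns the equivalent of
  selrestr example(
    std::list<selrestr>({
      selrestr("animate", true),
      selrestr("machine", false)}),
    true);
  // and example.toJson() then serializes to roughly
  //   {"children":[{"pos":true,"type":"animate"},{"pos":false,"type":"machine"}],"logic":"or"}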
diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..e2a7404 --- /dev/null +++ b/generator/generator.h
@@ -0,0 +1,151 @@
1#ifndef GENERATOR_H_5B61CBC5
2#define GENERATOR_H_5B61CBC5
3
4#include <string>
5#include <map>
6#include <list>
7#include <set>
8#include <libxml/parser.h>
9#include "database.h"
10#include "notion.h"
11#include "word.h"
12#include "lemma.h"
13#include "form.h"
14#include "pronunciation.h"
15#include "group.h"
16#include "frame.h"
17
18namespace verbly {
19 namespace generator {
20
21 enum class part_of_speech;
22 class selrestr;
23
24 class generator {
25 public:
26
27 // Constructor
28
29 generator(
30 std::string verbNetPath,
31 std::string agidPath,
32 std::string wordNetPath,
33 std::string cmudictPath,
34 std::string imageNetPath,
35 std::string outputPath);
36
37 // Action
38
39 void run();
40
41 private:
42
43 // Subroutines
44
45 void readWordNetSynsets();
46
47 void readAdjectivePositioning();
48
49 void readImageNetUrls();
50
51 void readWordNetSenseKeys();
52
53 void readVerbNet();
54
55 void readAgidInflections();
56
57 void readPrepositions();
58
59 void readCmudictPronunciations();
60
61 void writeSchema();
62
63 void dumpObjects();
64
65 void readWordNetAntonymy();
66
67 void readWordNetVariation();
68
69 void readWordNetClasses();
70
71 void readWordNetCausality();
72
73 void readWordNetEntailment();
74
75 void readWordNetHypernymy();
76
77 void readWordNetInstantiation();
78
79 void readWordNetMemberMeronymy();
80
81 void readWordNetPartMeronymy();
82
83 void readWordNetSubstanceMeronymy();
84
85 void readWordNetPertainymy();
86
87 void readWordNetSpecification();
88
89 void readWordNetSimilarity();
90
91 // Helpers
92
93 std::list<std::string> readFile(std::string path);
94
95 inline part_of_speech partOfSpeechByWnid(int wnid);
96
97 notion& createNotion(part_of_speech partOfSpeech);
98
99 notion& lookupOrCreateNotion(int wnid);
100
101 lemma& lookupOrCreateLemma(std::string base_form);
102
103 form& lookupOrCreateForm(std::string text);
104
105 template <typename... Args> word& createWord(Args&&... args);
106
107 group& createGroup(xmlNodePtr top);
108
109 selrestr parseSelrestr(xmlNodePtr top);
110
111 // Input
112
113 std::string verbNetPath_;
114 std::string agidPath_;
115 std::string wordNetPath_;
116 std::string cmudictPath_;
117 std::string imageNetPath_;
118
119 // Output
120
121 database db_;
122
123 // Data
124
125 std::list<notion> notions_;
126 std::list<word> words_;
127 std::list<lemma> lemmas_;
128 std::list<form> forms_;
129 std::list<pronunciation> pronunciations_;
130 std::list<frame> frames_;
131 std::list<group> groups_;
132
133 // Indexes
134
135 std::map<int, notion*> notionByWnid_;
136 std::map<int, std::set<word*>> wordsByWnid_;
137 std::map<std::pair<int, int>, word*> wordByWnidAndWnum_;
138 std::map<std::string, std::set<word*>> wordsByBaseForm_;
139 std::map<std::string, lemma*> lemmaByBaseForm_;
140 std::map<std::string, form*> formByText_;
141
142 // Caches
143
144 std::map<std::string, word*> wnSenseKeys_;
145
146 };
147
148 };
149};
150
151#endif /* end of include guard: GENERATOR_H_5B61CBC5 */
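
The lookupOrCreate* helpers declared above pair each object list with one of the index maps further down. As a plausible shape only (the actual bodies live in generator.cpp and may differ), lookupOrCreateForm could consult formByText_ like this:

  form& generator::lookupOrCreateForm(std::string text)
  {
    if (!formByText_.count(text))
    {
      forms_.emplace_back(text);           // assumes form is constructible from its text
      formByText_[text] = &forms_.back();  // pointers into a std::list stay valid on append
    }

    return *formByText_[text];
  }

Using std::list for the object stores is what keeps the raw pointers held by these index maps stable as new entries are appended.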
diff --git a/generator/group.cpp b/generator/group.cpp new file mode 100644 index 0000000..7cbd4c8 --- /dev/null +++ b/generator/group.cpp
@@ -0,0 +1,119 @@
1#include "group.h"
2#include <stdexcept>
3#include <list>
4#include <json.hpp>
5#include "database.h"
6#include "field.h"
7#include "frame.h"
8
9namespace verbly {
10 namespace generator {
11
12 int group::nextId_ = 0;
13
14 group::group() : id_(nextId_++)
15 {
16 }
17
18 void group::setParent(const group& parent)
19 {
20 // Adding a group to itself is nonsensical.
21 assert(&parent != this);
22
23 parent_ = &parent;
24 }
25
26 void group::addRole(std::string name, role r)
27 {
28 roleNames_.insert(name);
29 roles_[name] = std::move(r);
30 }
31
32 void group::addFrame(const frame& f)
33 {
34 frames_.insert(&f);
35 }
36
37 std::set<std::string> group::getRoles() const
38 {
39 std::set<std::string> fullRoles = roleNames_;
40
41 if (hasParent())
42 {
43 for (std::string name : getParent().getRoles())
44 {
45 fullRoles.insert(name);
46 }
47 }
48
49 return fullRoles;
50 }
51
52 const role& group::getRole(std::string name) const
53 {
54 if (roles_.count(name))
55 {
56 return roles_.at(name);
57 } else if (hasParent())
58 {
59 return getParent().getRole(name);
60 } else {
61 throw std::invalid_argument("Specified role not found in verb group");
62 }
63 }
64
65 std::set<const frame*> group::getFrames() const
66 {
67 std::set<const frame*> fullFrames = frames_;
68
69 if (hasParent())
70 {
71 for (const frame* f : getParent().getFrames())
72 {
73 fullFrames.insert(f);
74 }
75 }
76
77 return fullFrames;
78 }
79
80 database& operator<<(database& db, const group& arg)
81 {
82 // Serialize the group first
83 {
84 std::list<field> fields;
85 fields.emplace_back("group_id", arg.getId());
86
87 nlohmann::json jsonRoles;
88 for (std::string name : arg.getRoles())
89 {
90 const role& r = arg.getRole(name);
91
92 nlohmann::json jsonRole;
93 jsonRole["type"] = name;
94 jsonRole["selrestrs"] = r.getSelrestrs().toJson();
95
96 jsonRoles.emplace_back(std::move(jsonRole));
97 }
98
99 fields.emplace_back("data", jsonRoles.dump());
100
101 db.insertIntoTable("groups", std::move(fields));
102 }
103
104 // Then, serialize the group/frame relationship
105 for (const frame* f : arg.getFrames())
106 {
107 std::list<field> fields;
108
109 fields.emplace_back("group_id", arg.getId());
110 fields.emplace_back("frame_id", f->getId());
111
112 db.insertIntoTable("groups_frames", std::move(fields));
113 }
114
115 return db;
116 }
117
118 };
119};
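
A sketch of the payload this writes into the groups table's data column, for a hypothetical group with a single Agent role restricted to animate noun phrases (role name and restriction are illustrative):

  // Illustrative only; mirrors the serialization loop above.
  nlohmann::json jsonRole;
  jsonRole["type"] = "Agent";
  jsonRole["selrestrs"] = selrestr("animate", true).toJson();

  nlohmann::json jsonRoles;
  jsonRoles.emplace_back(std::move(jsonRole));
  // jsonRoles.dump() == roughly [{"selrestrs":{"pos":true,"type":"animate"},"type":"Agent"}]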
diff --git a/generator/group.h b/generator/group.h new file mode 100644 index 0000000..efb8c5d --- /dev/null +++ b/generator/group.h
@@ -0,0 +1,80 @@
1#ifndef GROUP_H_EDAFB5DC
2#define GROUP_H_EDAFB5DC
3
4#include <map>
5#include <set>
6#include <string>
7#include <cassert>
8#include "role.h"
9
10namespace verbly {
11 namespace generator {
12
13 class frame;
14 class database;
15
16 class group {
17 public:
18
19 // Constructor
20
21 group();
22
23 // Mutators
24
25 void setParent(const group& parent);
26
27 void addRole(std::string name, role r);
28
29 void addFrame(const frame& f);
30
31 // Accessors
32
33 int getId() const
34 {
35 return id_;
36 }
37
38 bool hasParent() const
39 {
40 return (parent_ != nullptr);
41 }
42
43 const group& getParent() const
44 {
45 // Calling code should always call hasParent first
46 assert(parent_ != nullptr);
47
48 return *parent_;
49 }
50
51 std::set<std::string> getRoles() const;
52
53 const role& getRole(std::string name) const;
54
55 std::set<const frame*> getFrames() const;
56
57 private:
58
59 static int nextId_;
60
61 const int id_;
62
63 const group* parent_ = nullptr;
64 std::map<std::string, role> roles_;
65 std::set<const frame*> frames_;
66
67 // Caches
68
69 std::set<std::string> roleNames_;
70
71 };
72
73 // Serializer
74
75 database& operator<<(database& db, const group& arg);
76
77 };
78};
79
80#endif /* end of include guard: GROUP_H_EDAFB5DC */
diff --git a/generator/lemma.cpp b/generator/lemma.cpp new file mode 100644 index 0000000..e66b153 --- /dev/null +++ b/generator/lemma.cpp
@@ -0,0 +1,65 @@
1#include "lemma.h"
2#include <list>
3#include <cassert>
4#include "field.h"
5#include "database.h"
6#include "form.h"
7
8namespace verbly {
9 namespace generator {
10
11 int lemma::nextId_ = 0;
12
13 lemma::lemma(const form& baseForm) :
14 id_(nextId_++),
15 baseForm_(baseForm)
16 {
17 inflections_[inflection::base] = {&baseForm};
18 }
19
20 void lemma::addInflection(inflection type, const form& f)
21 {
22 // There can only be one base form.
23 assert(type != inflection::base);
24
25 inflections_[type].insert(&f);
26 }
27
28 std::set<const form*> lemma::getInflections(inflection type) const
29 {
30 if (inflections_.count(type))
31 {
32 return inflections_.at(type);
33 } else {
34 return {};
35 }
36 }
37
38 database& operator<<(database& db, const lemma& arg)
39 {
40 for (inflection type : {
41 inflection::base,
42 inflection::plural,
43 inflection::comparative,
44 inflection::superlative,
45 inflection::past_tense,
46 inflection::past_participle,
47 inflection::ing_form,
48 inflection::s_form})
49 {
50 for (const form* f : arg.getInflections(type))
51 {
52 std::list<field> fields;
53 fields.emplace_back("lemma_id", arg.getId());
54 fields.emplace_back("form_id", f->getId());
55 fields.emplace_back("category", static_cast<int>(type));
56
57 db.insertIntoTable("lemmas_forms", std::move(fields));
58 }
59 }
60
61 return db;
62 }
63
64 };
65};
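
A usage sketch, assuming form objects for "run" and "ran" were already interned elsewhere in the pipeline (form construction is not shown in this file, and db is the open database object):

  lemma runLemma(runForm);                                  // runForm: hypothetical form for "run"
  runLemma.addInflection(inflection::past_tense, ranForm);  // ranForm: hypothetical form for "ran"

  db << runLemma;  // emits one lemmas_forms row per (inflection, form) pair,
                   // including the implicit base-form entry added by the constructor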
diff --git a/generator/lemma.h b/generator/lemma.h new file mode 100644 index 0000000..6452e08 --- /dev/null +++ b/generator/lemma.h
@@ -0,0 +1,58 @@
1#ifndef LEMMA_H_D73105A7
2#define LEMMA_H_D73105A7
3
4#include <string>
5#include <map>
6#include <set>
7#include "enums.h"
8
9namespace verbly {
10 namespace generator {
11
12 class database;
13 class form;
14
15 class lemma {
16 public:
17
18 // Constructors
19
20 explicit lemma(const form& baseForm);
21
22 // Mutators
23
24 void addInflection(inflection type, const form& f);
25
26 // Accessors
27
28 int getId() const
29 {
30 return id_;
31 }
32
33 const form& getBaseForm() const
34 {
35 return baseForm_;
36 }
37
38 std::set<const form*> getInflections(inflection type) const;
39
40 private:
41
42 static int nextId_;
43
44 const int id_;
45 const form& baseForm_;
46
47 std::map<inflection, std::set<const form*>> inflections_;
48
49 };
50
51 // Serializer
52
53 database& operator<<(database& db, const lemma& arg);
54
55 };
56};
57
58#endif /* end of include guard: LEMMA_H_D73105A7 */
diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..827c963 --- /dev/null +++ b/generator/main.cpp
@@ -0,0 +1,40 @@
1#include <iostream>
2#include <exception>
3#include "generator.h"
4
5void printUsage()
6{
7 std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output" << std::endl;
8 std::cout << "verbnet :: path to a VerbNet data directory" << std::endl;
9 std::cout << "agid :: path to an AGID infl.txt file" << std::endl;
10 std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl;
11 std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl;
12 std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl;
13 std::cout << "output :: datafile output path" << std::endl;
14}
15
16int main(int argc, char** argv)
17{
18 if (argc == 7)
19 {
20 try
21 {
22 verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]);
23
24 try
25 {
26 app.run();
27 } catch (const std::exception& e)
28 {
29 std::cout << e.what() << std::endl;
30 }
31 } catch (const std::exception& e)
32 {
33 std::cout << e.what() << std::endl;
34 printUsage();
35 }
36 } else {
37 std::cout << "verbly datafile generator" << std::endl;
38 printUsage();
39 }
40}
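
For example, an invocation would look something like the following (all paths are placeholders for wherever the datasets happen to be installed):

  generator ./verbnet ./agid/infl.txt ./wordnet/prolog ./cmudict.dict ./imagenet-urls.txt verbly.dat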
diff --git a/generator/notion.cpp b/generator/notion.cpp new file mode 100644 index 0000000..290d982 --- /dev/null +++ b/generator/notion.cpp
@@ -0,0 +1,85 @@
1#include "notion.h"
2#include <string>
3#include <list>
4#include "database.h"
5#include "field.h"
6
7namespace verbly {
8 namespace generator {
9
10 int notion::nextId_ = 0;
11
12 notion::notion(
13 part_of_speech partOfSpeech) :
14 id_(nextId_++),
15 partOfSpeech_(partOfSpeech)
16 {
17 }
18
19 notion::notion(
20 part_of_speech partOfSpeech,
21 int wnid) :
22 id_(nextId_++),
23 partOfSpeech_(partOfSpeech),
24 wnid_(wnid),
25 hasWnid_(true)
26 {
27 }
28
29 void notion::incrementNumOfImages()
30 {
31 // Calling code should always call hasWnid and check that the notion is a noun first.
32 assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun));
33
34 numOfImages_++;
35 }
36
37 void notion::setPrepositionGroups(std::list<std::string> groups)
38 {
39 // Calling code should always check that the notion is a preposition first.
40 assert(partOfSpeech_ == part_of_speech::preposition);
41
42 prepositionGroups_ = groups;
43 }
44
45 database& operator<<(database& db, const notion& arg)
46 {
47 // First, serialize the notion
48 {
49 std::list<field> fields;
50
51 fields.emplace_back("notion_id", arg.getId());
52 fields.emplace_back("part_of_speech", static_cast<int>(arg.getPartOfSpeech()));
53
54 if (arg.hasWnid())
55 {
56 fields.emplace_back("wnid", arg.getWnid());
57
58 if (arg.getPartOfSpeech() == part_of_speech::noun)
59 {
60 fields.emplace_back("images", arg.getNumOfImages());
61 }
62 }
63
64 db.insertIntoTable("notions", std::move(fields));
65 }
66
67 // Next, serialize the is_a relationship if this is a preposition
68 if (arg.getPartOfSpeech() == part_of_speech::preposition)
69 {
70 for (std::string group : arg.getPrepositionGroups())
71 {
72 std::list<field> fields;
73
74 fields.emplace_back("notion_id", arg.getId());
75 fields.emplace_back("groupname", group);
76
77 db.insertIntoTable("is_a", std::move(fields));
78 }
79 }
80
81 return db;
82 }
83
84 };
85};
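
A usage sketch for the preposition path; the group names here are illustrative, since the real ones come from the prepositions file read by the generator, and db stands for the open database object:

  notion prepNotion(part_of_speech::preposition);
  prepNotion.setPrepositionGroups({"spatial", "temporal"});

  db << prepNotion;  // one row in notions, plus one is_a row per group name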
diff --git a/generator/notion.h b/generator/notion.h new file mode 100644 index 0000000..76210de --- /dev/null +++ b/generator/notion.h
@@ -0,0 +1,91 @@
1#ifndef NOTION_H_221DE2BC
2#define NOTION_H_221DE2BC
3
4#include <cassert>
5#include <list>
6#include <string>
7#include "enums.h"
8
9namespace verbly {
10 namespace generator {
11
12 class database;
13
14 class notion {
15 public:
16
17 // Constructors
18
19 explicit notion(part_of_speech partOfSpeech);
20
21 notion(part_of_speech partOfSpeech, int wnid);
22
23 // Mutators
24
25 void incrementNumOfImages();
26
27 void setPrepositionGroups(std::list<std::string> groups);
28
29 // Accessors
30
31 int getId() const
32 {
33 return id_;
34 }
35
36 part_of_speech getPartOfSpeech() const
37 {
38 return partOfSpeech_;
39 }
40
41 bool hasWnid() const
42 {
43 return hasWnid_;
44 }
45
46 int getWnid() const
47 {
48 // Calling code should always call hasWnid first.
49 assert(hasWnid_);
50
51 return wnid_;
52 }
53
54 int getNumOfImages() const
55 {
56 // Calling code should always call hasWnid and check that the notion is a noun first.
57 assert(hasWnid_ && (partOfSpeech_ == part_of_speech::noun));
58
59 return numOfImages_;
60 }
61
62 std::list<std::string> getPrepositionGroups() const
63 {
64 // Calling code should always check that the notion is a preposition first.
65 assert(partOfSpeech_ == part_of_speech::preposition);
66
67 return prepositionGroups_;
68 }
69
70 private:
71
72 static int nextId_;
73
74 const int id_;
75 const part_of_speech partOfSpeech_;
76 const int wnid_ = 0;
77 const bool hasWnid_ = false;
78
79 int numOfImages_ = 0;
80 std::list<std::string> prepositionGroups_;
81
82 };
83
84 // Serializer
85
86 database& operator<<(database& db, const notion& arg);
87
88 };
89};
90
91#endif /* end of include guard: NOTION_H_221DE2BC */
diff --git a/generator/part.cpp b/generator/part.cpp new file mode 100644 index 0000000..dbd4e11 --- /dev/null +++ b/generator/part.cpp
@@ -0,0 +1,336 @@
1#include "part.h"
2#include <stdexcept>
3#include "selrestr.h"
4
5namespace verbly {
6 namespace generator {
7
8 part part::createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs)
9 {
10 part p(type::noun_phrase);
11
12 new(&p.noun_phrase_.role) std::string(std::move(role));
13 new(&p.noun_phrase_.selrestrs) selrestr(std::move(selrestrs));
14 new(&p.noun_phrase_.synrestrs) std::set<std::string>(std::move(synrestrs));
15
16 return p;
17 }
18
19 part part::createVerb()
20 {
21 return part(type::verb);
22 }
23
24 part part::createPreposition(std::set<std::string> choices, bool literal)
25 {
26 part p(type::preposition);
27
28 new(&p.preposition_.choices) std::set<std::string>(std::move(choices));
29 p.preposition_.literal = literal;
30
31 return p;
32 }
33
34 part part::createAdjective()
35 {
36 return part(type::adjective);
37 }
38
39 part part::createAdverb()
40 {
41 return part(type::adverb);
42 }
43
44 part part::createLiteral(std::string value)
45 {
46 part p(type::literal);
47
48 new(&p.literal_) std::string(std::move(value));
49
50 return p;
51 }
52
53 part::part(const part& other)
54 {
55 type_ = other.type_;
56
57 switch (type_)
58 {
59 case type::noun_phrase:
60 {
61 new(&noun_phrase_.role) std::string(other.noun_phrase_.role);
62 new(&noun_phrase_.selrestrs) selrestr(other.noun_phrase_.selrestrs);
63 new(&noun_phrase_.synrestrs) std::set<std::string>(other.noun_phrase_.synrestrs);
64
65 break;
66 }
67
68 case type::preposition:
69 {
70 new(&preposition_.choices) std::set<std::string>(other.preposition_.choices);
71 preposition_.literal = other.preposition_.literal;
72
73 break;
74 }
75
76 case type::literal:
77 {
78 new(&literal_) std::string(other.literal_);
79
80 break;
81 }
82
83 case type::verb:
84 case type::adjective:
85 case type::adverb:
86 case type::invalid:
87 {
88 break;
89 }
90 }
91 }
92
93 part::part(part&& other) : part()
94 {
95 swap(*this, other);
96 }
97
98 part& part::operator=(part other)
99 {
100 swap(*this, other);
101
102 return *this;
103 }
104
105 void swap(part& first, part& second)
106 {
107 using type = part::type;
108
109 type tempType = first.type_;
110 std::string tempRole;
111 selrestr tempSelrestrs;
112 std::set<std::string> tempSynrestrs;
113 std::set<std::string> tempChoices;
114 bool tempPrepLiteral;
115 std::string tempLiteralValue;
116
117 switch (tempType)
118 {
119 case type::noun_phrase:
120 {
121 tempRole = std::move(first.noun_phrase_.role);
122 tempSelrestrs = std::move(first.noun_phrase_.selrestrs);
123 tempSynrestrs = std::move(first.noun_phrase_.synrestrs);
124
125 break;
126 }
127
128 case type::preposition:
129 {
130 tempChoices = std::move(first.preposition_.choices);
131 tempPrepLiteral = first.preposition_.literal;
132
133 break;
134 }
135
136 case type::literal:
137 {
138 tempLiteralValue = std::move(first.literal_);
139
140 break;
141 }
142
143 case type::verb:
144 case type::adjective:
145 case type::adverb:
146 case type::invalid:
147 {
148 break;
149 }
150 }
151
152 first.~part();
153
154 first.type_ = second.type_;
155
156 switch (first.type_)
157 {
158 case type::noun_phrase:
159 {
160 new(&first.noun_phrase_.role) std::string(std::move(second.noun_phrase_.role));
161 new(&first.noun_phrase_.selrestrs) selrestr(std::move(second.noun_phrase_.selrestrs));
162 new(&first.noun_phrase_.synrestrs) std::set<std::string>(std::move(second.noun_phrase_.synrestrs));
163
164 break;
165 }
166
167 case type::preposition:
168 {
169 new(&first.preposition_.choices) std::set<std::string>(std::move(second.preposition_.choices));
170 first.preposition_.literal = second.preposition_.literal;
171
172 break;
173 }
174
175 case type::literal:
176 {
177 new(&first.literal_) std::string(std::move(second.literal_));
178
179 break;
180 }
181
182 case type::verb:
183 case type::adjective:
184 case type::adverb:
185 case type::invalid:
186 {
187 break;
188 }
189 }
190
191 second.~part();
192
193 second.type_ = tempType;
194
195 switch (second.type_)
196 {
197 case type::noun_phrase:
198 {
199 new(&second.noun_phrase_.role) std::string(std::move(tempRole));
200 new(&second.noun_phrase_.selrestrs) selrestr(std::move(tempSelrestrs));
201 new(&second.noun_phrase_.synrestrs) std::set<std::string>(std::move(tempSynrestrs));
202
203 break;
204 }
205
206 case type::preposition:
207 {
208 new(&second.preposition_.choices) std::set<std::string>(std::move(tempChoices));
209 second.preposition_.literal = tempPrepLiteral;
210
211 break;
212 }
213
214 case type::literal:
215 {
216 new(&second.literal_) std::string(std::move(tempLiteralValue));
217
218 break;
219 }
220
221 case type::verb:
222 case type::adjective:
223 case type::adverb:
224 case type::invalid:
225 {
226 break;
227 }
228 }
229 }
230
231 part::~part()
232 {
233 switch (type_)
234 {
235 case type::noun_phrase:
236 {
237 using string_type = std::string;
238 using set_type = std::set<std::string>;
239
240 noun_phrase_.role.~string_type();
241 noun_phrase_.selrestrs.~selrestr();
242 noun_phrase_.synrestrs.~set_type();
243
244 break;
245 }
246
247 case type::preposition:
248 {
249 using set_type = std::set<std::string>;
250
251 preposition_.choices.~set_type();
252
253 break;
254 }
255
256 case type::literal:
257 {
258 using string_type = std::string;
259
260 literal_.~string_type();
261
262 break;
263 }
264
265 case type::verb:
266 case type::adjective:
267 case type::adverb:
268 case type::invalid:
269 {
270 break;
271 }
272 }
273 }
274
275 std::string part::getNounRole() const
276 {
277 if (type_ == type::noun_phrase)
278 {
279 return noun_phrase_.role;
280 } else {
281 throw std::domain_error("part::getNounRole is only valid for noun phrase parts");
282 }
283 }
284
285 selrestr part::getNounSelrestrs() const
286 {
287 if (type_ == type::noun_phrase)
288 {
289 return noun_phrase_.selrestrs;
290 } else {
291 throw std::domain_error("part::getNounSelrestrs is only valid for noun phrase parts");
292 }
293 }
294
295 std::set<std::string> part::getNounSynrestrs() const
296 {
297 if (type_ == type::noun_phrase)
298 {
299 return noun_phrase_.synrestrs;
300 } else {
301 throw std::domain_error("part::getNounSynrestrs is only valid for noun phrase parts");
302 }
303 }
304
305 std::set<std::string> part::getPrepositionChoices() const
306 {
307 if (type_ == type::preposition)
308 {
309 return preposition_.choices;
310 } else {
311 throw std::domain_error("part::getPrepositionChoices is only valid for preposition parts");
312 }
313 }
314
315 bool part::isPrepositionLiteral() const
316 {
317 if (type_ == type::preposition)
318 {
319 return preposition_.literal;
320 } else {
321 throw std::domain_error("part::isPrepositionLiteral is only valid for preposition parts");
322 }
323 }
324
325 std::string part::getLiteralValue() const
326 {
327 if (type_ == type::literal)
328 {
329 return literal_;
330 } else {
331 throw std::domain_error("part::getLiteralValue is only valid for literal parts");
332 }
333 }
334
335 };
336};
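
Because part is a tagged union, all construction goes through the static factories above. A short usage sketch; the role, restriction, synrestr, and preposition choices are illustrative:

  part np = part::createNounPhrase(
    "Agent", selrestr("animate", true), {"definite"});
  part prep = part::createPreposition({"to", "towards"}, true);

  np.getType();                 // part::type::noun_phrase
  np.getNounRole();             // "Agent"
  prep.isPrepositionLiteral();  // true
  prep.getPrepositionChoices(); // {"to", "towards"}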
diff --git a/generator/part.h b/generator/part.h new file mode 100644 index 0000000..d044630 --- /dev/null +++ b/generator/part.h
@@ -0,0 +1,114 @@
1#ifndef PART_H_FB54F361
2#define PART_H_FB54F361
3
4#include <string>
5#include <set>
6#include "selrestr.h"
7
8namespace verbly {
9 namespace generator {
10
11 class part {
12 public:
13 enum class type {
14 invalid = -1,
15 noun_phrase = 0,
16 verb = 1,
17 preposition = 2,
18 adjective = 3,
19 adverb = 4,
20 literal = 5
21 };
22
23 // Static factories
24
25 static part createNounPhrase(std::string role, selrestr selrestrs, std::set<std::string> synrestrs);
26
27 static part createVerb();
28
29 static part createPreposition(std::set<std::string> choices, bool literal);
30
31 static part createAdjective();
32
33 static part createAdverb();
34
35 static part createLiteral(std::string value);
36
37 // Copy and move constructors
38
39 part(const part& other);
40
41 part(part&& other);
42
43 // Assignment
44
45 part& operator=(part other);
46
47 // Swap
48
49 friend void swap(part& first, part& second);
50
51 // Destructor
52
53 ~part();
54
55 // General accessors
56
57 type getType() const
58 {
59 return type_;
60 }
61
62 // Noun phrase accessors
63
64 std::string getNounRole() const;
65
66 selrestr getNounSelrestrs() const;
67
68 std::set<std::string> getNounSynrestrs() const;
69
70 // Preposition accessors
71
72 std::set<std::string> getPrepositionChoices() const;
73
74 bool isPrepositionLiteral() const;
75
76 // Literal accessors
77
78 std::string getLiteralValue() const;
79
80 private:
81
82 // Private constructors
83
84 part()
85 {
86 }
87
88 part(type t) : type_(t)
89 {
90 }
91
92 // Data
93
94 union {
95 struct {
96 std::string role;
97 selrestr selrestrs;
98 std::set<std::string> synrestrs;
99 } noun_phrase_;
100 struct {
101 std::set<std::string> choices;
102 bool literal;
103 } preposition_;
104 std::string literal_;
105 };
106
107 type type_ = type::invalid;
108
109 };
110
111 };
112};
113
114#endif /* end of include guard: PART_H_FB54F361 */
diff --git a/generator/progress.h b/generator/progress.h index 81f07a3..fcb680d 100644 --- a/generator/progress.h +++ b/generator/progress.h
@@ -3,48 +3,54 @@
3 3
4#include <string> 4#include <string>
5 5
6class progress { 6namespace verbly {
7 private: 7 namespace generator {
8 std::string message;
9 int total;
10 int cur = 0;
11 int lprint = 0;
12 8
13 public: 9 class progress {
14 progress(std::string message, int total) : message(message), total(total) 10 private:
15 { 11 std::string message;
16 std::cout << message << " 0%" << std::flush; 12 int total;
17 } 13 int cur = 0;
14 int lprint = 0;
18 15
19 void update(int val) 16 public:
20 { 17 progress(std::string message, int total) : message(message), total(total)
21 if (val <= total) 18 {
22 { 19 std::cout << message << " 0%" << std::flush;
23 cur = val; 20 }
24 } else { 21
25 cur = total; 22 void update(int val)
26 } 23 {
24 if (val <= total)
25 {
26 cur = val;
27 } else {
28 cur = total;
29 }
27 30
28 int pp = cur * 100 / total; 31 int pp = cur * 100 / total;
29 if (pp != lprint) 32 if (pp != lprint)
30 { 33 {
31 lprint = pp; 34 lprint = pp;
32 35
33 std::cout << "\b\b\b\b" << std::right; 36 std::cout << "\b\b\b\b" << std::right;
34 std::cout.width(3); 37 std::cout.width(3);
35 std::cout << pp << "%" << std::flush; 38 std::cout << pp << "%" << std::flush;
36 } 39 }
37 } 40 }
41
42 void update()
43 {
44 update(cur+1);
45 }
38 46
39 void update() 47 ~progress()
40 { 48 {
41 update(cur+1); 49 std::cout << "\b\b\b\b100%" << std::endl;
42 } 50 }
51 };
43 52
44 ~progress() 53 };
45 {
46 std::cout << "\b\b\b\b100%" << std::endl;
47 }
48}; 54};
49 55
50#endif /* end of include guard: PROGRESS_H_A34EF856 */ 56#endif /* end of include guard: PROGRESS_H_A34EF856 */
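
Usage is unchanged by the namespace move; a minimal sketch, given some container of work items called lines:

  verbly::generator::progress ppgs("Processing lines...", lines.size());
  for (const std::string& line : lines)
  {
    ppgs.update();  // advances the counter and reprints the percentage when it changes
    // ... handle line ...
  }
  // the destructor prints the final "100%" and ends the line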
diff --git a/generator/pronunciation.cpp b/generator/pronunciation.cpp new file mode 100644 index 0000000..eb07607 --- /dev/null +++ b/generator/pronunciation.cpp
@@ -0,0 +1,87 @@
1#include "pronunciation.h"
2#include <list>
3#include <algorithm>
4#include <cctype>
5#include <iterator>
6#include "database.h"
7#include "field.h"
8#include "../lib/util.h"
9
10namespace verbly {
11 namespace generator {
12
13 int pronunciation::nextId_ = 0;
14
15 pronunciation::pronunciation(std::string phonemes) :
16 id_(nextId_++),
17 phonemes_(phonemes)
18 {
19 auto phonemeList = split<std::list<std::string>>(phonemes, " ");
20
21 auto rhymeStart = std::find_if(std::begin(phonemeList), std::end(phonemeList), [] (std::string phoneme) {
22 return phoneme.find("1") != std::string::npos;
23 });
24
25 // Rhyme detection
26 if (rhymeStart != std::end(phonemeList))
27 {
28 std::list<std::string> rhymePhonemes;
29
30 std::transform(rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), [] (std::string phoneme) {
31 std::string naked;
32
33 std::remove_copy_if(std::begin(phoneme), std::end(phoneme), std::back_inserter(naked), [] (char ch) {
34 return std::isdigit(ch);
35 });
36
37 return naked;
38 });
39
40 rhyme_ = implode(std::begin(rhymePhonemes), std::end(rhymePhonemes), " ");
41
42 if (rhymeStart != std::begin(phonemeList))
43 {
44 prerhyme_ = *std::prev(rhymeStart);
45 }
46 }
47
48 // Syllable/stress
49 for (std::string phoneme : phonemeList)
50 {
51 if (std::isdigit(phoneme.back()))
52 {
53 // It's a vowel!
54 syllables_++;
55
56 if (phoneme.back() == '1')
57 {
58 stress_.push_back('1');
59 } else {
60 stress_.push_back('0');
61 }
62 }
63 }
64 }
65
66 database& operator<<(database& db, const pronunciation& arg)
67 {
68 std::list<field> fields;
69
70 fields.emplace_back("pronunciation_id", arg.getId());
71 fields.emplace_back("phonemes", arg.getPhonemes());
72 fields.emplace_back("syllables", arg.getSyllables());
73 fields.emplace_back("stress", arg.getStress());
74
75 if (arg.hasRhyme())
76 {
77 fields.emplace_back("rhyme", arg.getRhymePhonemes());
78 fields.emplace_back("prerhyme", arg.getPrerhyme());
79 }
80
81 db.insertIntoTable("pronunciations", std::move(fields));
82
83 return db;
84 }
85
86 };
87};
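
A worked example using the CMUdict entry for "computer"; the expected values follow directly from the code above:

  pronunciation p("K AH0 M P Y UW1 T ER0");
  // p.getSyllables()     == 3          (three vowel phonemes: AH0, UW1, ER0)
  // p.getStress()        == "010"      (only UW1 carries primary stress)
  // p.getRhymePhonemes() == "UW T ER"  (from the stressed vowel onward, stress digits stripped)
  // p.getPrerhyme()      == "Y"        (the phoneme immediately before the rhyme)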
diff --git a/generator/pronunciation.h b/generator/pronunciation.h new file mode 100644 index 0000000..81be6c4 --- /dev/null +++ b/generator/pronunciation.h
@@ -0,0 +1,82 @@
1#ifndef PRONUNCIATION_H_584A08DD
2#define PRONUNCIATION_H_584A08DD
3
4#include <string>
5#include <cassert>
6
7namespace verbly {
8 namespace generator {
9
10 class database;
11
12 class pronunciation {
13 public:
14
15 // Constructor
16
17 explicit pronunciation(std::string phonemes);
18
19 // Accessors
20
21 int getId() const
22 {
23 return id_;
24 }
25
26 std::string getPhonemes() const
27 {
28 return phonemes_;
29 }
30
31 bool hasRhyme() const
32 {
33 return !rhyme_.empty();
34 }
35
36 std::string getRhymePhonemes() const
37 {
38 // Calling code should always call hasRhyme first.
39 assert(!rhyme_.empty());
40
41 return rhyme_;
42 }
43
44 std::string getPrerhyme() const
45 {
46 // Calling code should always call hasRhyme first.
47 assert(!rhyme_.empty());
48
49 return prerhyme_;
50 }
51
52 int getSyllables() const
53 {
54 return syllables_;
55 }
56
57 std::string getStress() const
58 {
59 return stress_;
60 }
61
62 private:
63
64 static int nextId_;
65
66 const int id_;
67 const std::string phonemes_;
68 std::string rhyme_;
69 std::string prerhyme_;
70 int syllables_ = 0;
71 std::string stress_;
72
73 };
74
75 // Serializer
76
77 database& operator<<(database& db, const pronunciation& arg);
78
79 };
80};
81
82#endif /* end of include guard: PRONUNCIATION_H_584A08DD */
diff --git a/generator/role.h b/generator/role.h new file mode 100644 index 0000000..5fa68b8 --- /dev/null +++ b/generator/role.h
@@ -0,0 +1,35 @@
1#ifndef ROLE_H_249F9A9C
2#define ROLE_H_249F9A9C
3
4#include "selrestr.h"
5
6namespace verbly {
7 namespace generator {
8
9 class role {
10 public:
11
12 // Mutators
13
14 void setSelrestrs(selrestr selrestrs)
15 {
16 selrestrs_ = selrestrs;
17 }
18
19 // Accessors
20
21 const selrestr& getSelrestrs() const
22 {
23 return selrestrs_;
24 }
25
26 private:
27
28 selrestr selrestrs_;
29
30 };
31
32 };
33};
34
35#endif /* end of include guard: ROLE_H_249F9A9C */
diff --git a/generator/schema.sql b/generator/schema.sql index 410b536..c3e54d8 100644 --- a/generator/schema.sql +++ b/generator/schema.sql
@@ -1,286 +1,204 @@
1DROP TABLE IF EXISTS `verbs`; 1CREATE TABLE `notions` (
2CREATE TABLE `verbs` ( 2 `notion_id` INTEGER PRIMARY KEY,
3 `verb_id` INTEGER PRIMARY KEY, 3 `part_of_speech` SMALLINT NOT NULL,
4 `infinitive` VARCHAR(32) NOT NULL, 4 `wnid` INTEGER,
5 `past_tense` VARCHAR(32) NOT NULL, 5 `images` INTEGER
6 `past_participle` VARCHAR(32) NOT NULL,
7 `ing_form` VARCHAR(32) NOT NULL,
8 `s_form` VARCHAR(32) NOT NULL
9); 6);
10 7
11DROP TABLE IF EXISTS `groups`; 8CREATE UNIQUE INDEX `notion_by_wnid` ON `notions`(`wnid`);
12CREATE TABLE `groups` (
13 `group_id` INTEGER PRIMARY KEY,
14 `data` BLOB NOT NULL
15);
16
17DROP TABLE IF EXISTS `frames`;
18CREATE TABLE `frames` (
19 `frame_id` INTEGER PRIMARY KEY,
20 `group_id` INTEGER NOT NULL,
21 `data` BLOB NOT NULL,
22 FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`)
23);
24 9
25DROP TABLE IF EXISTS `verb_groups`;
26CREATE TABLE `verb_groups` (
27 `verb_id` INTEGER NOT NULL,
28 `group_id` INTEGER NOT NULL,
29 FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`),
30 FOREIGN KEY (`group_id`) REFERENCES `groups`(`group_id`)
31);
32
33DROP TABLE IF EXISTS `adjectives`;
34CREATE TABLE `adjectives` (
35 `adjective_id` INTEGER PRIMARY KEY,
36 `base_form` VARCHAR(32) NOT NULL,
37 `comparative` VARCHAR(32),
38 `superlative` VARCHAR(32),
39 `position` CHAR(1),
40 `complexity` INTEGER NOT NULL
41);
42
43DROP TABLE IF EXISTS `adverbs`;
44CREATE TABLE `adverbs` (
45 `adverb_id` INTEGER PRIMARY KEY,
46 `base_form` VARCHAR(32) NOT NULL,
47 `comparative` VARCHAR(32),
48 `superlative` VARCHAR(32),
49 `complexity` INTEGER NOT NULL
50);
51
52DROP TABLE IF EXISTS `nouns`;
53CREATE TABLE `nouns` (
54 `noun_id` INTEGER PRIMARY KEY,
55 `singular` VARCHAR(32) NOT NULL,
56 `plural` VARCHAR(32),
57 `proper` INTEGER(1) NOT NULL,
58 `complexity` INTEGER NOT NULL,
59 `images` INTEGER NOT NULL,
60 `wnid` INTEGER NOT NULL
61);
62
63DROP TABLE IF EXISTS `hypernymy`;
64CREATE TABLE `hypernymy` ( 10CREATE TABLE `hypernymy` (
65 `hypernym_id` INTEGER NOT NULL, 11 `hypernym_id` INTEGER NOT NULL,
66 `hyponym_id` INTEGER NOT NULL, 12 `hyponym_id` INTEGER NOT NULL
67 FOREIGN KEY (`hypernym_id`) REFERENCES `nouns`(`noun_id`),
68 FOREIGN KEY (`hyponym_id`) REFERENCES `nouns`(`noun_id`)
69); 13);
70 14
71DROP TABLE IF EXISTS `instantiation`; 15CREATE INDEX `hyponym_of` ON `hypernymy`(`hypernym_id`);
16CREATE INDEX `hypernym_of` ON `hypernymy`(`hyponym_id`);
17
72CREATE TABLE `instantiation` ( 18CREATE TABLE `instantiation` (
73 `class_id` INTEGER NOT NULL, 19 `class_id` INTEGER NOT NULL,
74 `instance_id` INTEGER NOT NULL, 20 `instance_id` INTEGER NOT NULL
75 FOREIGN KEY (`class_id`) REFERENCES `nouns`(`noun_id`),
76 FOREIGN KEY (`instance_id`) REFERENCES `nouns`(`noun_id`)
77); 21);
78 22
79DROP TABLE IF EXISTS `member_meronymy`; 23CREATE INDEX `instance_of` ON `instantiation`(`class_id`);
24CREATE INDEX `class_of` ON `instantiation`(`instance_id`);
25
80CREATE TABLE `member_meronymy` ( 26CREATE TABLE `member_meronymy` (
81 `meronym_id` INTEGER NOT NULL, 27 `meronym_id` INTEGER NOT NULL,
82 `holonym_id` INTEGER NOT NULL, 28 `holonym_id` INTEGER NOT NULL
83 FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`),
84 FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`)
85); 29);
86 30
87DROP TABLE IF EXISTS `part_meronymy`; 31CREATE INDEX `member_holonym_of` ON `member_meronymy`(`meronym_id`);
32CREATE INDEX `member_meronym_of` ON `member_meronymy`(`holonym_id`);
33
88CREATE TABLE `part_meronymy` ( 34CREATE TABLE `part_meronymy` (
89 `meronym_id` INTEGER NOT NULL, 35 `meronym_id` INTEGER NOT NULL,
90 `holonym_id` INTEGER NOT NULL, 36 `holonym_id` INTEGER NOT NULL
91 FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`),
92 FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`)
93); 37);
94 38
95DROP TABLE IF EXISTS `substance_meronymy`; 39CREATE INDEX `part_holonym_of` ON `part_meronymy`(`meronym_id`);
40CREATE INDEX `part_meronym_of` ON `part_meronymy`(`holonym_id`);
41
96CREATE TABLE `substance_meronymy` ( 42CREATE TABLE `substance_meronymy` (
97 `meronym_id` INTEGER NOT NULL, 43 `meronym_id` INTEGER NOT NULL,
98 `holonym_id` INTEGER NOT NULL, 44 `holonym_id` INTEGER NOT NULL
99 FOREIGN KEY (`meronym_id`) REFERENCES `nouns`(`noun_id`),
100 FOREIGN KEY (`holonym_id`) REFERENCES `nouns`(`noun_id`)
101); 45);
102 46
103DROP TABLE IF EXISTS `variation`; 47CREATE INDEX `substance_holonym_of` ON `substance_meronymy`(`meronym_id`);
48CREATE INDEX `substance_meronym_of` ON `substance_meronymy`(`holonym_id`);
49
104CREATE TABLE `variation` ( 50CREATE TABLE `variation` (
105 `noun_id` INTEGER NOT NULL, 51 `noun_id` INTEGER NOT NULL,
106 `adjective_id` INTEGER NOT NULL, 52 `adjective_id` INTEGER NOT NULL
107 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`),
108 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`)
109); 53);
110 54
111DROP TABLE IF EXISTS `noun_antonymy`; 55CREATE INDEX `variant_of` ON `variation`(`noun_id`);
112CREATE TABLE `noun_antonymy` ( 56CREATE INDEX `attribute_of` ON `variation`(`adjective_id`);
113 `noun_1_id` INTEGER NOT NULL,
114 `noun_2_id` INTEGER NOT NULL,
115 FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`),
116 FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`)
117);
118 57
119DROP TABLE IF EXISTS `adjective_antonymy`; 58CREATE TABLE `similarity` (
120CREATE TABLE `adjective_antonymy` (
121 `adjective_1_id` INTEGER NOT NULL, 59 `adjective_1_id` INTEGER NOT NULL,
122 `adjective_2_id` INTEGER NOT NULL, 60 `adjective_2_id` INTEGER NOT NULL
123 FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), 61);
124 FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) 62
63CREATE INDEX `similar_to` ON `similarity`(`adjective_1_id`);
64
65CREATE TABLE `is_a` (
66 `notion_id` INTEGER NOT NULL,
67 `groupname` VARCHAR(32) NOT NULL
125); 68);
126 69
127DROP TABLE IF EXISTS `adverb_antonymy`; 70CREATE TABLE `entailment` (
128CREATE TABLE `adverb_antonymy` ( 71 `given_id` INTEGER NOT NULL,
129 `adverb_1_id` INTEGER NOT NULL, 72 `entailment_id` INTEGER NOT NULL
130 `adverb_2_id` INTEGER NOT NULL, 73);
131 FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), 74
132 FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) 75CREATE INDEX `entailment_of` ON `entailment`(`given_id`);
76CREATE INDEX `entailed_by` ON `entailment`(`entailment_id`);
77
78CREATE TABLE `causality` (
79 `cause_id` INTEGER NOT NULL,
80 `effect_id` INTEGER NOT NULL
81);
82
83CREATE INDEX `effect_of` ON `causality`(`cause_id`);
84CREATE INDEX `cause_of` ON `causality`(`effect_id`);
85
86CREATE TABLE `words` (
87 `word_id` INTEGER PRIMARY KEY,
88 `notion_id` INTEGER NOT NULL,
89 `lemma_id` INTEGER NOT NULL,
90 `tag_count` INTEGER,
91 `position` SMALLINT,
92 `group_id` INTEGER
93);
94
95CREATE INDEX `notion_words` ON `words`(`notion_id`);
96CREATE INDEX `lemma_words` ON `words`(`lemma_id`);
97CREATE INDEX `group_words` ON `words`(`group_id`);
98
99CREATE TABLE `antonymy` (
100 `antonym_1_id` INTEGER NOT NULL,
101 `antonym_2_id` INTEGER NOT NULL
133); 102);
134 103
135DROP TABLE IF EXISTS `specification`; 104CREATE INDEX `antonym_of` ON `antonymy`(`antonym_1_id`);
105
136CREATE TABLE `specification` ( 106CREATE TABLE `specification` (
137 `general_id` INTEGER NOT NULL, 107 `general_id` INTEGER NOT NULL,
138 `specific_id` INTEGER NOT NULL, 108 `specific_id` INTEGER NOT NULL
139 FOREIGN KEY (`general_id`) REFERENCES `adjectives`(`adjective_id`),
140 FOREIGN KEY (`specific_id`) REFERENCES `adjectives`(`adjective_id`)
141); 109);
142 110
143DROP TABLE IF EXISTS `pertainymy`; 111CREATE INDEX `specification_of` ON `specification`(`general_id`);
112CREATE INDEX `generalization_of` ON `specification`(`specific_id`);
113
144CREATE TABLE `pertainymy` ( 114CREATE TABLE `pertainymy` (
145 `noun_id` INTEGER NOT NULL, 115 `noun_id` INTEGER NOT NULL,
146 `pertainym_id` INTEGER NOT NULL, 116 `pertainym_id` INTEGER NOT NULL
147 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`),
148 FOREIGN KEY (`pertainym_id`) REFERENCES `adjectives`(`adjective_id`)
149); 117);
150 118
151DROP TABLE IF EXISTS `mannernymy`; 119CREATE INDEX `pertainym_of` ON `pertainymy`(`noun_id`);
120CREATE INDEX `anti_pertainym_of` ON `pertainymy`(`pertainym_id`);
121
152CREATE TABLE `mannernymy` ( 122CREATE TABLE `mannernymy` (
153 `adjective_id` INTEGER NOT NULL, 123 `adjective_id` INTEGER NOT NULL,
154 `mannernym_id` INTEGER NOT NULL, 124 `mannernym_id` INTEGER NOT NULL
155 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`),
156 FOREIGN KEY (`mannernym_id`) REFERENCES `adverbs`(`adverb_id`)
157); 125);
158 126
159DROP TABLE IF EXISTS `noun_synonymy`; 127CREATE INDEX `mannernym_of` ON `mannernymy`(`adjective_id`);
160CREATE TABLE `noun_synonymy` ( 128CREATE INDEX `anti_mannernym_of` ON `mannernymy`(`mannernym_id`);
161 `noun_1_id` INTEGER NOT NULL,
162 `noun_2_id` INTEGER NOT NULL,
163 FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`nouns_id`),
164 FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`nouns_id`)
165);
166 129
167DROP TABLE IF EXISTS `adjective_synonymy`; 130CREATE TABLE `usage` (
168CREATE TABLE `adjective_synonymy` ( 131 `domain_id` INTEGER NOT NULL,
169 `adjective_1_id` INTEGER NOT NULL, 132 `term_id` INTEGER NOT NULL
170 `adjective_2_id` INTEGER NOT NULL,
171 FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`),
172 FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`)
173); 133);
174 134
175DROP TABLE IF EXISTS `adverb_synonymy`; 135CREATE INDEX `usage_term_of` ON `usage`(`domain_id`);
176CREATE TABLE `adverb_synonymy` ( 136CREATE INDEX `usage_domain_of` ON `usage`(`term_id`);
177 `adverb_1_id` INTEGER NOT NULL,
178 `adverb_2_id` INTEGER NOT NULL,
179 FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`),
180 FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`)
181);
182 137
183DROP TABLE IF EXISTS `noun_pronunciations`; 138CREATE TABLE `topicality` (
184CREATE TABLE `noun_pronunciations` ( 139 `domain_id` INTEGER NOT NULL,
185 `noun_id` INTEGER NOT NULL, 140 `term_id` INTEGER NOT NULL
186 `pronunciation` VARCHAR(64) NOT NULL,
187 `prerhyme` VARCHAR(8),
188 `rhyme` VARCHAR(64),
189 `syllables` INT NOT NULL,
190 `stress` VARCHAR(64) NOT NULL,
191 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`)
192); 141);
193 142
194DROP TABLE IF EXISTS `verb_pronunciations`; 143CREATE INDEX `topical_term_of` ON `topicality`(`domain_id`);
195CREATE TABLE `verb_pronunciations` ( 144CREATE INDEX `topical_domain_of` ON `topicality`(`term_id`);
196 `verb_id` INTEGER NOT NULL,
197 `pronunciation` VARCHAR(64) NOT NULL,
198 `prerhyme` VARCHAR(8),
199 `rhyme` VARCHAR(64),
200 `syllables` INT NOT NULL,
201 `stress` VARCHAR(64) NOT NULL,
202 FOREIGN KEY (`verb_id`) REFERENCES `verbs`(`verb_id`)
203);
204 145
205DROP TABLE IF EXISTS `adjective_pronunciations`; 146CREATE TABLE `regionality` (
206CREATE TABLE `adjective_pronunciations` ( 147 `domain_id` INTEGER NOT NULL,
207 `adjective_id` INTEGER NOT NULL, 148 `term_id` INTEGER NOT NULL
208 `pronunciation` VARCHAR(64) NOT NULL,
209 `prerhyme` VARCHAR(8),
210 `rhyme` VARCHAR(64),
211 `syllables` INT NOT NULL,
212 `stress` VARCHAR(64) NOT NULL,
213 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`)
214); 149);
215 150
216DROP TABLE IF EXISTS `adverb_pronunciations`; 151CREATE INDEX `regional_term_of` ON `regionality`(`domain_id`);
217CREATE TABLE `adverb_pronunciations` ( 152CREATE INDEX `regional_domain_of` ON `regionality`(`term_id`);
218 `adverb_id` INTEGER NOT NULL,
219 `pronunciation` VARCHAR(64) NOT NULL,
220 `prerhyme` VARCHAR(8),
221 `rhyme` VARCHAR(64),
222 `syllables` INT NOT NULL,
223 `stress` VARCHAR(64) NOT NULL,
224 FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`)
225);
226 153
227DROP TABLE IF EXISTS `noun_noun_derivation`; 154CREATE TABLE `forms` (
228CREATE TABLE `noun_noun_derivation` ( 155 `form_id` INTEGER PRIMARY KEY,
229 `noun_1_id` INTEGER NOT NULL, 156 `form` VARCHAR(32) NOT NULL,
230 `noun_2_id` INTEGER NOT NULL, 157 `complexity` SMALLINT NOT NULL,
231 FOREIGN KEY (`noun_1_id`) REFERENCES `nouns`(`noun_id`), 158 `proper` SMALLINT NOT NULL
232 FOREIGN KEY (`noun_2_id`) REFERENCES `nouns`(`noun_id`)
233); 159);
234 160
235DROP TABLE IF EXISTS `noun_adjective_derivation`; 161CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`);
236CREATE TABLE `noun_adjective_derivation` (
237 `noun_id` INTEGER NOT NULL,
238 `adjective_id` INTEGER NOT NULL,
239 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`),
240 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`)
241);
242 162
243DROP TABLE IF EXISTS `noun_adverb_derivation`; 163CREATE TABLE `lemmas_forms` (
244CREATE TABLE `noun_adverb_derivation` ( 164 `lemma_id` INTEGER NOT NULL,
245 `noun_id` INTEGER NOT NULL, 165 `form_id` INTEGER NOT NULL,
246 `adverb_id` INTEGER NOT NULL, 166 `category` SMALLINT NOT NULL
247 FOREIGN KEY (`noun_id`) REFERENCES `nouns`(`noun_id`),
248 FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adverb_id`)
249); 167);
250 168
251DROP TABLE IF EXISTS `adjective_adjective_derivation`; 169CREATE INDEX `form_of` ON `lemmas_forms`(`lemma_id`);
252CREATE TABLE `adjective_adjective_derivation` ( 170CREATE INDEX `lemma_of` ON `lemmas_forms`(`form_id`);
253 `adjective_1_id` INTEGER NOT NULL, 171
254 `adjective_2_id` INTEGER NOT NULL, 172CREATE TABLE `pronunciations` (
255 FOREIGN KEY (`adjective_1_id`) REFERENCES `adjectives`(`adjective_id`), 173 `pronunciation_id` INTEGER PRIMARY KEY,
256 FOREIGN KEY (`adjective_2_id`) REFERENCES `adjectives`(`adjective_id`) 174 `phonemes` VARCHAR(64) NOT NULL,
175 `prerhyme` VARCHAR(8),
176 `rhyme` VARCHAR(64),
177 `syllables` INTEGER NOT NULL,
178 `stress` VARCHAR(64) NOT NULL
257); 179);
258 180
259DROP TABLE IF EXISTS `adjective_adverb_derivation`; 181CREATE TABLE `forms_pronunciations` (
260CREATE TABLE `adjective_adverb_derivation` ( 182 `form_id` INTEGER NOT NULL,
261 `adjective_id` INTEGER NOT NULL, 183 `pronunciation_id` INTEGER NOT NULL
262 `adverb_id` INTEGER NOT NULL,
263 FOREIGN KEY (`adjective_id`) REFERENCES `adjectives`(`adjective_id`),
264 FOREIGN KEY (`adverb_id`) REFERENCES `adverbs`(`adjective_id`)
265); 184);
266 185
267DROP TABLE IF EXISTS `adverb_adverb_derivation`; 186CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`);
268CREATE TABLE `adverb_adverb_derivation` ( 187CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`);
269 `adverb_1_id` INTEGER NOT NULL, 188
270 `adverb_2_id` INTEGER NOT NULL, 189CREATE TABLE `groups` (
271 FOREIGN KEY (`adverb_1_id`) REFERENCES `adverbs`(`adverb_id`), 190 `group_id` INTEGER PRIMARY KEY,
272 FOREIGN KEY (`adverb_2_id`) REFERENCES `adverbs`(`adverb_id`) 191 `data` BLOB NOT NULL
273); 192);
274 193
275DROP TABLE IF EXISTS `prepositions`; 194CREATE TABLE `frames` (
276CREATE TABLE `prepositions` ( 195 `frame_id` INTEGER PRIMARY KEY,
277 `preposition_id` INTEGER PRIMARY KEY, 196 `data` BLOB NOT NULL
278 `form` VARCHAR(32) NOT NULL
279); 197);
280 198
281DROP TABLE IF EXISTS `preposition_groups`; 199CREATE TABLE `groups_frames` (
282CREATE TABLE `preposition_groups` ( 200 `group_id` INTEGER NOT NULL,
283 `preposition_id` INTEGER NOT NULL, 201 `frame_id` INTEGER NOT NULL
284 `groupname` VARCHAR(32) NOT NULL,
285 FOREIGN KEY (`preposition_id`) REFERENCES `prepositions`(`preposition_id`)
286); 202);
203
204CREATE INDEX `frames_in` ON `groups_frames`(`group_id`);
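
As an example of how the new normalized layout gets queried, the following sketch finds every spelling that rhymes with a given pronunciation by joining pronunciations, forms_pronunciations, and forms. This is a hand-written query, not part of the generator; ppdb stands for an already-open sqlite3* handle:

  std::string query =
    "SELECT f.form FROM forms f "
    "JOIN forms_pronunciations fp ON fp.form_id = f.form_id "
    "JOIN pronunciations p ON p.pronunciation_id = fp.pronunciation_id "
    "WHERE p.rhyme = ?";

  sqlite3_stmt* stmt;
  if (sqlite3_prepare_v2(ppdb, query.c_str(), query.length(), &stmt, NULL) == SQLITE_OK)
  {
    sqlite3_bind_text(stmt, 1, "UW T ER", -1, SQLITE_TRANSIENT);

    while (sqlite3_step(stmt) == SQLITE_ROW)
    {
      std::string rhymingForm(
        reinterpret_cast<const char*>(sqlite3_column_text(stmt, 0)));
      // ... use rhymingForm ...
    }

    sqlite3_finalize(stmt);
  }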
diff --git a/generator/selrestr.cpp b/generator/selrestr.cpp new file mode 100644 index 0000000..8bdd3f6 --- /dev/null +++ b/generator/selrestr.cpp
@@ -0,0 +1,288 @@
1#include "selrestr.h"
2
3namespace verbly {
4 namespace generator {
5
6 selrestr::selrestr(const selrestr& other)
7 {
8 type_ = other.type_;
9
10 switch (type_)
11 {
12 case type::singleton:
13 {
14 singleton_.pos = other.singleton_.pos;
15 new(&singleton_.restriction) std::string(other.singleton_.restriction);
16
17 break;
18 }
19
20 case type::group:
21 {
22 new(&group_.children) std::list<selrestr>(other.group_.children);
23 group_.orlogic = other.group_.orlogic;
24
25 break;
26 }
27
28 case type::empty:
29 {
30 break;
31 }
32 }
33 }
34
35 selrestr::selrestr(selrestr&& other) : selrestr()
36 {
37 swap(*this, other);
38 }
39
40 selrestr& selrestr::operator=(selrestr other)
41 {
42 swap(*this, other);
43
44 return *this;
45 }
46
47 void swap(selrestr& first, selrestr& second)
48 {
49 using type = selrestr::type;
50
51 type tempType = first.type_;
52 int tempPos;
53 std::string tempRestriction;
54 std::list<selrestr> tempChildren;
55 bool tempOrlogic;
56
57 switch (tempType)
58 {
59 case type::singleton:
60 {
61 tempPos = first.singleton_.pos;
62 tempRestriction = std::move(first.singleton_.restriction);
63
64 break;
65 }
66
67 case type::group:
68 {
69 tempChildren = std::move(first.group_.children);
70 tempOrlogic = first.group_.orlogic;
71
72 break;
73 }
74
75 case type::empty:
76 {
77 break;
78 }
79 }
80
81 first.~selrestr();
82
83 first.type_ = second.type_;
84
85 switch (first.type_)
86 {
87 case type::singleton:
88 {
89 first.singleton_.pos = second.singleton_.pos;
90 new(&first.singleton_.restriction) std::string(std::move(second.singleton_.restriction));
91
92 break;
93 }
94
95 case type::group:
96 {
97 new(&first.group_.children) std::list<selrestr>(std::move(second.group_.children));
98 first.group_.orlogic = second.group_.orlogic;
99
100 break;
101 }
102
103 case type::empty:
104 {
105 break;
106 }
107 }
108
109 second.~selrestr();
110
111 second.type_ = tempType;
112
113 switch (second.type_)
114 {
115 case type::singleton:
116 {
117 second.singleton_.pos = tempPos;
118 new(&second.singleton_.restriction) std::string(std::move(tempRestriction));
119
120 break;
121 }
122
123 case type::group:
124 {
125 new(&second.group_.children) std::list<selrestr>(std::move(tempChildren));
126 second.group_.orlogic = tempOrlogic;
127
128 break;
129 }
130
131 case type::empty:
132 {
133 break;
134 }
135 }
136 }
137
138 selrestr::~selrestr()
139 {
140 switch (type_)
141 {
142 case type::singleton:
143 {
144 using string_type = std::string;
145 singleton_.restriction.~string_type();
146
147 break;
148 }
149
150 case type::group:
151 {
152 using list_type = std::list<selrestr>;
153 group_.children.~list_type();
154
155 break;
156 }
157
158 case type::empty:
159 {
160 break;
161 }
162 }
163 }
164
165 selrestr::selrestr() : type_(type::empty)
166 {
167 }
168
169 selrestr::selrestr(
170 std::string restriction,
171 bool pos) :
172 type_(type::singleton)
173 {
174 new(&singleton_.restriction) std::string(std::move(restriction));
175 singleton_.pos = pos;
176 }
177
178 std::string selrestr::getRestriction() const
179 {
180 if (type_ == type::singleton)
181 {
182 return singleton_.restriction;
183 } else {
184 throw std::domain_error("Only singleton selrestrs have restrictions");
185 }
186 }
187
188 bool selrestr::getPos() const
189 {
190 if (type_ == type::singleton)
191 {
192 return singleton_.pos;
193 } else {
194 throw std::domain_error("Only singleton selrestrs have positivity flags");
195 }
196 }
197
198 selrestr::selrestr(
199 std::list<selrestr> children,
200 bool orlogic) :
201 type_(type::group)
202 {
203 new(&group_.children) std::list<selrestr>(std::move(children));
204 group_.orlogic = orlogic;
205 }
206
207 std::list<selrestr> selrestr::getChildren() const
208 {
209 if (type_ == type::group)
210 {
211 return group_.children;
212 } else {
213 throw std::domain_error("Only group selrestrs have children");
214 }
215 }
216
217 std::list<selrestr>::const_iterator selrestr::begin() const
218 {
219 if (type_ == type::group)
220 {
221 return std::begin(group_.children);
222 } else {
223 throw std::domain_error("Only group selrestrs have children");
224 }
225 }
226
227 std::list<selrestr>::const_iterator selrestr::end() const
228 {
229 if (type_ == type::group)
230 {
231 return std::end(group_.children);
232 } else {
233 throw std::domain_error("Only group selrestrs have children");
234 }
235 }
236
237 bool selrestr::getOrlogic() const
238 {
239 if (type_ == type::group)
240 {
241 return group_.orlogic;
242 } else {
243 throw std::domain_error("Only group selrestrs have logic");
244 }
245 }
246
247 nlohmann::json selrestr::toJson() const
248 {
249 switch (type_)
250 {
251 case type::empty:
252 {
253 return {};
254 }
255
256 case type::singleton:
257 {
258 return {
259 {"type", singleton_.restriction},
260 {"pos", singleton_.pos}
261 };
262 }
263
264 case type::group:
265 {
266 std::string logic;
267 if (group_.orlogic)
268 {
269 logic = "or";
270 } else {
271 logic = "and";
272 }
273
274 std::list<nlohmann::json> children;
275 std::transform(std::begin(group_.children), std::end(group_.children), std::back_inserter(children), [] (const selrestr& child) {
276 return child.toJson();
277 });
278
279 return {
280 {"logic", logic},
281 {"children", children}
282 };
283 }
284 }
285 }
286
287 };
288};
diff --git a/generator/selrestr.h b/generator/selrestr.h new file mode 100644 index 0000000..5000970 --- /dev/null +++ b/generator/selrestr.h
@@ -0,0 +1,88 @@
1#ifndef SELRESTR_H_50652FB7
2#define SELRESTR_H_50652FB7
3
4#include <list>
5#include <string>
6#include <json.hpp>
7
8namespace verbly {
9 namespace generator {
10
11 class selrestr {
12 public:
13 enum class type {
14 empty,
15 singleton,
16 group
17 };
18
19 // Copy and move constructors
20
21 selrestr(const selrestr& other);
22 selrestr(selrestr&& other);
23
24 // Assignment
25
26 selrestr& operator=(selrestr other);
27
28 // Swap
29
30 friend void swap(selrestr& first, selrestr& second);
31
32 // Destructor
33
34 ~selrestr();
35
36 // Generic accessors
37
38 type getType() const
39 {
40 return type_;
41 }
42
43 // Empty
44
45 selrestr();
46
47 // Singleton
48
49 selrestr(std::string restriction, bool pos);
50
51 std::string getRestriction() const;
52
53 bool getPos() const;
54
55 // Group
56
57 selrestr(std::list<selrestr> children, bool orlogic);
58
59 std::list<selrestr> getChildren() const;
60
61 std::list<selrestr>::const_iterator begin() const;
62
63 std::list<selrestr>::const_iterator end() const;
64
65 bool getOrlogic() const;
66
67 // Helpers
68
69 nlohmann::json toJson() const;
70
71 private:
72 union {
73 struct {
74 bool pos;
75 std::string restriction;
76 } singleton_;
77 struct {
78 std::list<selrestr> children;
79 bool orlogic;
80 } group_;
81 };
82 type type_;
83 };
84
85 };
86};
87
88#endif /* end of include guard: SELRESTR_H_50652FB7 */
diff --git a/generator/word.cpp b/generator/word.cpp new file mode 100644 index 0000000..8ba3ce2 --- /dev/null +++ b/generator/word.cpp
@@ -0,0 +1,77 @@
1#include "word.h"
2#include <list>
3#include <string>
4#include "database.h"
5#include "notion.h"
6#include "lemma.h"
7#include "field.h"
8#include "group.h"
9
10namespace verbly {
11 namespace generator {
12
13 int word::nextId_ = 0;
14
15 word::word(
16 notion& n,
17 lemma& l) :
18 id_(nextId_++),
19 notion_(n),
20 lemma_(l)
21 {
22 }
23
24 word::word(
25 notion& n,
26 lemma& l,
27 int tagCount) :
28 id_(nextId_++),
29 notion_(n),
30 lemma_(l),
31 tagCount_(tagCount),
32 hasTagCount_(true)
33 {
34 }
35
36 void word::setAdjectivePosition(positioning adjectivePosition)
37 {
38 adjectivePosition_ = adjectivePosition;
39 }
40
41 void word::setVerbGroup(const group& verbGroup)
42 {
43 verbGroup_ = &verbGroup;
44 }
45
46 database& operator<<(database& db, const word& arg)
47 {
48 std::list<field> fields;
49
50 fields.emplace_back("word_id", arg.getId());
51 fields.emplace_back("notion_id", arg.getNotion().getId());
52 fields.emplace_back("lemma_id", arg.getLemma().getId());
53
54 if (arg.hasTagCount())
55 {
56 fields.emplace_back("tag_count", arg.getTagCount());
57 }
58
59 if ((arg.getNotion().getPartOfSpeech() == part_of_speech::adjective)
60 && (arg.getAdjectivePosition() != positioning::undefined))
61 {
62 fields.emplace_back("position", static_cast<int>(arg.getAdjectivePosition()));
63 }
64
65 if ((arg.getNotion().getPartOfSpeech() == part_of_speech::verb)
66 && (arg.hasVerbGroup()))
67 {
68 fields.emplace_back("group_id", arg.getVerbGroup().getId());
69 }
70
71 db.insertIntoTable("words", std::move(fields));
72
73 return db;
74 }
75
76 };
77};
diff --git a/generator/word.h b/generator/word.h new file mode 100644 index 0000000..bfed586 --- /dev/null +++ b/generator/word.h
@@ -0,0 +1,110 @@
1#ifndef WORD_H_91F99D46
2#define WORD_H_91F99D46
3
4#include <cassert>
5#include "enums.h"
6
7namespace verbly {
8 namespace generator {
9
10 class notion;
11 class lemma;
12 class database;
13 class group;
14
15 class word {
16 public:
17
18 // Constructors
19
20 word(notion& n, lemma& l);
21
22 word(notion& n, lemma& l, int tagCount);
23
24 // Mutators
25
26 void setAdjectivePosition(positioning adjectivePosition);
27
28 void setVerbGroup(const group& verbGroup);
29
30 // Accessors
31
32 int getId() const
33 {
34 return id_;
35 }
36
37 notion& getNotion()
38 {
39 return notion_;
40 }
41
42 const notion& getNotion() const
43 {
44 return notion_;
45 }
46
47 lemma& getLemma()
48 {
49 return lemma_;
50 }
51
52 const lemma& getLemma() const
53 {
54 return lemma_;
55 }
56
57 bool hasTagCount() const
58 {
59 return hasTagCount_;
60 }
61
62 int getTagCount() const
63 {
64 // Calling code should always call hasTagCount first.
65 assert(hasTagCount_);
66
67 return tagCount_;
68 }
69
70 positioning getAdjectivePosition() const
71 {
72 return adjectivePosition_;
73 }
74
75 bool hasVerbGroup() const
76 {
77 return (verbGroup_ != nullptr);
78 }
79
80 const group& getVerbGroup() const
81 {
82 // Calling code should always call hasVerbGroup first.
83 assert(verbGroup_ != nullptr);
84
85 return *verbGroup_;
86 }
87
88 private:
89
90 static int nextId_;
91
92 const int id_;
93 notion& notion_;
94 lemma& lemma_;
95 const int tagCount_ = 0;
96 const bool hasTagCount_ = false;
97
98 positioning adjectivePosition_ = positioning::undefined;
99 const group* verbGroup_ = nullptr;
100
101 };
102
103 // Serializer
104
105 database& operator<<(database& db, const word& arg);
106
107 };
108};
109
110#endif /* end of include guard: WORD_H_91F99D46 */