diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-09-26 21:40:44 -0400 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2018-09-26 21:40:44 -0400 |
commit | 3a8bfa95a5df04d97f05545d5bb8df5f3c3f96a3 (patch) | |
tree | e4478f57fe5e3faa57274d3b79c636ea657ab3a0 /generator | |
parent | a9188cbc6b3b9d26e675213e3834afdbd06296f6 (diff) | |
download | verbly-3a8bfa95a5df04d97f05545d5bb8df5f3c3f96a3.tar.gz verbly-3a8bfa95a5df04d97f05545d5bb8df5f3c3f96a3.tar.bz2 verbly-3a8bfa95a5df04d97f05545d5bb8df5f3c3f96a3.zip |
Removed unnecessary ROWIDs from database schema
The generator also now sorts and de-duplicates the lines of the WordNet files for antonymy, classification, and pertainymy/mannernymy, because those files contained duplicate rows, and the join tables, which are now declared WITHOUT ROWID with a composite primary key, enforce a uniqueness constraint. This constitutes a minor database update -- the new database is compatible with d1.0, but is ~12MB smaller. refs #6
Diffstat (limited to 'generator')
-rw-r--r-- | generator/generator.cpp | 21 | ||||
-rw-r--r-- | generator/generator.h | 2 | ||||
-rw-r--r-- | generator/schema.sql | 147 |
3 files changed, 91 insertions, 79 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index e52aa90..0d073be 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -696,7 +696,8 @@ namespace verbly { | |||
696 | 696 | ||
697 | void generator::readWordNetAntonymy() | 697 | void generator::readWordNetAntonymy() |
698 | { | 698 | { |
699 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); | 699 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl", true)); |
700 | |||
700 | hatkirby::progress ppgs("Writing antonyms...", lines.size()); | 701 | hatkirby::progress ppgs("Writing antonyms...", lines.size()); |
701 | for (auto line : lines) | 702 | for (auto line : lines) |
702 | { | 703 | { |
@@ -770,7 +771,7 @@ namespace verbly { | |||
770 | 771 | ||
771 | void generator::readWordNetClasses() | 772 | void generator::readWordNetClasses() |
772 | { | 773 | { |
773 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); | 774 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl", true)); |
774 | 775 | ||
775 | hatkirby::progress ppgs( | 776 | hatkirby::progress ppgs( |
776 | "Writing usage, topicality, and regionality...", | 777 | "Writing usage, topicality, and regionality...", |
@@ -1092,7 +1093,7 @@ namespace verbly { | |||
1092 | 1093 | ||
1093 | void generator::readWordNetPertainymy() | 1094 | void generator::readWordNetPertainymy() |
1094 | { | 1095 | { |
1095 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); | 1096 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl", true)); |
1096 | 1097 | ||
1097 | hatkirby::progress ppgs( | 1098 | hatkirby::progress ppgs( |
1098 | "Writing pertainymy and mannernymy...", | 1099 | "Writing pertainymy and mannernymy...", |
@@ -1228,7 +1229,7 @@ namespace verbly { | |||
1228 | db_.execute("ANALYZE"); | 1229 | db_.execute("ANALYZE"); |
1229 | } | 1230 | } |
1230 | 1231 | ||
1231 | std::list<std::string> generator::readFile(std::string path) | 1232 | std::list<std::string> generator::readFile(std::string path, bool uniq) |
1232 | { | 1233 | { |
1233 | std::ifstream file(path); | 1234 | std::ifstream file(path); |
1234 | if (!file) | 1235 | if (!file) |
@@ -1248,6 +1249,18 @@ namespace verbly { | |||
1248 | lines.push_back(line); | 1249 | lines.push_back(line); |
1249 | } | 1250 | } |
1250 | 1251 | ||
1252 | if (uniq) | ||
1253 | { | ||
1254 | std::vector<std::string> uniq(std::begin(lines), std::end(lines)); | ||
1255 | lines.clear(); | ||
1256 | |||
1257 | std::sort(std::begin(uniq), std::end(uniq)); | ||
1258 | std::unique_copy( | ||
1259 | std::begin(uniq), | ||
1260 | std::end(uniq), | ||
1261 | std::back_inserter(lines)); | ||
1262 | } | ||
1263 | |||
1251 | return lines; | 1264 | return lines; |
1252 | } | 1265 | } |
1253 | 1266 | ||
diff --git a/generator/generator.h b/generator/generator.h index cd99f88..1547b7c 100644 --- a/generator/generator.h +++ b/generator/generator.h | |||
@@ -94,7 +94,7 @@ namespace verbly { | |||
94 | 94 | ||
95 | // Helpers | 95 | // Helpers |
96 | 96 | ||
97 | std::list<std::string> readFile(std::string path); | 97 | std::list<std::string> readFile(std::string path, bool uniq = false); |
98 | 98 | ||
99 | inline part_of_speech partOfSpeechByWnid(int wnid); | 99 | inline part_of_speech partOfSpeechByWnid(int wnid); |
100 | 100 | ||
diff --git a/generator/schema.sql b/generator/schema.sql index d97c06e..6a7d223 100644 --- a/generator/schema.sql +++ b/generator/schema.sql | |||
@@ -14,79 +14,79 @@ CREATE UNIQUE INDEX `notion_by_wnid` ON `notions`(`wnid`); | |||
14 | 14 | ||
15 | CREATE TABLE `hypernymy` ( | 15 | CREATE TABLE `hypernymy` ( |
16 | `hypernym_id` INTEGER NOT NULL, | 16 | `hypernym_id` INTEGER NOT NULL, |
17 | `hyponym_id` INTEGER NOT NULL | 17 | `hyponym_id` INTEGER NOT NULL, |
18 | ); | 18 | PRIMARY KEY (`hypernym_id`,`hyponym_id`) |
19 | ) WITHOUT ROWID; | ||
19 | 20 | ||
20 | CREATE INDEX `hyponym_of` ON `hypernymy`(`hypernym_id`,`hyponym_id`); | 21 | CREATE INDEX `reverse_hypernymy` ON `hypernymy`(`hyponym_id`,`hypernym_id`); |
21 | CREATE INDEX `hypernym_of` ON `hypernymy`(`hyponym_id`,`hypernym_id`); | ||
22 | 22 | ||
23 | CREATE TABLE `instantiation` ( | 23 | CREATE TABLE `instantiation` ( |
24 | `class_id` INTEGER NOT NULL, | 24 | `class_id` INTEGER NOT NULL, |
25 | `instance_id` INTEGER NOT NULL | 25 | `instance_id` INTEGER NOT NULL, |
26 | ); | 26 | PRIMARY KEY (`class_id`,`instance_id`) |
27 | ) WITHOUT ROWID; | ||
27 | 28 | ||
28 | CREATE INDEX `instance_of` ON `instantiation`(`class_id`,`instance_id`); | 29 | CREATE INDEX `reverse_instantiation` ON `instantiation`(`instance_id`,`class_id`); |
29 | CREATE INDEX `class_of` ON `instantiation`(`instance_id`,`class_id`); | ||
30 | 30 | ||
31 | CREATE TABLE `member_meronymy` ( | 31 | CREATE TABLE `member_meronymy` ( |
32 | `meronym_id` INTEGER NOT NULL, | 32 | `meronym_id` INTEGER NOT NULL, |
33 | `holonym_id` INTEGER NOT NULL | 33 | `holonym_id` INTEGER NOT NULL, |
34 | ); | 34 | PRIMARY KEY (`meronym_id`,`holonym_id`) |
35 | ) WITHOUT ROWID; | ||
35 | 36 | ||
36 | CREATE INDEX `member_holonym_of` ON `member_meronymy`(`meronym_id`,`holonym_id`); | 37 | CREATE INDEX `reverse_member_meronymy` ON `member_meronymy`(`holonym_id`,`meronym_id`); |
37 | CREATE INDEX `member_meronym_of` ON `member_meronymy`(`holonym_id`,`meronym_id`); | ||
38 | 38 | ||
39 | CREATE TABLE `part_meronymy` ( | 39 | CREATE TABLE `part_meronymy` ( |
40 | `meronym_id` INTEGER NOT NULL, | 40 | `meronym_id` INTEGER NOT NULL, |
41 | `holonym_id` INTEGER NOT NULL | 41 | `holonym_id` INTEGER NOT NULL, |
42 | ); | 42 | PRIMARY KEY (`meronym_id`,`holonym_id`) |
43 | ) WITHOUT ROWID; | ||
43 | 44 | ||
44 | CREATE INDEX `part_holonym_of` ON `part_meronymy`(`meronym_id`,`holonym_id`); | 45 | CREATE INDEX `reverse_part_meronymy` ON `part_meronymy`(`holonym_id`,`meronym_id`); |
45 | CREATE INDEX `part_meronym_of` ON `part_meronymy`(`holonym_id`,`meronym_id`); | ||
46 | 46 | ||
47 | CREATE TABLE `substance_meronymy` ( | 47 | CREATE TABLE `substance_meronymy` ( |
48 | `meronym_id` INTEGER NOT NULL, | 48 | `meronym_id` INTEGER NOT NULL, |
49 | `holonym_id` INTEGER NOT NULL | 49 | `holonym_id` INTEGER NOT NULL, |
50 | ); | 50 | PRIMARY KEY (`meronym_id`,`holonym_id`) |
51 | ) WITHOUT ROWID; | ||
51 | 52 | ||
52 | CREATE INDEX `substance_holonym_of` ON `substance_meronymy`(`meronym_id`,`holonym_id`); | 53 | CREATE INDEX `reverse_substance_meronymy` ON `substance_meronymy`(`holonym_id`,`meronym_id`); |
53 | CREATE INDEX `substance_meronym_of` ON `substance_meronymy`(`holonym_id`,`meronym_id`); | ||
54 | 54 | ||
55 | CREATE TABLE `variation` ( | 55 | CREATE TABLE `variation` ( |
56 | `noun_id` INTEGER NOT NULL, | 56 | `noun_id` INTEGER NOT NULL, |
57 | `adjective_id` INTEGER NOT NULL | 57 | `adjective_id` INTEGER NOT NULL, |
58 | ); | 58 | PRIMARY KEY (`noun_id`,`adjective_id`) |
59 | ) WITHOUT ROWID; | ||
59 | 60 | ||
60 | CREATE INDEX `variant_of` ON `variation`(`noun_id`,`adjective_id`); | 61 | CREATE INDEX `reverse_variation` ON `variation`(`adjective_id`,`noun_id`); |
61 | CREATE INDEX `attribute_of` ON `variation`(`adjective_id`,`noun_id`); | ||
62 | 62 | ||
63 | CREATE TABLE `similarity` ( | 63 | CREATE TABLE `similarity` ( |
64 | `adjective_1_id` INTEGER NOT NULL, | 64 | `adjective_1_id` INTEGER NOT NULL, |
65 | `adjective_2_id` INTEGER NOT NULL | 65 | `adjective_2_id` INTEGER NOT NULL, |
66 | ); | 66 | PRIMARY KEY (`adjective_1_id`,`adjective_2_id`) |
67 | 67 | ) WITHOUT ROWID; | |
68 | CREATE INDEX `similar_to` ON `similarity`(`adjective_1_id`,`adjective_2_id`); | ||
69 | 68 | ||
70 | CREATE TABLE `is_a` ( | 69 | CREATE TABLE `is_a` ( |
71 | `notion_id` INTEGER NOT NULL, | 70 | `notion_id` INTEGER NOT NULL, |
72 | `groupname` VARCHAR(32) NOT NULL | 71 | `groupname` VARCHAR(32) NOT NULL, |
73 | ); | 72 | PRIMARY KEY (`notion_id`,`groupname`) |
73 | ) WITHOUT ROWID; | ||
74 | 74 | ||
75 | CREATE TABLE `entailment` ( | 75 | CREATE TABLE `entailment` ( |
76 | `given_id` INTEGER NOT NULL, | 76 | `given_id` INTEGER NOT NULL, |
77 | `entailment_id` INTEGER NOT NULL | 77 | `entailment_id` INTEGER NOT NULL, |
78 | ); | 78 | PRIMARY KEY (`given_id`,`entailment_id`) |
79 | ) WITHOUT ROWID; | ||
79 | 80 | ||
80 | CREATE INDEX `entailment_of` ON `entailment`(`given_id`,`entailment_id`); | 81 | CREATE INDEX `reverse_entailment` ON `entailment`(`entailment_id`,`given_id`); |
81 | CREATE INDEX `entailed_by` ON `entailment`(`entailment_id`,`given_id`); | ||
82 | 82 | ||
83 | CREATE TABLE `causality` ( | 83 | CREATE TABLE `causality` ( |
84 | `cause_id` INTEGER NOT NULL, | 84 | `cause_id` INTEGER NOT NULL, |
85 | `effect_id` INTEGER NOT NULL | 85 | `effect_id` INTEGER NOT NULL, |
86 | ); | 86 | PRIMARY KEY (`cause_id`,`effect_id`) |
87 | ) WITHOUT ROWID; | ||
87 | 88 | ||
88 | CREATE INDEX `effect_of` ON `causality`(`cause_id`,`effect_id`); | 89 | CREATE INDEX `reverse_causality` ON `causality`(`effect_id`,`cause_id`); |
89 | CREATE INDEX `cause_of` ON `causality`(`effect_id`,`cause_id`); | ||
90 | 90 | ||
91 | CREATE TABLE `words` ( | 91 | CREATE TABLE `words` ( |
92 | `word_id` INTEGER PRIMARY KEY, | 92 | `word_id` INTEGER PRIMARY KEY, |
@@ -103,58 +103,57 @@ CREATE INDEX `group_words` ON `words`(`group_id`); | |||
103 | 103 | ||
104 | CREATE TABLE `antonymy` ( | 104 | CREATE TABLE `antonymy` ( |
105 | `antonym_1_id` INTEGER NOT NULL, | 105 | `antonym_1_id` INTEGER NOT NULL, |
106 | `antonym_2_id` INTEGER NOT NULL | 106 | `antonym_2_id` INTEGER NOT NULL, |
107 | ); | 107 | PRIMARY KEY (`antonym_1_id`,`antonym_2_id`) |
108 | 108 | ) WITHOUT ROWID; | |
109 | CREATE INDEX `antonym_of` ON `antonymy`(`antonym_1_id`,`antonym_2_id`); | ||
110 | 109 | ||
111 | CREATE TABLE `specification` ( | 110 | CREATE TABLE `specification` ( |
112 | `general_id` INTEGER NOT NULL, | 111 | `general_id` INTEGER NOT NULL, |
113 | `specific_id` INTEGER NOT NULL | 112 | `specific_id` INTEGER NOT NULL, |
114 | ); | 113 | PRIMARY KEY (`general_id`,`specific_id`) |
114 | ) WITHOUT ROWID; | ||
115 | 115 | ||
116 | CREATE INDEX `specification_of` ON `specification`(`general_id`,`specific_id`); | 116 | CREATE INDEX `reverse_specification` ON `specification`(`specific_id`,`general_id`); |
117 | CREATE INDEX `generalization_of` ON `specification`(`specific_id`,`general_id`); | ||
118 | 117 | ||
119 | CREATE TABLE `pertainymy` ( | 118 | CREATE TABLE `pertainymy` ( |
120 | `noun_id` INTEGER NOT NULL, | 119 | `noun_id` INTEGER NOT NULL, |
121 | `pertainym_id` INTEGER NOT NULL | 120 | `pertainym_id` INTEGER NOT NULL, |
122 | ); | 121 | PRIMARY KEY (`noun_id`,`pertainym_id`) |
122 | ) WITHOUT ROWID; | ||
123 | 123 | ||
124 | CREATE INDEX `pertainym_of` ON `pertainymy`(`noun_id`,`pertainym_id`); | 124 | CREATE INDEX `reverse_pertainymy` ON `pertainymy`(`pertainym_id`,`noun_id`); |
125 | CREATE INDEX `anti_pertainym_of` ON `pertainymy`(`pertainym_id`,`noun_id`); | ||
126 | 125 | ||
127 | CREATE TABLE `mannernymy` ( | 126 | CREATE TABLE `mannernymy` ( |
128 | `adjective_id` INTEGER NOT NULL, | 127 | `adjective_id` INTEGER NOT NULL, |
129 | `mannernym_id` INTEGER NOT NULL | 128 | `mannernym_id` INTEGER NOT NULL, |
130 | ); | 129 | PRIMARY KEY (`adjective_id`,`mannernym_id`) |
130 | ) WITHOUT ROWID; | ||
131 | 131 | ||
132 | CREATE INDEX `mannernym_of` ON `mannernymy`(`adjective_id`,`mannernym_id`); | 132 | CREATE INDEX `reverse_mannernymy` ON `mannernymy`(`mannernym_id`,`adjective_id`); |
133 | CREATE INDEX `anti_mannernym_of` ON `mannernymy`(`mannernym_id`,`adjective_id`); | ||
134 | 133 | ||
135 | CREATE TABLE `usage` ( | 134 | CREATE TABLE `usage` ( |
136 | `domain_id` INTEGER NOT NULL, | 135 | `domain_id` INTEGER NOT NULL, |
137 | `term_id` INTEGER NOT NULL | 136 | `term_id` INTEGER NOT NULL, |
138 | ); | 137 | PRIMARY KEY (`domain_id`,`term_id`) |
138 | ) WITHOUT ROWID; | ||
139 | 139 | ||
140 | CREATE INDEX `usage_term_of` ON `usage`(`domain_id`,`term_id`); | 140 | CREATE INDEX `reverse_usage` ON `usage`(`term_id`,`domain_id`); |
141 | CREATE INDEX `usage_domain_of` ON `usage`(`term_id`,`domain_id`); | ||
142 | 141 | ||
143 | CREATE TABLE `topicality` ( | 142 | CREATE TABLE `topicality` ( |
144 | `domain_id` INTEGER NOT NULL, | 143 | `domain_id` INTEGER NOT NULL, |
145 | `term_id` INTEGER NOT NULL | 144 | `term_id` INTEGER NOT NULL, |
146 | ); | 145 | PRIMARY KEY (`domain_id`,`term_id`) |
146 | ) WITHOUT ROWID; | ||
147 | 147 | ||
148 | CREATE INDEX `topical_term_of` ON `topicality`(`domain_id`,`term_id`); | 148 | CREATE INDEX `reverse_topicality` ON `topicality`(`term_id`,`domain_id`); |
149 | CREATE INDEX `topical_domain_of` ON `topicality`(`term_id`,`domain_id`); | ||
150 | 149 | ||
151 | CREATE TABLE `regionality` ( | 150 | CREATE TABLE `regionality` ( |
152 | `domain_id` INTEGER NOT NULL, | 151 | `domain_id` INTEGER NOT NULL, |
153 | `term_id` INTEGER NOT NULL | 152 | `term_id` INTEGER NOT NULL, |
154 | ); | 153 | PRIMARY KEY (`domain_id`,`term_id`) |
154 | ) WITHOUT ROWID; | ||
155 | 155 | ||
156 | CREATE INDEX `regional_term_of` ON `regionality`(`domain_id`,`term_id`); | 156 | CREATE INDEX `reverse_regionality` ON `regionality`(`term_id`,`domain_id`); |
157 | CREATE INDEX `regional_domain_of` ON `regionality`(`term_id`,`domain_id`); | ||
158 | 157 | ||
159 | CREATE TABLE `forms` ( | 158 | CREATE TABLE `forms` ( |
160 | `form_id` INTEGER PRIMARY KEY, | 159 | `form_id` INTEGER PRIMARY KEY, |
@@ -169,11 +168,11 @@ CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); | |||
169 | CREATE TABLE `lemmas_forms` ( | 168 | CREATE TABLE `lemmas_forms` ( |
170 | `lemma_id` INTEGER NOT NULL, | 169 | `lemma_id` INTEGER NOT NULL, |
171 | `form_id` INTEGER NOT NULL, | 170 | `form_id` INTEGER NOT NULL, |
172 | `category` SMALLINT NOT NULL | 171 | `category` SMALLINT NOT NULL, |
173 | ); | 172 | PRIMARY KEY (`lemma_id`,`category`,`form_id`) |
173 | ) WITHOUT ROWID; | ||
174 | 174 | ||
175 | CREATE INDEX `form_of` ON `lemmas_forms`(`lemma_id`,`category`,`form_id`); | 175 | CREATE INDEX `forms_lemmas` ON `lemmas_forms`(`form_id`,`category`,`lemma_id`); |
176 | CREATE INDEX `lemma_of` ON `lemmas_forms`(`form_id`,`category`,`lemma_id`); | ||
177 | 176 | ||
178 | CREATE TABLE `pronunciations` ( | 177 | CREATE TABLE `pronunciations` ( |
179 | `pronunciation_id` INTEGER PRIMARY KEY, | 178 | `pronunciation_id` INTEGER PRIMARY KEY, |
@@ -188,11 +187,11 @@ CREATE INDEX `rhymes_with` ON `pronunciations`(`rhyme`,`prerhyme`); | |||
188 | 187 | ||
189 | CREATE TABLE `forms_pronunciations` ( | 188 | CREATE TABLE `forms_pronunciations` ( |
190 | `form_id` INTEGER NOT NULL, | 189 | `form_id` INTEGER NOT NULL, |
191 | `pronunciation_id` INTEGER NOT NULL | 190 | `pronunciation_id` INTEGER NOT NULL, |
192 | ); | 191 | PRIMARY KEY (`form_id`,`pronunciation_id`) |
192 | ) WITHOUT ROWID; | ||
193 | 193 | ||
194 | CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`,`pronunciation_id`); | 194 | CREATE INDEX `pronunciations_forms` ON `forms_pronunciations`(`pronunciation_id`,`form_id`); |
195 | CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`,`form_id`); | ||
196 | 195 | ||
197 | CREATE TABLE `frames` ( | 196 | CREATE TABLE `frames` ( |
198 | `frame_id` INTEGER NOT NULL, | 197 | `frame_id` INTEGER NOT NULL, |