summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2018-09-26 21:40:44 -0400
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2018-09-26 21:40:44 -0400
commit: 3a8bfa95a5df04d97f05545d5bb8df5f3c3f96a3 (patch)
tree: e4478f57fe5e3faa57274d3b79c636ea657ab3a0
parent: a9188cbc6b3b9d26e675213e3834afdbd06296f6 (diff)
download: verbly-3a8bfa95a5df04d97f05545d5bb8df5f3c3f96a3.tar.gz
verbly-3a8bfa95a5df04d97f05545d5bb8df5f3c3f96a3.tar.bz2
verbly-3a8bfa95a5df04d97f05545d5bb8df5f3c3f96a3.zip
Removed unnecessary ROWIDs from database schema
The generator also now sorts and de-duplicates the WordNet files for antonymy, classification, and pertainymy/mannernymy, because those files contained duplicate rows, and the join tables without ROWIDs now enforce a uniqueness constraint.

This constitutes a minor database update -- the new database is compatible with d1.0 (database version 1.0), but is ~12MB smaller.

refs #6
-rw-r--r-- generator/generator.cpp 21
-rw-r--r-- generator/generator.h 2
-rw-r--r-- generator/schema.sql 147
-rw-r--r-- lib/version.h 2
4 files changed, 92 insertions, 80 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index e52aa90..0d073be 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -696,7 +696,8 @@ namespace verbly {
696 696
697 void generator::readWordNetAntonymy() 697 void generator::readWordNetAntonymy()
698 { 698 {
699 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl")); 699 std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl", true));
700
700 hatkirby::progress ppgs("Writing antonyms...", lines.size()); 701 hatkirby::progress ppgs("Writing antonyms...", lines.size());
701 for (auto line : lines) 702 for (auto line : lines)
702 { 703 {
@@ -770,7 +771,7 @@ namespace verbly {
770 771
771 void generator::readWordNetClasses() 772 void generator::readWordNetClasses()
772 { 773 {
773 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl")); 774 std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl", true));
774 775
775 hatkirby::progress ppgs( 776 hatkirby::progress ppgs(
776 "Writing usage, topicality, and regionality...", 777 "Writing usage, topicality, and regionality...",
@@ -1092,7 +1093,7 @@ namespace verbly {
1092 1093
1093 void generator::readWordNetPertainymy() 1094 void generator::readWordNetPertainymy()
1094 { 1095 {
1095 std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl")); 1096 std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl", true));
1096 1097
1097 hatkirby::progress ppgs( 1098 hatkirby::progress ppgs(
1098 "Writing pertainymy and mannernymy...", 1099 "Writing pertainymy and mannernymy...",
@@ -1228,7 +1229,7 @@ namespace verbly {
1228 db_.execute("ANALYZE"); 1229 db_.execute("ANALYZE");
1229 } 1230 }
1230 1231
1231 std::list<std::string> generator::readFile(std::string path) 1232 std::list<std::string> generator::readFile(std::string path, bool uniq)
1232 { 1233 {
1233 std::ifstream file(path); 1234 std::ifstream file(path);
1234 if (!file) 1235 if (!file)
@@ -1248,6 +1249,18 @@ namespace verbly {
1248 lines.push_back(line); 1249 lines.push_back(line);
1249 } 1250 }
1250 1251
1252 if (uniq)
1253 {
1254 std::vector<std::string> uniq(std::begin(lines), std::end(lines));
1255 lines.clear();
1256
1257 std::sort(std::begin(uniq), std::end(uniq));
1258 std::unique_copy(
1259 std::begin(uniq),
1260 std::end(uniq),
1261 std::back_inserter(lines));
1262 }
1263
1251 return lines; 1264 return lines;
1252 } 1265 }
1253 1266
diff --git a/generator/generator.h b/generator/generator.h index cd99f88..1547b7c 100644 --- a/generator/generator.h +++ b/generator/generator.h
@@ -94,7 +94,7 @@ namespace verbly {
94 94
95 // Helpers 95 // Helpers
96 96
97 std::list<std::string> readFile(std::string path); 97 std::list<std::string> readFile(std::string path, bool uniq = false);
98 98
99 inline part_of_speech partOfSpeechByWnid(int wnid); 99 inline part_of_speech partOfSpeechByWnid(int wnid);
100 100
diff --git a/generator/schema.sql b/generator/schema.sql index d97c06e..6a7d223 100644 --- a/generator/schema.sql +++ b/generator/schema.sql
@@ -14,79 +14,79 @@ CREATE UNIQUE INDEX `notion_by_wnid` ON `notions`(`wnid`);
14 14
15CREATE TABLE `hypernymy` ( 15CREATE TABLE `hypernymy` (
16 `hypernym_id` INTEGER NOT NULL, 16 `hypernym_id` INTEGER NOT NULL,
17 `hyponym_id` INTEGER NOT NULL 17 `hyponym_id` INTEGER NOT NULL,
18); 18 PRIMARY KEY (`hypernym_id`,`hyponym_id`)
19) WITHOUT ROWID;
19 20
20CREATE INDEX `hyponym_of` ON `hypernymy`(`hypernym_id`,`hyponym_id`); 21CREATE INDEX `reverse_hypernymy` ON `hypernymy`(`hyponym_id`,`hypernym_id`);
21CREATE INDEX `hypernym_of` ON `hypernymy`(`hyponym_id`,`hypernym_id`);
22 22
23CREATE TABLE `instantiation` ( 23CREATE TABLE `instantiation` (
24 `class_id` INTEGER NOT NULL, 24 `class_id` INTEGER NOT NULL,
25 `instance_id` INTEGER NOT NULL 25 `instance_id` INTEGER NOT NULL,
26); 26 PRIMARY KEY (`class_id`,`instance_id`)
27) WITHOUT ROWID;
27 28
28CREATE INDEX `instance_of` ON `instantiation`(`class_id`,`instance_id`); 29CREATE INDEX `reverse_instantiation` ON `instantiation`(`instance_id`,`class_id`);
29CREATE INDEX `class_of` ON `instantiation`(`instance_id`,`class_id`);
30 30
31CREATE TABLE `member_meronymy` ( 31CREATE TABLE `member_meronymy` (
32 `meronym_id` INTEGER NOT NULL, 32 `meronym_id` INTEGER NOT NULL,
33 `holonym_id` INTEGER NOT NULL 33 `holonym_id` INTEGER NOT NULL,
34); 34 PRIMARY KEY (`meronym_id`,`holonym_id`)
35) WITHOUT ROWID;
35 36
36CREATE INDEX `member_holonym_of` ON `member_meronymy`(`meronym_id`,`holonym_id`); 37CREATE INDEX `reverse_member_meronymy` ON `member_meronymy`(`holonym_id`,`meronym_id`);
37CREATE INDEX `member_meronym_of` ON `member_meronymy`(`holonym_id`,`meronym_id`);
38 38
39CREATE TABLE `part_meronymy` ( 39CREATE TABLE `part_meronymy` (
40 `meronym_id` INTEGER NOT NULL, 40 `meronym_id` INTEGER NOT NULL,
41 `holonym_id` INTEGER NOT NULL 41 `holonym_id` INTEGER NOT NULL,
42); 42 PRIMARY KEY (`meronym_id`,`holonym_id`)
43) WITHOUT ROWID;
43 44
44CREATE INDEX `part_holonym_of` ON `part_meronymy`(`meronym_id`,`holonym_id`); 45CREATE INDEX `reverse_part_meronymy` ON `part_meronymy`(`holonym_id`,`meronym_id`);
45CREATE INDEX `part_meronym_of` ON `part_meronymy`(`holonym_id`,`meronym_id`);
46 46
47CREATE TABLE `substance_meronymy` ( 47CREATE TABLE `substance_meronymy` (
48 `meronym_id` INTEGER NOT NULL, 48 `meronym_id` INTEGER NOT NULL,
49 `holonym_id` INTEGER NOT NULL 49 `holonym_id` INTEGER NOT NULL,
50); 50 PRIMARY KEY (`meronym_id`,`holonym_id`)
51) WITHOUT ROWID;
51 52
52CREATE INDEX `substance_holonym_of` ON `substance_meronymy`(`meronym_id`,`holonym_id`); 53CREATE INDEX `reverse_substance_meronymy` ON `substance_meronymy`(`holonym_id`,`meronym_id`);
53CREATE INDEX `substance_meronym_of` ON `substance_meronymy`(`holonym_id`,`meronym_id`);
54 54
55CREATE TABLE `variation` ( 55CREATE TABLE `variation` (
56 `noun_id` INTEGER NOT NULL, 56 `noun_id` INTEGER NOT NULL,
57 `adjective_id` INTEGER NOT NULL 57 `adjective_id` INTEGER NOT NULL,
58); 58 PRIMARY KEY (`noun_id`,`adjective_id`)
59) WITHOUT ROWID;
59 60
60CREATE INDEX `variant_of` ON `variation`(`noun_id`,`adjective_id`); 61CREATE INDEX `reverse_variation` ON `variation`(`adjective_id`,`noun_id`);
61CREATE INDEX `attribute_of` ON `variation`(`adjective_id`,`noun_id`);
62 62
63CREATE TABLE `similarity` ( 63CREATE TABLE `similarity` (
64 `adjective_1_id` INTEGER NOT NULL, 64 `adjective_1_id` INTEGER NOT NULL,
65 `adjective_2_id` INTEGER NOT NULL 65 `adjective_2_id` INTEGER NOT NULL,
66); 66 PRIMARY KEY (`adjective_1_id`,`adjective_2_id`)
67 67) WITHOUT ROWID;
68CREATE INDEX `similar_to` ON `similarity`(`adjective_1_id`,`adjective_2_id`);
69 68
70CREATE TABLE `is_a` ( 69CREATE TABLE `is_a` (
71 `notion_id` INTEGER NOT NULL, 70 `notion_id` INTEGER NOT NULL,
72 `groupname` VARCHAR(32) NOT NULL 71 `groupname` VARCHAR(32) NOT NULL,
73); 72 PRIMARY KEY (`notion_id`,`groupname`)
73) WITHOUT ROWID;
74 74
75CREATE TABLE `entailment` ( 75CREATE TABLE `entailment` (
76 `given_id` INTEGER NOT NULL, 76 `given_id` INTEGER NOT NULL,
77 `entailment_id` INTEGER NOT NULL 77 `entailment_id` INTEGER NOT NULL,
78); 78 PRIMARY KEY (`given_id`,`entailment_id`)
79) WITHOUT ROWID;
79 80
80CREATE INDEX `entailment_of` ON `entailment`(`given_id`,`entailment_id`); 81CREATE INDEX `reverse_entailment` ON `entailment`(`entailment_id`,`given_id`);
81CREATE INDEX `entailed_by` ON `entailment`(`entailment_id`,`given_id`);
82 82
83CREATE TABLE `causality` ( 83CREATE TABLE `causality` (
84 `cause_id` INTEGER NOT NULL, 84 `cause_id` INTEGER NOT NULL,
85 `effect_id` INTEGER NOT NULL 85 `effect_id` INTEGER NOT NULL,
86); 86 PRIMARY KEY (`cause_id`,`effect_id`)
87) WITHOUT ROWID;
87 88
88CREATE INDEX `effect_of` ON `causality`(`cause_id`,`effect_id`); 89CREATE INDEX `reverse_causality` ON `causality`(`effect_id`,`cause_id`);
89CREATE INDEX `cause_of` ON `causality`(`effect_id`,`cause_id`);
90 90
91CREATE TABLE `words` ( 91CREATE TABLE `words` (
92 `word_id` INTEGER PRIMARY KEY, 92 `word_id` INTEGER PRIMARY KEY,
@@ -103,58 +103,57 @@ CREATE INDEX `group_words` ON `words`(`group_id`);
103 103
104CREATE TABLE `antonymy` ( 104CREATE TABLE `antonymy` (
105 `antonym_1_id` INTEGER NOT NULL, 105 `antonym_1_id` INTEGER NOT NULL,
106 `antonym_2_id` INTEGER NOT NULL 106 `antonym_2_id` INTEGER NOT NULL,
107); 107 PRIMARY KEY (`antonym_1_id`,`antonym_2_id`)
108 108) WITHOUT ROWID;
109CREATE INDEX `antonym_of` ON `antonymy`(`antonym_1_id`,`antonym_2_id`);
110 109
111CREATE TABLE `specification` ( 110CREATE TABLE `specification` (
112 `general_id` INTEGER NOT NULL, 111 `general_id` INTEGER NOT NULL,
113 `specific_id` INTEGER NOT NULL 112 `specific_id` INTEGER NOT NULL,
114); 113 PRIMARY KEY (`general_id`,`specific_id`)
114) WITHOUT ROWID;
115 115
116CREATE INDEX `specification_of` ON `specification`(`general_id`,`specific_id`); 116CREATE INDEX `reverse_specification` ON `specification`(`specific_id`,`general_id`);
117CREATE INDEX `generalization_of` ON `specification`(`specific_id`,`general_id`);
118 117
119CREATE TABLE `pertainymy` ( 118CREATE TABLE `pertainymy` (
120 `noun_id` INTEGER NOT NULL, 119 `noun_id` INTEGER NOT NULL,
121 `pertainym_id` INTEGER NOT NULL 120 `pertainym_id` INTEGER NOT NULL,
122); 121 PRIMARY KEY (`noun_id`,`pertainym_id`)
122) WITHOUT ROWID;
123 123
124CREATE INDEX `pertainym_of` ON `pertainymy`(`noun_id`,`pertainym_id`); 124CREATE INDEX `reverse_pertainymy` ON `pertainymy`(`pertainym_id`,`noun_id`);
125CREATE INDEX `anti_pertainym_of` ON `pertainymy`(`pertainym_id`,`noun_id`);
126 125
127CREATE TABLE `mannernymy` ( 126CREATE TABLE `mannernymy` (
128 `adjective_id` INTEGER NOT NULL, 127 `adjective_id` INTEGER NOT NULL,
129 `mannernym_id` INTEGER NOT NULL 128 `mannernym_id` INTEGER NOT NULL,
130); 129 PRIMARY KEY (`adjective_id`,`mannernym_id`)
130) WITHOUT ROWID;
131 131
132CREATE INDEX `mannernym_of` ON `mannernymy`(`adjective_id`,`mannernym_id`); 132CREATE INDEX `reverse_mannernymy` ON `mannernymy`(`mannernym_id`,`adjective_id`);
133CREATE INDEX `anti_mannernym_of` ON `mannernymy`(`mannernym_id`,`adjective_id`);
134 133
135CREATE TABLE `usage` ( 134CREATE TABLE `usage` (
136 `domain_id` INTEGER NOT NULL, 135 `domain_id` INTEGER NOT NULL,
137 `term_id` INTEGER NOT NULL 136 `term_id` INTEGER NOT NULL,
138); 137 PRIMARY KEY (`domain_id`,`term_id`)
138) WITHOUT ROWID;
139 139
140CREATE INDEX `usage_term_of` ON `usage`(`domain_id`,`term_id`); 140CREATE INDEX `reverse_usage` ON `usage`(`term_id`,`domain_id`);
141CREATE INDEX `usage_domain_of` ON `usage`(`term_id`,`domain_id`);
142 141
143CREATE TABLE `topicality` ( 142CREATE TABLE `topicality` (
144 `domain_id` INTEGER NOT NULL, 143 `domain_id` INTEGER NOT NULL,
145 `term_id` INTEGER NOT NULL 144 `term_id` INTEGER NOT NULL,
146); 145 PRIMARY KEY (`domain_id`,`term_id`)
146) WITHOUT ROWID;
147 147
148CREATE INDEX `topical_term_of` ON `topicality`(`domain_id`,`term_id`); 148CREATE INDEX `reverse_topicality` ON `topicality`(`term_id`,`domain_id`);
149CREATE INDEX `topical_domain_of` ON `topicality`(`term_id`,`domain_id`);
150 149
151CREATE TABLE `regionality` ( 150CREATE TABLE `regionality` (
152 `domain_id` INTEGER NOT NULL, 151 `domain_id` INTEGER NOT NULL,
153 `term_id` INTEGER NOT NULL 152 `term_id` INTEGER NOT NULL,
154); 153 PRIMARY KEY (`domain_id`,`term_id`)
154) WITHOUT ROWID;
155 155
156CREATE INDEX `regional_term_of` ON `regionality`(`domain_id`,`term_id`); 156CREATE INDEX `reverse_regionality` ON `regionality`(`term_id`,`domain_id`);
157CREATE INDEX `regional_domain_of` ON `regionality`(`term_id`,`domain_id`);
158 157
159CREATE TABLE `forms` ( 158CREATE TABLE `forms` (
160 `form_id` INTEGER PRIMARY KEY, 159 `form_id` INTEGER PRIMARY KEY,
@@ -169,11 +168,11 @@ CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`);
169CREATE TABLE `lemmas_forms` ( 168CREATE TABLE `lemmas_forms` (
170 `lemma_id` INTEGER NOT NULL, 169 `lemma_id` INTEGER NOT NULL,
171 `form_id` INTEGER NOT NULL, 170 `form_id` INTEGER NOT NULL,
172 `category` SMALLINT NOT NULL 171 `category` SMALLINT NOT NULL,
173); 172 PRIMARY KEY (`lemma_id`,`category`,`form_id`)
173) WITHOUT ROWID;
174 174
175CREATE INDEX `form_of` ON `lemmas_forms`(`lemma_id`,`category`,`form_id`); 175CREATE INDEX `forms_lemmas` ON `lemmas_forms`(`form_id`,`category`,`lemma_id`);
176CREATE INDEX `lemma_of` ON `lemmas_forms`(`form_id`,`category`,`lemma_id`);
177 176
178CREATE TABLE `pronunciations` ( 177CREATE TABLE `pronunciations` (
179 `pronunciation_id` INTEGER PRIMARY KEY, 178 `pronunciation_id` INTEGER PRIMARY KEY,
@@ -188,11 +187,11 @@ CREATE INDEX `rhymes_with` ON `pronunciations`(`rhyme`,`prerhyme`);
188 187
189CREATE TABLE `forms_pronunciations` ( 188CREATE TABLE `forms_pronunciations` (
190 `form_id` INTEGER NOT NULL, 189 `form_id` INTEGER NOT NULL,
191 `pronunciation_id` INTEGER NOT NULL 190 `pronunciation_id` INTEGER NOT NULL,
192); 191 PRIMARY KEY (`form_id`,`pronunciation_id`)
192) WITHOUT ROWID;
193 193
194CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`,`pronunciation_id`); 194CREATE INDEX `pronunciations_forms` ON `forms_pronunciations`(`pronunciation_id`,`form_id`);
195CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`,`form_id`);
196 195
197CREATE TABLE `frames` ( 196CREATE TABLE `frames` (
198 `frame_id` INTEGER NOT NULL, 197 `frame_id` INTEGER NOT NULL,
diff --git a/lib/version.h b/lib/version.h index 41ab79e..0404f5f 100644 --- a/lib/version.h +++ b/lib/version.h
@@ -4,7 +4,7 @@
4namespace verbly { 4namespace verbly {
5 5
6 const int DATABASE_MAJOR_VERSION = 1; 6 const int DATABASE_MAJOR_VERSION = 1;
7 const int DATABASE_MINOR_VERSION = 0; 7 const int DATABASE_MINOR_VERSION = 1;
8 8
9}; 9};
10 10