summary refs log tree commit diff stats
path: root/generator
diff options
context:
space:
mode:
Diffstat (limited to 'generator')
-rw-r--r--generator/CMakeLists.txt2
-rw-r--r--generator/form.cpp9
-rw-r--r--generator/form.h19
-rw-r--r--generator/generator.cpp270
-rw-r--r--generator/generator.h11
-rw-r--r--generator/pronunciation.cpp7
-rw-r--r--generator/pronunciation.h19
-rw-r--r--generator/schema.sql24
8 files changed, 347 insertions, 14 deletions
diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt index 8c070d2..5d2f977 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt
@@ -11,6 +11,6 @@ include_directories(
11 ../vendor/hkutil) 11 ../vendor/hkutil)
12 12
13add_executable(generator notion.cpp word.cpp lemma.cpp form.cpp pronunciation.cpp group.cpp frame.cpp part.cpp generator.cpp main.cpp) 13add_executable(generator notion.cpp word.cpp lemma.cpp form.cpp pronunciation.cpp group.cpp frame.cpp part.cpp generator.cpp main.cpp)
14set_property(TARGET generator PROPERTY CXX_STANDARD 11) 14set_property(TARGET generator PROPERTY CXX_STANDARD 17)
15set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) 15set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON)
16target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) 16target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES})
diff --git a/generator/form.cpp b/generator/form.cpp index c66820c..a88363b 100644 --- a/generator/form.cpp +++ b/generator/form.cpp
@@ -9,12 +9,13 @@ namespace verbly {
9 9
10 int form::nextId_ = 0; 10 int form::nextId_ = 0;
11 11
12 form::form(std::string text) : 12 form::form(std::string text, int anagram_set_id) :
13 id_(nextId_++), 13 id_(nextId_++),
14 text_(text), 14 text_(text),
15 complexity_(std::count(std::begin(text), std::end(text), ' ') + 1), 15 complexity_(std::count(std::begin(text), std::end(text), ' ') + 1),
16 proper_(std::any_of(std::begin(text), std::end(text), ::isupper)), 16 proper_(std::any_of(std::begin(text), std::end(text), ::isupper)),
17 length_(text.length()) 17 length_(text.length()),
18 anagram_set_id_(anagram_set_id)
18 { 19 {
19 } 20 }
20 21
@@ -34,7 +35,9 @@ namespace verbly {
34 { "form", arg.getText() }, 35 { "form", arg.getText() },
35 { "complexity", arg.getComplexity() }, 36 { "complexity", arg.getComplexity() },
36 { "proper", arg.isProper() }, 37 { "proper", arg.isProper() },
37 { "length", arg.getLength() } 38 { "length", arg.getLength() },
39 { "anagram_set_id", arg.getAnagramSetId() },
40 { "reverse_form_id", arg.getReverseId() }
38 }); 41 });
39 } 42 }
40 43
diff --git a/generator/form.h b/generator/form.h index f3dd779..c83bbdc 100644 --- a/generator/form.h +++ b/generator/form.h
@@ -15,7 +15,7 @@ namespace verbly {
15 15
16 // Constructor 16 // Constructor
17 17
18 explicit form(std::string text); 18 form(std::string text, int anagram_set_id);
19 19
20 // Mutators 20 // Mutators
21 21
@@ -48,6 +48,21 @@ namespace verbly {
48 return length_; 48 return length_;
49 } 49 }
50 50
51 int getAnagramSetId() const
52 {
53 return anagram_set_id_;
54 }
55
56 void setReverseId(int id)
57 {
58 reverse_id_ = id;
59 }
60
61 int getReverseId() const
62 {
63 return reverse_id_;
64 }
65
51 std::set<const pronunciation*> getPronunciations() const 66 std::set<const pronunciation*> getPronunciations() const
52 { 67 {
53 return pronunciations_; 68 return pronunciations_;
@@ -62,6 +77,8 @@ namespace verbly {
62 const int complexity_; 77 const int complexity_;
63 const bool proper_; 78 const bool proper_;
64 const int length_; 79 const int length_;
80 const int anagram_set_id_;
81 int reverse_id_ = -1;
65 82
66 std::set<const pronunciation*> pronunciations_; 83 std::set<const pronunciation*> pronunciations_;
67 84
diff --git a/generator/generator.cpp b/generator/generator.cpp index ad665a2..fdea10f 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -6,6 +6,9 @@
6#include <fstream> 6#include <fstream>
7#include <hkutil/string.h> 7#include <hkutil/string.h>
8#include <hkutil/progress.h> 8#include <hkutil/progress.h>
9#include <array>
10#include <mutex>
11#include <thread>
9#include "role.h" 12#include "role.h"
10#include "part.h" 13#include "part.h"
11#include "../lib/enums.h" 14#include "../lib/enums.h"
@@ -83,7 +86,7 @@ namespace verbly {
83 readAdjectivePositioning(); 86 readAdjectivePositioning();
84 87
85 // Counts the number of URLs ImageNet has per notion 88 // Counts the number of URLs ImageNet has per notion
86 readImageNetUrls(); 89 //readImageNetUrls();
87 90
88 // Creates a word by WordNet sense key lookup table 91 // Creates a word by WordNet sense key lookup table
89 readWordNetSenseKeys(); 92 readWordNetSenseKeys();
@@ -115,9 +118,17 @@ namespace verbly {
115 // Writes the database version 118 // Writes the database version
116 writeVersion(); 119 writeVersion();
117 120
121 // Calculates and writes form merography
122 writeMerography();
123
124 // Calculates and writes pronunciation merophony
125 writeMerophony();
126
118 // Dumps data to the database 127 // Dumps data to the database
119 dumpObjects(); 128 dumpObjects();
120 129
130
131
121 // Populates the antonymy relationship from WordNet 132 // Populates the antonymy relationship from WordNet
122 readWordNetAntonymy(); 133 readWordNetAntonymy();
123 134
@@ -577,9 +588,29 @@ namespace verbly {
577 pronunciation& p = *pronunciationByPhonemes_[phonemes]; 588 pronunciation& p = *pronunciationByPhonemes_[phonemes];
578 formByText_.at(canonical)->addPronunciation(p); 589 formByText_.at(canonical)->addPronunciation(p);
579 } else { 590 } else {
580 pronunciations_.emplace_back(phonemes); 591 std::string stressless;
592 for (int i=0; i<phonemes.size(); i++) {
593 if (!std::isdigit(phonemes[i])) {
594 stressless.push_back(phonemes[i]);
595 }
596 }
597 auto stresslessList = hatkirby::split<std::vector<std::string>>(stressless, " ");
598 std::string stresslessPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
599 std::sort(stresslessList.begin(), stresslessList.end());
600 std::string sortedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
601
602 int anaphoneSetId;
603 if (anaphoneSets_.count(sortedPhonemes)) {
604 anaphoneSetId = anaphoneSets_[sortedPhonemes];
605 } else {
606 anaphoneSetId = anaphoneSets_.size();
607 anaphoneSets_[sortedPhonemes] = anaphoneSetId;
608 }
609
610 pronunciations_.emplace_back(phonemes, anaphoneSetId);
581 pronunciation& p = pronunciations_.back(); 611 pronunciation& p = pronunciations_.back();
582 pronunciationByPhonemes_[phonemes] = &p; 612 pronunciationByPhonemes_[phonemes] = &p;
613 pronunciationByBlankPhonemes_[stresslessPhonemes] = &p;
583 formByText_.at(canonical)->addPronunciation(p); 614 formByText_.at(canonical)->addPronunciation(p);
584 } 615 }
585 } 616 }
@@ -671,6 +702,12 @@ namespace verbly {
671 702
672 for (form& f : forms_) 703 for (form& f : forms_)
673 { 704 {
705 std::string reverseText = f.getText();
706 std::reverse(reverseText.begin(), reverseText.end());
707 if (formByText_.count(reverseText)) {
708 f.setReverseId(formByText_[reverseText]->getId());
709 }
710
674 db_ << f; 711 db_ << f;
675 712
676 ppgs.update(); 713 ppgs.update();
@@ -682,6 +719,19 @@ namespace verbly {
682 719
683 for (pronunciation& p : pronunciations_) 720 for (pronunciation& p : pronunciations_)
684 { 721 {
722 std::string stressless;
723 for (int i=0; i<p.getPhonemes().size(); i++) {
724 if (!std::isdigit(p.getPhonemes()[i])) {
725 stressless.push_back(p.getPhonemes()[i]);
726 }
727 }
728 auto stresslessList = hatkirby::split<std::vector<std::string>>(stressless, " ");
729 std::reverse(stresslessList.begin(), stresslessList.end());
730 std::string reversedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
731 if (pronunciationByBlankPhonemes_.count(reversedPhonemes)) {
732 p.setReverseId(pronunciationByBlankPhonemes_[reversedPhonemes]->getId());
733 }
734
685 db_ << p; 735 db_ << p;
686 736
687 ppgs.update(); 737 ppgs.update();
@@ -698,6 +748,208 @@ namespace verbly {
698 ppgs.update(); 748 ppgs.update();
699 } 749 }
700 } 750 }
751
752 /*{
753 hatkirby::progress ppgs("Writing merography...", formByText_.size());
754
755 for (const auto& [merotext, meroform] : formByText_)
756 {
757 for (const auto& [holotext, holoform] : formByText_)
758 {
759 if (isMero(merotext, holotext))
760 {
761 db_.insertIntoTable(
762 "merography",
763 {
764 { "merograph_id", meroform->getId() },
765 { "holograph_id", holoform->getId() }
766 });
767 }
768 }
769
770 ppgs.update();
771 }
772 }
773
774 {
775 hatkirby::progress ppgs("Writing merophony...", pronunciationByBlankPhonemes_.size());
776
777 for (const auto& [merotext, merop] : pronunciationByBlankPhonemes_)
778 {
779 auto merophonemes = hatkirby::split<std::list<std::string>>(merotext, " ");
780
781 for (const auto& [holotext, holop] : pronunciationByBlankPhonemes_)
782 {
783 auto holophonemes = hatkirby::split<std::list<std::string>>(holotext, " ");
784
785 if (isMero(merophonemes, holophonemes))
786 {
787 db_.insertIntoTable(
788 "merophony",
789 {
790 { "merophone_id", merop->getId() },
791 { "holophone_id", holop->getId() }
792 });
793 }
794 }
795
796 ppgs.update();
797 }
798 }*/
799 }
800
801 void generator::writeMerography()
802 {
803 hatkirby::progress ppgs("Writing merography...", formByText_.size());
804 for (const auto& [text, form] : formByText_)
805 {
806 ppgs.update();
807
808 std::unordered_set<std::string> visited;
809 for (int i=0; i<text.size(); i++)
810 {
811 for (int l=3; l<text.size()-i; l++)
812 {
813 if (i==0 && l == text.size())
814 {
815 continue;
816 }
817
818 std::string substr = text.substr(i, l);
819 if (formByText_.count(substr) && !visited.count(substr))
820 {
821 visited.insert(substr);
822 db_.insertIntoTable(
823 "merography",
824 {
825 { "merograph_id", formByText_[substr]->getId() },
826 { "holograph_id", form->getId() }
827 });
828 }
829 }
830 }
831
832
833 /*
834 std::string front = text;
835 while (front.size() > 2)
836 {
837 front.erase(0, 1);
838
839 if (formByText_.count(front))
840 {
841 visited.insert(front);
842 db_.insertIntoTable(
843 "merography",
844 {
845 { "merograph_id", formByText_[front]->getId() },
846 { "holograph_id", form->getId() }
847 });
848 }
849 }
850
851 if (text.size() > 2)
852 {
853 std::string back = text;
854
855 while (back.size() > 2)
856 {
857 back.pop_back();
858
859 if (formByText_.count(back) && !visited.count(back))
860 {
861 db_.insertIntoTable(
862 "merography",
863 {
864 { "merograph_id", formByText_[back]->getId() },
865 { "holograph_id", form->getId() }
866 });
867 }
868 }
869 }*/
870 }
871 }
872
873 void generator::writeMerophony()
874 {
875 std::map<std::list<std::string>, pronunciation*> tokenized;
876 for (const auto& [phonemes, pronunciation] : pronunciationByBlankPhonemes_)
877 {
878 tokenized[hatkirby::split<std::list<std::string>>(phonemes, " ")] = pronunciation;
879 }
880
881 hatkirby::progress ppgs("Writing merophony...", tokenized.size());
882 for (const auto& [phonemes, pronunciation] : tokenized)
883 {
884 ppgs.update();
885
886 std::set<std::list<std::string>> visited;
887 for (int i=0; i<phonemes.size(); i++)
888 {
889 for (int l=2; l<phonemes.size()-i; l++)
890 {
891 if (i==0 && l == phonemes.size())
892 {
893 continue;
894 }
895
896 std::list<std::string> sublist;
897 for (auto j=std::next(phonemes.begin(),i); j!=std::next(phonemes.begin(),i+l); j++)
898 {
899 sublist.push_back(*j);
900 }
901
902 if (tokenized.count(sublist) && !visited.count(sublist))
903 {
904 visited.insert(sublist);
905 db_.insertIntoTable(
906 "merophony",
907 {
908 { "merophone_id", tokenized[sublist]->getId() },
909 { "holophone_id", pronunciation->getId() }
910 });
911 }
912 }
913 }
914 /*std::list<std::string> front = phonemes;
915 while (front.size() > 1)
916 {
917 front.pop_front();
918
919 if (tokenized.count(front))
920 {
921 visited.insert(front);
922 db_.insertIntoTable(
923 "merophony",
924 {
925 { "merophone_id", tokenized[front]->getId() },
926 { "holophone_id", pronunciation->getId() }
927 });
928 break;
929 }
930 }
931
932 if (phonemes.size() > 1)
933 {
934 std::list<std::string> back = phonemes;
935
936 while (back.size() > 1)
937 {
938 back.pop_back();
939
940 if (tokenized.count(back) && !visited.count(back))
941 {
942 db_.insertIntoTable(
943 "merophony",
944 {
945 { "merophone_id", tokenized[back]->getId() },
946 { "holophone_id", pronunciation->getId() }
947 });
948 break;
949 }
950 }
951 }*/
952 }
701 } 953 }
702 954
703 void generator::readWordNetAntonymy() 955 void generator::readWordNetAntonymy()
@@ -1316,7 +1568,19 @@ namespace verbly {
1316 { 1568 {
1317 if (!formByText_.count(text)) 1569 if (!formByText_.count(text))
1318 { 1570 {
1319 forms_.emplace_back(text); 1571 std::string sortedText = text;
1572 std::sort(sortedText.begin(), sortedText.end());
1573
1574 int anagramSetId;
1575 if (anagramSets_.count(sortedText))
1576 {
1577 anagramSetId = anagramSets_[sortedText];
1578 } else {
1579 anagramSetId = anagramSets_.size();
1580 anagramSets_[sortedText] = anagramSetId;
1581 }
1582
1583 forms_.emplace_back(text, anagramSetId);
1320 formByText_[text] = &forms_.back(); 1584 formByText_[text] = &forms_.back();
1321 } 1585 }
1322 1586
diff --git a/generator/generator.h b/generator/generator.h index 2cd2ba9..70f0657 100644 --- a/generator/generator.h +++ b/generator/generator.h
@@ -3,6 +3,8 @@
3 3
4#include <string> 4#include <string>
5#include <map> 5#include <map>
6#include <unordered_map>
7#include <unordered_set>
6#include <list> 8#include <list>
7#include <set> 9#include <set>
8#include <libxml/parser.h> 10#include <libxml/parser.h>
@@ -64,6 +66,10 @@ namespace verbly {
64 66
65 void dumpObjects(); 67 void dumpObjects();
66 68
69 void writeMerography();
70
71 void writeMerophony();
72
67 void readWordNetAntonymy(); 73 void readWordNetAntonymy();
68 74
69 void readWordNetVariation(); 75 void readWordNetVariation();
@@ -138,8 +144,11 @@ namespace verbly {
138 std::map<std::pair<int, int>, word*> wordByWnidAndWnum_; 144 std::map<std::pair<int, int>, word*> wordByWnidAndWnum_;
139 std::map<std::string, std::set<word*>> wordsByBaseForm_; 145 std::map<std::string, std::set<word*>> wordsByBaseForm_;
140 std::map<std::string, lemma*> lemmaByBaseForm_; 146 std::map<std::string, lemma*> lemmaByBaseForm_;
141 std::map<std::string, form*> formByText_; 147 std::unordered_map<std::string, form*> formByText_;
142 std::map<std::string, pronunciation*> pronunciationByPhonemes_; 148 std::map<std::string, pronunciation*> pronunciationByPhonemes_;
149 std::unordered_map<std::string, pronunciation*> pronunciationByBlankPhonemes_;
150 std::map<std::string, int> anagramSets_;
151 std::map<std::string, int> anaphoneSets_;
143 152
144 // Caches 153 // Caches
145 154
diff --git a/generator/pronunciation.cpp b/generator/pronunciation.cpp index 3075d42..5c4d8e2 100644 --- a/generator/pronunciation.cpp +++ b/generator/pronunciation.cpp
@@ -10,9 +10,10 @@ namespace verbly {
10 10
11 int pronunciation::nextId_ = 0; 11 int pronunciation::nextId_ = 0;
12 12
13 pronunciation::pronunciation(std::string phonemes) : 13 pronunciation::pronunciation(std::string phonemes, int anaphone_set_id) :
14 id_(nextId_++), 14 id_(nextId_++),
15 phonemes_(phonemes) 15 phonemes_(phonemes),
16 anaphone_set_id_(anaphone_set_id)
16 { 17 {
17 auto phonemeList = 18 auto phonemeList =
18 hatkirby::split<std::list<std::string>>(phonemes, " "); 19 hatkirby::split<std::list<std::string>>(phonemes, " ");
@@ -88,6 +89,8 @@ namespace verbly {
88 fields.emplace_back("phonemes", arg.getPhonemes()); 89 fields.emplace_back("phonemes", arg.getPhonemes());
89 fields.emplace_back("syllables", arg.getSyllables()); 90 fields.emplace_back("syllables", arg.getSyllables());
90 fields.emplace_back("stress", arg.getStress()); 91 fields.emplace_back("stress", arg.getStress());
92 fields.emplace_back("anaphone_set_id", arg.getAnaphoneSetId());
93 fields.emplace_back("reverse_pronunciation_id", arg.getReverseId());
91 94
92 if (arg.hasRhyme()) 95 if (arg.hasRhyme())
93 { 96 {
diff --git a/generator/pronunciation.h b/generator/pronunciation.h index 3190e6d..e6dc4b4 100644 --- a/generator/pronunciation.h +++ b/generator/pronunciation.h
@@ -13,7 +13,7 @@ namespace verbly {
13 13
14 // Constructor 14 // Constructor
15 15
16 explicit pronunciation(std::string phonemes); 16 pronunciation(std::string phonemes, int anaphone_set_id);
17 17
18 // Accessors 18 // Accessors
19 19
@@ -62,6 +62,21 @@ namespace verbly {
62 return stress_; 62 return stress_;
63 } 63 }
64 64
65 int getAnaphoneSetId() const
66 {
67 return anaphone_set_id_;
68 }
69
70 void setReverseId(int id)
71 {
72 reverse_id_ = id;
73 }
74
75 int getReverseId() const
76 {
77 return reverse_id_;
78 }
79
65 private: 80 private:
66 81
67 static int nextId_; 82 static int nextId_;
@@ -72,6 +87,8 @@ namespace verbly {
72 std::string prerhyme_; 87 std::string prerhyme_;
73 int syllables_ = 0; 88 int syllables_ = 0;
74 std::string stress_; 89 std::string stress_;
90 int anaphone_set_id_;
91 int reverse_id_ = -1;
75 92
76 }; 93 };
77 94
diff --git a/generator/schema.sql b/generator/schema.sql index 6a7d223..8c910f4 100644 --- a/generator/schema.sql +++ b/generator/schema.sql
@@ -160,7 +160,9 @@ CREATE TABLE `forms` (
160 `form` VARCHAR(32) NOT NULL, 160 `form` VARCHAR(32) NOT NULL,
161 `complexity` SMALLINT NOT NULL, 161 `complexity` SMALLINT NOT NULL,
162 `proper` SMALLINT NOT NULL, 162 `proper` SMALLINT NOT NULL,
163 `length` SMALLINT NOT NULL 163 `length` SMALLINT NOT NULL,
164 `anagram_set_id` INTEGER NOT NULL,
165 `reverse_form_id` INTEGER NOT NULL
164); 166);
165 167
166CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); 168CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`);
@@ -174,13 +176,23 @@ CREATE TABLE `lemmas_forms` (
174 176
175CREATE INDEX `forms_lemmas` ON `lemmas_forms`(`form_id`,`category`,`lemma_id`); 177CREATE INDEX `forms_lemmas` ON `lemmas_forms`(`form_id`,`category`,`lemma_id`);
176 178
179CREATE TABLE `merography` (
180 `merograph_id` INTEGER NOT NULL,
181 `holograph_id` INTEGER NOT NULL,
182 PRIMARY KEY(`merograph_id`,`holograph_id`)
183) WITHOUT ROWID;
184
185CREATE INDEX `reverse_merography` ON `merography`(`holograph_id`,`merograph_id`);
186
177CREATE TABLE `pronunciations` ( 187CREATE TABLE `pronunciations` (
178 `pronunciation_id` INTEGER PRIMARY KEY, 188 `pronunciation_id` INTEGER PRIMARY KEY,
179 `phonemes` VARCHAR(64) NOT NULL, 189 `phonemes` VARCHAR(64) NOT NULL,
180 `prerhyme` VARCHAR(8), 190 `prerhyme` VARCHAR(8),
181 `rhyme` VARCHAR(64), 191 `rhyme` VARCHAR(64),
182 `syllables` INTEGER NOT NULL, 192 `syllables` INTEGER NOT NULL,
183 `stress` VARCHAR(64) NOT NULL 193 `stress` VARCHAR(64) NOT NULL,
194 `anaphone_set_id` INTEGER NOT NULL,
195 `reverse_pronunciation_id` INTEGER NOT NULL
184); 196);
185 197
186CREATE INDEX `rhymes_with` ON `pronunciations`(`rhyme`,`prerhyme`); 198CREATE INDEX `rhymes_with` ON `pronunciations`(`rhyme`,`prerhyme`);
@@ -193,6 +205,14 @@ CREATE TABLE `forms_pronunciations` (
193 205
194CREATE INDEX `pronunciations_forms` ON `forms_pronunciations`(`pronunciation_id`,`form_id`); 206CREATE INDEX `pronunciations_forms` ON `forms_pronunciations`(`pronunciation_id`,`form_id`);
195 207
208CREATE TABLE `merophony` (
209 `merophone_id` INTEGER NOT NULL,
210 `holophone_id` INTEGER NOT NULL,
211 PRIMARY KEY(`merophone_id`,`holophone_id`)
212) WITHOUT ROWID;
213
214CREATE INDEX `reverse_merophony` ON `merophony`(`holophone_id`,`merophone_id`);
215
196CREATE TABLE `frames` ( 216CREATE TABLE `frames` (
197 `frame_id` INTEGER NOT NULL, 217 `frame_id` INTEGER NOT NULL,
198 `group_id` INTEGER NOT NULL, 218 `group_id` INTEGER NOT NULL,