diff options
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- | generator/generator.cpp | 270 |
1 files changed, 267 insertions, 3 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index ad665a2..fdea10f 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -6,6 +6,9 @@ | |||
6 | #include <fstream> | 6 | #include <fstream> |
7 | #include <hkutil/string.h> | 7 | #include <hkutil/string.h> |
8 | #include <hkutil/progress.h> | 8 | #include <hkutil/progress.h> |
9 | #include <array> | ||
10 | #include <mutex> | ||
11 | #include <thread> | ||
9 | #include "role.h" | 12 | #include "role.h" |
10 | #include "part.h" | 13 | #include "part.h" |
11 | #include "../lib/enums.h" | 14 | #include "../lib/enums.h" |
@@ -83,7 +86,7 @@ namespace verbly { | |||
83 | readAdjectivePositioning(); | 86 | readAdjectivePositioning(); |
84 | 87 | ||
85 | // Counts the number of URLs ImageNet has per notion | 88 | // Counts the number of URLs ImageNet has per notion |
86 | readImageNetUrls(); | 89 | //readImageNetUrls(); |
87 | 90 | ||
88 | // Creates a word by WordNet sense key lookup table | 91 | // Creates a word by WordNet sense key lookup table |
89 | readWordNetSenseKeys(); | 92 | readWordNetSenseKeys(); |
@@ -115,9 +118,17 @@ namespace verbly { | |||
115 | // Writes the database version | 118 | // Writes the database version |
116 | writeVersion(); | 119 | writeVersion(); |
117 | 120 | ||
121 | // Calculates and writes form merography | ||
122 | writeMerography(); | ||
123 | |||
124 | // Calculates and writes pronunciation merophony | ||
125 | writeMerophony(); | ||
126 | |||
118 | // Dumps data to the database | 127 | // Dumps data to the database |
119 | dumpObjects(); | 128 | dumpObjects(); |
120 | 129 | ||
130 | |||
131 | |||
121 | // Populates the antonymy relationship from WordNet | 132 | // Populates the antonymy relationship from WordNet |
122 | readWordNetAntonymy(); | 133 | readWordNetAntonymy(); |
123 | 134 | ||
@@ -577,9 +588,29 @@ namespace verbly { | |||
577 | pronunciation& p = *pronunciationByPhonemes_[phonemes]; | 588 | pronunciation& p = *pronunciationByPhonemes_[phonemes]; |
578 | formByText_.at(canonical)->addPronunciation(p); | 589 | formByText_.at(canonical)->addPronunciation(p); |
579 | } else { | 590 | } else { |
580 | pronunciations_.emplace_back(phonemes); | 591 | std::string stressless; |
592 | for (int i=0; i<phonemes.size(); i++) { | ||
593 | if (!std::isdigit(phonemes[i])) { | ||
594 | stressless.push_back(phonemes[i]); | ||
595 | } | ||
596 | } | ||
597 | auto stresslessList = hatkirby::split<std::vector<std::string>>(stressless, " "); | ||
598 | std::string stresslessPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
599 | std::sort(stresslessList.begin(), stresslessList.end()); | ||
600 | std::string sortedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
601 | |||
602 | int anaphoneSetId; | ||
603 | if (anaphoneSets_.count(sortedPhonemes)) { | ||
604 | anaphoneSetId = anaphoneSets_[sortedPhonemes]; | ||
605 | } else { | ||
606 | anaphoneSetId = anaphoneSets_.size(); | ||
607 | anaphoneSets_[sortedPhonemes] = anaphoneSetId; | ||
608 | } | ||
609 | |||
610 | pronunciations_.emplace_back(phonemes, anaphoneSetId); | ||
581 | pronunciation& p = pronunciations_.back(); | 611 | pronunciation& p = pronunciations_.back(); |
582 | pronunciationByPhonemes_[phonemes] = &p; | 612 | pronunciationByPhonemes_[phonemes] = &p; |
613 | pronunciationByBlankPhonemes_[stresslessPhonemes] = &p; | ||
583 | formByText_.at(canonical)->addPronunciation(p); | 614 | formByText_.at(canonical)->addPronunciation(p); |
584 | } | 615 | } |
585 | } | 616 | } |
@@ -671,6 +702,12 @@ namespace verbly { | |||
671 | 702 | ||
672 | for (form& f : forms_) | 703 | for (form& f : forms_) |
673 | { | 704 | { |
705 | std::string reverseText = f.getText(); | ||
706 | std::reverse(reverseText.begin(), reverseText.end()); | ||
707 | if (formByText_.count(reverseText)) { | ||
708 | f.setReverseId(formByText_[reverseText]->getId()); | ||
709 | } | ||
710 | |||
674 | db_ << f; | 711 | db_ << f; |
675 | 712 | ||
676 | ppgs.update(); | 713 | ppgs.update(); |
@@ -682,6 +719,19 @@ namespace verbly { | |||
682 | 719 | ||
683 | for (pronunciation& p : pronunciations_) | 720 | for (pronunciation& p : pronunciations_) |
684 | { | 721 | { |
722 | std::string stressless; | ||
723 | for (int i=0; i<p.getPhonemes().size(); i++) { | ||
724 | if (!std::isdigit(p.getPhonemes()[i])) { | ||
725 | stressless.push_back(p.getPhonemes()[i]); | ||
726 | } | ||
727 | } | ||
728 | auto stresslessList = hatkirby::split<std::vector<std::string>>(stressless, " "); | ||
729 | std::reverse(stresslessList.begin(), stresslessList.end()); | ||
730 | std::string reversedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
731 | if (pronunciationByBlankPhonemes_.count(reversedPhonemes)) { | ||
732 | p.setReverseId(pronunciationByBlankPhonemes_[reversedPhonemes]->getId()); | ||
733 | } | ||
734 | |||
685 | db_ << p; | 735 | db_ << p; |
686 | 736 | ||
687 | ppgs.update(); | 737 | ppgs.update(); |
@@ -698,6 +748,208 @@ namespace verbly { | |||
698 | ppgs.update(); | 748 | ppgs.update(); |
699 | } | 749 | } |
700 | } | 750 | } |
751 | |||
752 | /*{ | ||
753 | hatkirby::progress ppgs("Writing merography...", formByText_.size()); | ||
754 | |||
755 | for (const auto& [merotext, meroform] : formByText_) | ||
756 | { | ||
757 | for (const auto& [holotext, holoform] : formByText_) | ||
758 | { | ||
759 | if (isMero(merotext, holotext)) | ||
760 | { | ||
761 | db_.insertIntoTable( | ||
762 | "merography", | ||
763 | { | ||
764 | { "merograph_id", meroform->getId() }, | ||
765 | { "holograph_id", holoform->getId() } | ||
766 | }); | ||
767 | } | ||
768 | } | ||
769 | |||
770 | ppgs.update(); | ||
771 | } | ||
772 | } | ||
773 | |||
774 | { | ||
775 | hatkirby::progress ppgs("Writing merophony...", pronunciationByBlankPhonemes_.size()); | ||
776 | |||
777 | for (const auto& [merotext, merop] : pronunciationByBlankPhonemes_) | ||
778 | { | ||
779 | auto merophonemes = hatkirby::split<std::list<std::string>>(merotext, " "); | ||
780 | |||
781 | for (const auto& [holotext, holop] : pronunciationByBlankPhonemes_) | ||
782 | { | ||
783 | auto holophonemes = hatkirby::split<std::list<std::string>>(holotext, " "); | ||
784 | |||
785 | if (isMero(merophonemes, holophonemes)) | ||
786 | { | ||
787 | db_.insertIntoTable( | ||
788 | "merophony", | ||
789 | { | ||
790 | { "merophone_id", merop->getId() }, | ||
791 | { "holophone_id", holop->getId() } | ||
792 | }); | ||
793 | } | ||
794 | } | ||
795 | |||
796 | ppgs.update(); | ||
797 | } | ||
798 | }*/ | ||
799 | } | ||
800 | |||
801 | void generator::writeMerography() | ||
802 | { | ||
803 | hatkirby::progress ppgs("Writing merography...", formByText_.size()); | ||
804 | for (const auto& [text, form] : formByText_) | ||
805 | { | ||
806 | ppgs.update(); | ||
807 | |||
808 | std::unordered_set<std::string> visited; | ||
809 | for (int i=0; i<text.size(); i++) | ||
810 | { | ||
811 | for (int l=3; l<text.size()-i; l++) | ||
812 | { | ||
813 | if (i==0 && l == text.size()) | ||
814 | { | ||
815 | continue; | ||
816 | } | ||
817 | |||
818 | std::string substr = text.substr(i, l); | ||
819 | if (formByText_.count(substr) && !visited.count(substr)) | ||
820 | { | ||
821 | visited.insert(substr); | ||
822 | db_.insertIntoTable( | ||
823 | "merography", | ||
824 | { | ||
825 | { "merograph_id", formByText_[substr]->getId() }, | ||
826 | { "holograph_id", form->getId() } | ||
827 | }); | ||
828 | } | ||
829 | } | ||
830 | } | ||
831 | |||
832 | |||
833 | /* | ||
834 | std::string front = text; | ||
835 | while (front.size() > 2) | ||
836 | { | ||
837 | front.erase(0, 1); | ||
838 | |||
839 | if (formByText_.count(front)) | ||
840 | { | ||
841 | visited.insert(front); | ||
842 | db_.insertIntoTable( | ||
843 | "merography", | ||
844 | { | ||
845 | { "merograph_id", formByText_[front]->getId() }, | ||
846 | { "holograph_id", form->getId() } | ||
847 | }); | ||
848 | } | ||
849 | } | ||
850 | |||
851 | if (text.size() > 2) | ||
852 | { | ||
853 | std::string back = text; | ||
854 | |||
855 | while (back.size() > 2) | ||
856 | { | ||
857 | back.pop_back(); | ||
858 | |||
859 | if (formByText_.count(back) && !visited.count(back)) | ||
860 | { | ||
861 | db_.insertIntoTable( | ||
862 | "merography", | ||
863 | { | ||
864 | { "merograph_id", formByText_[back]->getId() }, | ||
865 | { "holograph_id", form->getId() } | ||
866 | }); | ||
867 | } | ||
868 | } | ||
869 | }*/ | ||
870 | } | ||
871 | } | ||
872 | |||
873 | void generator::writeMerophony() | ||
874 | { | ||
875 | std::map<std::list<std::string>, pronunciation*> tokenized; | ||
876 | for (const auto& [phonemes, pronunciation] : pronunciationByBlankPhonemes_) | ||
877 | { | ||
878 | tokenized[hatkirby::split<std::list<std::string>>(phonemes, " ")] = pronunciation; | ||
879 | } | ||
880 | |||
881 | hatkirby::progress ppgs("Writing merophony...", tokenized.size()); | ||
882 | for (const auto& [phonemes, pronunciation] : tokenized) | ||
883 | { | ||
884 | ppgs.update(); | ||
885 | |||
886 | std::set<std::list<std::string>> visited; | ||
887 | for (int i=0; i<phonemes.size(); i++) | ||
888 | { | ||
889 | for (int l=2; l<phonemes.size()-i; l++) | ||
890 | { | ||
891 | if (i==0 && l == phonemes.size()) | ||
892 | { | ||
893 | continue; | ||
894 | } | ||
895 | |||
896 | std::list<std::string> sublist; | ||
897 | for (auto j=std::next(phonemes.begin(),i); j!=std::next(phonemes.begin(),i+l); j++) | ||
898 | { | ||
899 | sublist.push_back(*j); | ||
900 | } | ||
901 | |||
902 | if (tokenized.count(sublist) && !visited.count(sublist)) | ||
903 | { | ||
904 | visited.insert(sublist); | ||
905 | db_.insertIntoTable( | ||
906 | "merophony", | ||
907 | { | ||
908 | { "merophone_id", tokenized[sublist]->getId() }, | ||
909 | { "holophone_id", pronunciation->getId() } | ||
910 | }); | ||
911 | } | ||
912 | } | ||
913 | } | ||
914 | /*std::list<std::string> front = phonemes; | ||
915 | while (front.size() > 1) | ||
916 | { | ||
917 | front.pop_front(); | ||
918 | |||
919 | if (tokenized.count(front)) | ||
920 | { | ||
921 | visited.insert(front); | ||
922 | db_.insertIntoTable( | ||
923 | "merophony", | ||
924 | { | ||
925 | { "merophone_id", tokenized[front]->getId() }, | ||
926 | { "holophone_id", pronunciation->getId() } | ||
927 | }); | ||
928 | break; | ||
929 | } | ||
930 | } | ||
931 | |||
932 | if (phonemes.size() > 1) | ||
933 | { | ||
934 | std::list<std::string> back = phonemes; | ||
935 | |||
936 | while (back.size() > 1) | ||
937 | { | ||
938 | back.pop_back(); | ||
939 | |||
940 | if (tokenized.count(back) && !visited.count(back)) | ||
941 | { | ||
942 | db_.insertIntoTable( | ||
943 | "merophony", | ||
944 | { | ||
945 | { "merophone_id", tokenized[back]->getId() }, | ||
946 | { "holophone_id", pronunciation->getId() } | ||
947 | }); | ||
948 | break; | ||
949 | } | ||
950 | } | ||
951 | }*/ | ||
952 | } | ||
701 | } | 953 | } |
702 | 954 | ||
703 | void generator::readWordNetAntonymy() | 955 | void generator::readWordNetAntonymy() |
@@ -1316,7 +1568,19 @@ namespace verbly { | |||
1316 | { | 1568 | { |
1317 | if (!formByText_.count(text)) | 1569 | if (!formByText_.count(text)) |
1318 | { | 1570 | { |
1319 | forms_.emplace_back(text); | 1571 | std::string sortedText = text; |
1572 | std::sort(sortedText.begin(), sortedText.end()); | ||
1573 | |||
1574 | int anagramSetId; | ||
1575 | if (anagramSets_.count(sortedText)) | ||
1576 | { | ||
1577 | anagramSetId = anagramSets_[sortedText]; | ||
1578 | } else { | ||
1579 | anagramSetId = anagramSets_.size(); | ||
1580 | anagramSets_[sortedText] = anagramSetId; | ||
1581 | } | ||
1582 | |||
1583 | forms_.emplace_back(text, anagramSetId); | ||
1320 | formByText_[text] = &forms_.back(); | 1584 | formByText_[text] = &forms_.back(); |
1321 | } | 1585 | } |
1322 | 1586 | ||