summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r--generator/generator.cpp270
1 files changed, 267 insertions, 3 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index ad665a2..fdea10f 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -6,6 +6,9 @@
6#include <fstream> 6#include <fstream>
7#include <hkutil/string.h> 7#include <hkutil/string.h>
8#include <hkutil/progress.h> 8#include <hkutil/progress.h>
9#include <array>
10#include <mutex>
11#include <thread>
9#include "role.h" 12#include "role.h"
10#include "part.h" 13#include "part.h"
11#include "../lib/enums.h" 14#include "../lib/enums.h"
@@ -83,7 +86,7 @@ namespace verbly {
83 readAdjectivePositioning(); 86 readAdjectivePositioning();
84 87
85 // Counts the number of URLs ImageNet has per notion 88 // Counts the number of URLs ImageNet has per notion
86 readImageNetUrls(); 89 //readImageNetUrls();
87 90
88 // Creates a word by WordNet sense key lookup table 91 // Creates a word by WordNet sense key lookup table
89 readWordNetSenseKeys(); 92 readWordNetSenseKeys();
@@ -115,9 +118,17 @@ namespace verbly {
115 // Writes the database version 118 // Writes the database version
116 writeVersion(); 119 writeVersion();
117 120
121 // Calculates and writes form merography
122 writeMerography();
123
124 // Calculates and writes pronunciation merophony
125 writeMerophony();
126
118 // Dumps data to the database 127 // Dumps data to the database
119 dumpObjects(); 128 dumpObjects();
120 129
130
131
121 // Populates the antonymy relationship from WordNet 132 // Populates the antonymy relationship from WordNet
122 readWordNetAntonymy(); 133 readWordNetAntonymy();
123 134
@@ -577,9 +588,29 @@ namespace verbly {
577 pronunciation& p = *pronunciationByPhonemes_[phonemes]; 588 pronunciation& p = *pronunciationByPhonemes_[phonemes];
578 formByText_.at(canonical)->addPronunciation(p); 589 formByText_.at(canonical)->addPronunciation(p);
579 } else { 590 } else {
580 pronunciations_.emplace_back(phonemes); 591 std::string stressless;
592 for (int i=0; i<phonemes.size(); i++) {
593 if (!std::isdigit(phonemes[i])) {
594 stressless.push_back(phonemes[i]);
595 }
596 }
597 auto stresslessList = hatkirby::split<std::vector<std::string>>(stressless, " ");
598 std::string stresslessPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
599 std::sort(stresslessList.begin(), stresslessList.end());
600 std::string sortedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
601
602 int anaphoneSetId;
603 if (anaphoneSets_.count(sortedPhonemes)) {
604 anaphoneSetId = anaphoneSets_[sortedPhonemes];
605 } else {
606 anaphoneSetId = anaphoneSets_.size();
607 anaphoneSets_[sortedPhonemes] = anaphoneSetId;
608 }
609
610 pronunciations_.emplace_back(phonemes, anaphoneSetId);
581 pronunciation& p = pronunciations_.back(); 611 pronunciation& p = pronunciations_.back();
582 pronunciationByPhonemes_[phonemes] = &p; 612 pronunciationByPhonemes_[phonemes] = &p;
613 pronunciationByBlankPhonemes_[stresslessPhonemes] = &p;
583 formByText_.at(canonical)->addPronunciation(p); 614 formByText_.at(canonical)->addPronunciation(p);
584 } 615 }
585 } 616 }
@@ -671,6 +702,12 @@ namespace verbly {
671 702
672 for (form& f : forms_) 703 for (form& f : forms_)
673 { 704 {
705 std::string reverseText = f.getText();
706 std::reverse(reverseText.begin(), reverseText.end());
707 if (formByText_.count(reverseText)) {
708 f.setReverseId(formByText_[reverseText]->getId());
709 }
710
674 db_ << f; 711 db_ << f;
675 712
676 ppgs.update(); 713 ppgs.update();
@@ -682,6 +719,19 @@ namespace verbly {
682 719
683 for (pronunciation& p : pronunciations_) 720 for (pronunciation& p : pronunciations_)
684 { 721 {
722 std::string stressless;
723 for (int i=0; i<p.getPhonemes().size(); i++) {
724 if (!std::isdigit(p.getPhonemes()[i])) {
725 stressless.push_back(p.getPhonemes()[i]);
726 }
727 }
728 auto stresslessList = hatkirby::split<std::vector<std::string>>(stressless, " ");
729 std::reverse(stresslessList.begin(), stresslessList.end());
730 std::string reversedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
731 if (pronunciationByBlankPhonemes_.count(reversedPhonemes)) {
732 p.setReverseId(pronunciationByBlankPhonemes_[reversedPhonemes]->getId());
733 }
734
685 db_ << p; 735 db_ << p;
686 736
687 ppgs.update(); 737 ppgs.update();
@@ -698,6 +748,208 @@ namespace verbly {
698 ppgs.update(); 748 ppgs.update();
699 } 749 }
700 } 750 }
751
752 /*{
753 hatkirby::progress ppgs("Writing merography...", formByText_.size());
754
755 for (const auto& [merotext, meroform] : formByText_)
756 {
757 for (const auto& [holotext, holoform] : formByText_)
758 {
759 if (isMero(merotext, holotext))
760 {
761 db_.insertIntoTable(
762 "merography",
763 {
764 { "merograph_id", meroform->getId() },
765 { "holograph_id", holoform->getId() }
766 });
767 }
768 }
769
770 ppgs.update();
771 }
772 }
773
774 {
775 hatkirby::progress ppgs("Writing merophony...", pronunciationByBlankPhonemes_.size());
776
777 for (const auto& [merotext, merop] : pronunciationByBlankPhonemes_)
778 {
779 auto merophonemes = hatkirby::split<std::list<std::string>>(merotext, " ");
780
781 for (const auto& [holotext, holop] : pronunciationByBlankPhonemes_)
782 {
783 auto holophonemes = hatkirby::split<std::list<std::string>>(holotext, " ");
784
785 if (isMero(merophonemes, holophonemes))
786 {
787 db_.insertIntoTable(
788 "merophony",
789 {
790 { "merophone_id", merop->getId() },
791 { "holophone_id", holop->getId() }
792 });
793 }
794 }
795
796 ppgs.update();
797 }
798 }*/
799 }
800
801 void generator::writeMerography()
802 {
803 hatkirby::progress ppgs("Writing merography...", formByText_.size());
804 for (const auto& [text, form] : formByText_)
805 {
806 ppgs.update();
807
808 std::unordered_set<std::string> visited;
809 for (int i=0; i<text.size(); i++)
810 {
811 for (int l=3; l<text.size()-i; l++)
812 {
813 if (i==0 && l == text.size())
814 {
815 continue;
816 }
817
818 std::string substr = text.substr(i, l);
819 if (formByText_.count(substr) && !visited.count(substr))
820 {
821 visited.insert(substr);
822 db_.insertIntoTable(
823 "merography",
824 {
825 { "merograph_id", formByText_[substr]->getId() },
826 { "holograph_id", form->getId() }
827 });
828 }
829 }
830 }
831
832
833 /*
834 std::string front = text;
835 while (front.size() > 2)
836 {
837 front.erase(0, 1);
838
839 if (formByText_.count(front))
840 {
841 visited.insert(front);
842 db_.insertIntoTable(
843 "merography",
844 {
845 { "merograph_id", formByText_[front]->getId() },
846 { "holograph_id", form->getId() }
847 });
848 }
849 }
850
851 if (text.size() > 2)
852 {
853 std::string back = text;
854
855 while (back.size() > 2)
856 {
857 back.pop_back();
858
859 if (formByText_.count(back) && !visited.count(back))
860 {
861 db_.insertIntoTable(
862 "merography",
863 {
864 { "merograph_id", formByText_[back]->getId() },
865 { "holograph_id", form->getId() }
866 });
867 }
868 }
869 }*/
870 }
871 }
872
873 void generator::writeMerophony()
874 {
875 std::map<std::list<std::string>, pronunciation*> tokenized;
876 for (const auto& [phonemes, pronunciation] : pronunciationByBlankPhonemes_)
877 {
878 tokenized[hatkirby::split<std::list<std::string>>(phonemes, " ")] = pronunciation;
879 }
880
881 hatkirby::progress ppgs("Writing merophony...", tokenized.size());
882 for (const auto& [phonemes, pronunciation] : tokenized)
883 {
884 ppgs.update();
885
886 std::set<std::list<std::string>> visited;
887 for (int i=0; i<phonemes.size(); i++)
888 {
889 for (int l=2; l<phonemes.size()-i; l++)
890 {
891 if (i==0 && l == phonemes.size())
892 {
893 continue;
894 }
895
896 std::list<std::string> sublist;
897 for (auto j=std::next(phonemes.begin(),i); j!=std::next(phonemes.begin(),i+l); j++)
898 {
899 sublist.push_back(*j);
900 }
901
902 if (tokenized.count(sublist) && !visited.count(sublist))
903 {
904 visited.insert(sublist);
905 db_.insertIntoTable(
906 "merophony",
907 {
908 { "merophone_id", tokenized[sublist]->getId() },
909 { "holophone_id", pronunciation->getId() }
910 });
911 }
912 }
913 }
914 /*std::list<std::string> front = phonemes;
915 while (front.size() > 1)
916 {
917 front.pop_front();
918
919 if (tokenized.count(front))
920 {
921 visited.insert(front);
922 db_.insertIntoTable(
923 "merophony",
924 {
925 { "merophone_id", tokenized[front]->getId() },
926 { "holophone_id", pronunciation->getId() }
927 });
928 break;
929 }
930 }
931
932 if (phonemes.size() > 1)
933 {
934 std::list<std::string> back = phonemes;
935
936 while (back.size() > 1)
937 {
938 back.pop_back();
939
940 if (tokenized.count(back) && !visited.count(back))
941 {
942 db_.insertIntoTable(
943 "merophony",
944 {
945 { "merophone_id", tokenized[back]->getId() },
946 { "holophone_id", pronunciation->getId() }
947 });
948 break;
949 }
950 }
951 }*/
952 }
701 } 953 }
702 954
703 void generator::readWordNetAntonymy() 955 void generator::readWordNetAntonymy()
@@ -1316,7 +1568,19 @@ namespace verbly {
1316 { 1568 {
1317 if (!formByText_.count(text)) 1569 if (!formByText_.count(text))
1318 { 1570 {
1319 forms_.emplace_back(text); 1571 std::string sortedText = text;
1572 std::sort(sortedText.begin(), sortedText.end());
1573
1574 int anagramSetId;
1575 if (anagramSets_.count(sortedText))
1576 {
1577 anagramSetId = anagramSets_[sortedText];
1578 } else {
1579 anagramSetId = anagramSets_.size();
1580 anagramSets_[sortedText] = anagramSetId;
1581 }
1582
1583 forms_.emplace_back(text, anagramSetId);
1320 formByText_[text] = &forms_.back(); 1584 formByText_[text] = &forms_.back();
1321 } 1585 }
1322 1586