diff options
Diffstat (limited to 'generator/generator.cpp')
| -rw-r--r-- | generator/generator.cpp | 270 |
1 files changed, 267 insertions, 3 deletions
| diff --git a/generator/generator.cpp b/generator/generator.cpp index ad665a2..fdea10f 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
| @@ -6,6 +6,9 @@ | |||
| 6 | #include <fstream> | 6 | #include <fstream> |
| 7 | #include <hkutil/string.h> | 7 | #include <hkutil/string.h> |
| 8 | #include <hkutil/progress.h> | 8 | #include <hkutil/progress.h> |
| 9 | #include <array> | ||
| 10 | #include <mutex> | ||
| 11 | #include <thread> | ||
| 9 | #include "role.h" | 12 | #include "role.h" |
| 10 | #include "part.h" | 13 | #include "part.h" |
| 11 | #include "../lib/enums.h" | 14 | #include "../lib/enums.h" |
| @@ -83,7 +86,7 @@ namespace verbly { | |||
| 83 | readAdjectivePositioning(); | 86 | readAdjectivePositioning(); |
| 84 | 87 | ||
| 85 | // Counts the number of URLs ImageNet has per notion | 88 | // Counts the number of URLs ImageNet has per notion |
| 86 | readImageNetUrls(); | 89 | //readImageNetUrls(); |
| 87 | 90 | ||
| 88 | // Creates a word by WordNet sense key lookup table | 91 | // Creates a word by WordNet sense key lookup table |
| 89 | readWordNetSenseKeys(); | 92 | readWordNetSenseKeys(); |
| @@ -115,9 +118,17 @@ namespace verbly { | |||
| 115 | // Writes the database version | 118 | // Writes the database version |
| 116 | writeVersion(); | 119 | writeVersion(); |
| 117 | 120 | ||
| 121 | // Calculates and writes form merography | ||
| 122 | writeMerography(); | ||
| 123 | |||
| 124 | // Calculates and writes pronunciation merophony | ||
| 125 | writeMerophony(); | ||
| 126 | |||
| 118 | // Dumps data to the database | 127 | // Dumps data to the database |
| 119 | dumpObjects(); | 128 | dumpObjects(); |
| 120 | 129 | ||
| 130 | |||
| 131 | |||
| 121 | // Populates the antonymy relationship from WordNet | 132 | // Populates the antonymy relationship from WordNet |
| 122 | readWordNetAntonymy(); | 133 | readWordNetAntonymy(); |
| 123 | 134 | ||
| @@ -577,9 +588,29 @@ namespace verbly { | |||
| 577 | pronunciation& p = *pronunciationByPhonemes_[phonemes]; | 588 | pronunciation& p = *pronunciationByPhonemes_[phonemes]; |
| 578 | formByText_.at(canonical)->addPronunciation(p); | 589 | formByText_.at(canonical)->addPronunciation(p); |
| 579 | } else { | 590 | } else { |
| 580 | pronunciations_.emplace_back(phonemes); | 591 | std::string stressless; |
| 592 | for (int i=0; i<phonemes.size(); i++) { | ||
| 593 | if (!std::isdigit(phonemes[i])) { | ||
| 594 | stressless.push_back(phonemes[i]); | ||
| 595 | } | ||
| 596 | } | ||
| 597 | auto stresslessList = hatkirby::split<std::vector<std::string>>(stressless, " "); | ||
| 598 | std::string stresslessPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
| 599 | std::sort(stresslessList.begin(), stresslessList.end()); | ||
| 600 | std::string sortedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
| 601 | |||
| 602 | int anaphoneSetId; | ||
| 603 | if (anaphoneSets_.count(sortedPhonemes)) { | ||
| 604 | anaphoneSetId = anaphoneSets_[sortedPhonemes]; | ||
| 605 | } else { | ||
| 606 | anaphoneSetId = anaphoneSets_.size(); | ||
| 607 | anaphoneSets_[sortedPhonemes] = anaphoneSetId; | ||
| 608 | } | ||
| 609 | |||
| 610 | pronunciations_.emplace_back(phonemes, anaphoneSetId); | ||
| 581 | pronunciation& p = pronunciations_.back(); | 611 | pronunciation& p = pronunciations_.back(); |
| 582 | pronunciationByPhonemes_[phonemes] = &p; | 612 | pronunciationByPhonemes_[phonemes] = &p; |
| 613 | pronunciationByBlankPhonemes_[stresslessPhonemes] = &p; | ||
| 583 | formByText_.at(canonical)->addPronunciation(p); | 614 | formByText_.at(canonical)->addPronunciation(p); |
| 584 | } | 615 | } |
| 585 | } | 616 | } |
| @@ -671,6 +702,12 @@ namespace verbly { | |||
| 671 | 702 | ||
| 672 | for (form& f : forms_) | 703 | for (form& f : forms_) |
| 673 | { | 704 | { |
| 705 | std::string reverseText = f.getText(); | ||
| 706 | std::reverse(reverseText.begin(), reverseText.end()); | ||
| 707 | if (formByText_.count(reverseText)) { | ||
| 708 | f.setReverseId(formByText_[reverseText]->getId()); | ||
| 709 | } | ||
| 710 | |||
| 674 | db_ << f; | 711 | db_ << f; |
| 675 | 712 | ||
| 676 | ppgs.update(); | 713 | ppgs.update(); |
| @@ -682,6 +719,19 @@ namespace verbly { | |||
| 682 | 719 | ||
| 683 | for (pronunciation& p : pronunciations_) | 720 | for (pronunciation& p : pronunciations_) |
| 684 | { | 721 | { |
| 722 | std::string stressless; | ||
| 723 | for (int i=0; i<p.getPhonemes().size(); i++) { | ||
| 724 | if (!std::isdigit(p.getPhonemes()[i])) { | ||
| 725 | stressless.push_back(p.getPhonemes()[i]); | ||
| 726 | } | ||
| 727 | } | ||
| 728 | auto stresslessList = hatkirby::split<std::vector<std::string>>(stressless, " "); | ||
| 729 | std::reverse(stresslessList.begin(), stresslessList.end()); | ||
| 730 | std::string reversedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
| 731 | if (pronunciationByBlankPhonemes_.count(reversedPhonemes)) { | ||
| 732 | p.setReverseId(pronunciationByBlankPhonemes_[reversedPhonemes]->getId()); | ||
| 733 | } | ||
| 734 | |||
| 685 | db_ << p; | 735 | db_ << p; |
| 686 | 736 | ||
| 687 | ppgs.update(); | 737 | ppgs.update(); |
| @@ -698,6 +748,208 @@ namespace verbly { | |||
| 698 | ppgs.update(); | 748 | ppgs.update(); |
| 699 | } | 749 | } |
| 700 | } | 750 | } |
| 751 | |||
| 752 | /*{ | ||
| 753 | hatkirby::progress ppgs("Writing merography...", formByText_.size()); | ||
| 754 | |||
| 755 | for (const auto& [merotext, meroform] : formByText_) | ||
| 756 | { | ||
| 757 | for (const auto& [holotext, holoform] : formByText_) | ||
| 758 | { | ||
| 759 | if (isMero(merotext, holotext)) | ||
| 760 | { | ||
| 761 | db_.insertIntoTable( | ||
| 762 | "merography", | ||
| 763 | { | ||
| 764 | { "merograph_id", meroform->getId() }, | ||
| 765 | { "holograph_id", holoform->getId() } | ||
| 766 | }); | ||
| 767 | } | ||
| 768 | } | ||
| 769 | |||
| 770 | ppgs.update(); | ||
| 771 | } | ||
| 772 | } | ||
| 773 | |||
| 774 | { | ||
| 775 | hatkirby::progress ppgs("Writing merophony...", pronunciationByBlankPhonemes_.size()); | ||
| 776 | |||
| 777 | for (const auto& [merotext, merop] : pronunciationByBlankPhonemes_) | ||
| 778 | { | ||
| 779 | auto merophonemes = hatkirby::split<std::list<std::string>>(merotext, " "); | ||
| 780 | |||
| 781 | for (const auto& [holotext, holop] : pronunciationByBlankPhonemes_) | ||
| 782 | { | ||
| 783 | auto holophonemes = hatkirby::split<std::list<std::string>>(holotext, " "); | ||
| 784 | |||
| 785 | if (isMero(merophonemes, holophonemes)) | ||
| 786 | { | ||
| 787 | db_.insertIntoTable( | ||
| 788 | "merophony", | ||
| 789 | { | ||
| 790 | { "merophone_id", merop->getId() }, | ||
| 791 | { "holophone_id", holop->getId() } | ||
| 792 | }); | ||
| 793 | } | ||
| 794 | } | ||
| 795 | |||
| 796 | ppgs.update(); | ||
| 797 | } | ||
| 798 | }*/ | ||
| 799 | } | ||
| 800 | |||
| 801 | void generator::writeMerography() | ||
| 802 | { | ||
| 803 | hatkirby::progress ppgs("Writing merography...", formByText_.size()); | ||
| 804 | for (const auto& [text, form] : formByText_) | ||
| 805 | { | ||
| 806 | ppgs.update(); | ||
| 807 | |||
| 808 | std::unordered_set<std::string> visited; | ||
| 809 | for (int i=0; i<text.size(); i++) | ||
| 810 | { | ||
| 811 | for (int l=3; l<text.size()-i; l++) | ||
| 812 | { | ||
| 813 | if (i==0 && l == text.size()) | ||
| 814 | { | ||
| 815 | continue; | ||
| 816 | } | ||
| 817 | |||
| 818 | std::string substr = text.substr(i, l); | ||
| 819 | if (formByText_.count(substr) && !visited.count(substr)) | ||
| 820 | { | ||
| 821 | visited.insert(substr); | ||
| 822 | db_.insertIntoTable( | ||
| 823 | "merography", | ||
| 824 | { | ||
| 825 | { "merograph_id", formByText_[substr]->getId() }, | ||
| 826 | { "holograph_id", form->getId() } | ||
| 827 | }); | ||
| 828 | } | ||
| 829 | } | ||
| 830 | } | ||
| 831 | |||
| 832 | |||
| 833 | /* | ||
| 834 | std::string front = text; | ||
| 835 | while (front.size() > 2) | ||
| 836 | { | ||
| 837 | front.erase(0, 1); | ||
| 838 | |||
| 839 | if (formByText_.count(front)) | ||
| 840 | { | ||
| 841 | visited.insert(front); | ||
| 842 | db_.insertIntoTable( | ||
| 843 | "merography", | ||
| 844 | { | ||
| 845 | { "merograph_id", formByText_[front]->getId() }, | ||
| 846 | { "holograph_id", form->getId() } | ||
| 847 | }); | ||
| 848 | } | ||
| 849 | } | ||
| 850 | |||
| 851 | if (text.size() > 2) | ||
| 852 | { | ||
| 853 | std::string back = text; | ||
| 854 | |||
| 855 | while (back.size() > 2) | ||
| 856 | { | ||
| 857 | back.pop_back(); | ||
| 858 | |||
| 859 | if (formByText_.count(back) && !visited.count(back)) | ||
| 860 | { | ||
| 861 | db_.insertIntoTable( | ||
| 862 | "merography", | ||
| 863 | { | ||
| 864 | { "merograph_id", formByText_[back]->getId() }, | ||
| 865 | { "holograph_id", form->getId() } | ||
| 866 | }); | ||
| 867 | } | ||
| 868 | } | ||
| 869 | }*/ | ||
| 870 | } | ||
| 871 | } | ||
| 872 | |||
| 873 | void generator::writeMerophony() | ||
| 874 | { | ||
| 875 | std::map<std::list<std::string>, pronunciation*> tokenized; | ||
| 876 | for (const auto& [phonemes, pronunciation] : pronunciationByBlankPhonemes_) | ||
| 877 | { | ||
| 878 | tokenized[hatkirby::split<std::list<std::string>>(phonemes, " ")] = pronunciation; | ||
| 879 | } | ||
| 880 | |||
| 881 | hatkirby::progress ppgs("Writing merophony...", tokenized.size()); | ||
| 882 | for (const auto& [phonemes, pronunciation] : tokenized) | ||
| 883 | { | ||
| 884 | ppgs.update(); | ||
| 885 | |||
| 886 | std::set<std::list<std::string>> visited; | ||
| 887 | for (int i=0; i<phonemes.size(); i++) | ||
| 888 | { | ||
| 889 | for (int l=2; l<phonemes.size()-i; l++) | ||
| 890 | { | ||
| 891 | if (i==0 && l == phonemes.size()) | ||
| 892 | { | ||
| 893 | continue; | ||
| 894 | } | ||
| 895 | |||
| 896 | std::list<std::string> sublist; | ||
| 897 | for (auto j=std::next(phonemes.begin(),i); j!=std::next(phonemes.begin(),i+l); j++) | ||
| 898 | { | ||
| 899 | sublist.push_back(*j); | ||
| 900 | } | ||
| 901 | |||
| 902 | if (tokenized.count(sublist) && !visited.count(sublist)) | ||
| 903 | { | ||
| 904 | visited.insert(sublist); | ||
| 905 | db_.insertIntoTable( | ||
| 906 | "merophony", | ||
| 907 | { | ||
| 908 | { "merophone_id", tokenized[sublist]->getId() }, | ||
| 909 | { "holophone_id", pronunciation->getId() } | ||
| 910 | }); | ||
| 911 | } | ||
| 912 | } | ||
| 913 | } | ||
| 914 | /*std::list<std::string> front = phonemes; | ||
| 915 | while (front.size() > 1) | ||
| 916 | { | ||
| 917 | front.pop_front(); | ||
| 918 | |||
| 919 | if (tokenized.count(front)) | ||
| 920 | { | ||
| 921 | visited.insert(front); | ||
| 922 | db_.insertIntoTable( | ||
| 923 | "merophony", | ||
| 924 | { | ||
| 925 | { "merophone_id", tokenized[front]->getId() }, | ||
| 926 | { "holophone_id", pronunciation->getId() } | ||
| 927 | }); | ||
| 928 | break; | ||
| 929 | } | ||
| 930 | } | ||
| 931 | |||
| 932 | if (phonemes.size() > 1) | ||
| 933 | { | ||
| 934 | std::list<std::string> back = phonemes; | ||
| 935 | |||
| 936 | while (back.size() > 1) | ||
| 937 | { | ||
| 938 | back.pop_back(); | ||
| 939 | |||
| 940 | if (tokenized.count(back) && !visited.count(back)) | ||
| 941 | { | ||
| 942 | db_.insertIntoTable( | ||
| 943 | "merophony", | ||
| 944 | { | ||
| 945 | { "merophone_id", tokenized[back]->getId() }, | ||
| 946 | { "holophone_id", pronunciation->getId() } | ||
| 947 | }); | ||
| 948 | break; | ||
| 949 | } | ||
| 950 | } | ||
| 951 | }*/ | ||
| 952 | } | ||
| 701 | } | 953 | } |
| 702 | 954 | ||
| 703 | void generator::readWordNetAntonymy() | 955 | void generator::readWordNetAntonymy() |
| @@ -1316,7 +1568,19 @@ namespace verbly { | |||
| 1316 | { | 1568 | { |
| 1317 | if (!formByText_.count(text)) | 1569 | if (!formByText_.count(text)) |
| 1318 | { | 1570 | { |
| 1319 | forms_.emplace_back(text); | 1571 | std::string sortedText = text; |
| 1572 | std::sort(sortedText.begin(), sortedText.end()); | ||
| 1573 | |||
| 1574 | int anagramSetId; | ||
| 1575 | if (anagramSets_.count(sortedText)) | ||
| 1576 | { | ||
| 1577 | anagramSetId = anagramSets_[sortedText]; | ||
| 1578 | } else { | ||
| 1579 | anagramSetId = anagramSets_.size(); | ||
| 1580 | anagramSets_[sortedText] = anagramSetId; | ||
| 1581 | } | ||
| 1582 | |||
| 1583 | forms_.emplace_back(text, anagramSetId); | ||
| 1320 | formByText_[text] = &forms_.back(); | 1584 | formByText_[text] = &forms_.back(); |
| 1321 | } | 1585 | } |
| 1322 | 1586 | ||
