diff options
author | Star Rauchenberger <fefferburbia@gmail.com> | 2022-11-30 17:58:44 -0500 |
---|---|---|
committer | Star Rauchenberger <fefferburbia@gmail.com> | 2022-11-30 17:58:44 -0500 |
commit | 6816abc1e89fd955524d7c772477d6483d12cbf9 (patch) | |
tree | b8707bdb5e180ae7be9d2ddf0ccfbeb539f36361 | |
parent | 38c17f093615a16a4b4ec6dc2b5d3edb5c1d3895 (diff) | |
download | verbly-6816abc1e89fd955524d7c772477d6483d12cbf9.tar.gz verbly-6816abc1e89fd955524d7c772477d6483d12cbf9.tar.bz2 verbly-6816abc1e89fd955524d7c772477d6483d12cbf9.zip |
De-duped pronunciations in generated database hkutil
Identical pronunciations will now share an ID and be re-used by multiple forms. This has a negligible effect on database size, but it's useful for writing queries looking for words with the exact same pronunciations. This constitutes a minor database update, which we will call d1.2.
-rw-r--r-- | generator/generator.cpp | 12 | ||||
-rw-r--r-- | generator/generator.h | 1 |
2 files changed, 10 insertions, 3 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 0d073be..ad665a2 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -573,9 +573,15 @@ namespace verbly { | |||
573 | } | 573 | } |
574 | 574 | ||
575 | std::string phonemes = phoneme_data[2]; | 575 | std::string phonemes = phoneme_data[2]; |
576 | pronunciations_.emplace_back(phonemes); | 576 | if (pronunciationByPhonemes_.count(phonemes)) { |
577 | pronunciation& p = pronunciations_.back(); | 577 | pronunciation& p = *pronunciationByPhonemes_[phonemes]; |
578 | formByText_.at(canonical)->addPronunciation(p); | 578 | formByText_.at(canonical)->addPronunciation(p); |
579 | } else { | ||
580 | pronunciations_.emplace_back(phonemes); | ||
581 | pronunciation& p = pronunciations_.back(); | ||
582 | pronunciationByPhonemes_[phonemes] = &p; | ||
583 | formByText_.at(canonical)->addPronunciation(p); | ||
584 | } | ||
579 | } | 585 | } |
580 | } | 586 | } |
581 | } | 587 | } |
diff --git a/generator/generator.h b/generator/generator.h index 1547b7c..2cd2ba9 100644 --- a/generator/generator.h +++ b/generator/generator.h | |||
@@ -139,6 +139,7 @@ namespace verbly { | |||
139 | std::map<std::string, std::set<word*>> wordsByBaseForm_; | 139 | std::map<std::string, std::set<word*>> wordsByBaseForm_; |
140 | std::map<std::string, lemma*> lemmaByBaseForm_; | 140 | std::map<std::string, lemma*> lemmaByBaseForm_; |
141 | std::map<std::string, form*> formByText_; | 141 | std::map<std::string, form*> formByText_; |
142 | std::map<std::string, pronunciation*> pronunciationByPhonemes_; | ||
142 | 143 | ||
143 | // Caches | 144 | // Caches |
144 | 145 | ||