diff options
| author | Star Rauchenberger <fefferburbia@gmail.com> | 2022-11-30 17:58:44 -0500 |
|---|---|---|
| committer | Star Rauchenberger <fefferburbia@gmail.com> | 2022-11-30 17:58:44 -0500 |
| commit | 6816abc1e89fd955524d7c772477d6483d12cbf9 (patch) | |
| tree | b8707bdb5e180ae7be9d2ddf0ccfbeb539f36361 /generator | |
| parent | 38c17f093615a16a4b4ec6dc2b5d3edb5c1d3895 (diff) | |
| download | verbly-hkutil.tar.gz verbly-hkutil.tar.bz2 verbly-hkutil.zip | |
De-duped pronunciations in generated database (ref: hkutil)
Identical pronunciations will now share an ID and be reused by multiple forms. This has a negligible effect on database size, but it is useful for writing queries that look for words with exactly the same pronunciation. This constitutes a minor database update, which we will call d1.2.
Diffstat (limited to 'generator')
| -rw-r--r-- | generator/generator.cpp | 12 | ||||
| -rw-r--r-- | generator/generator.h | 1 |
2 files changed, 10 insertions, 3 deletions
| diff --git a/generator/generator.cpp b/generator/generator.cpp index 0d073be..ad665a2 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
| @@ -573,9 +573,15 @@ namespace verbly { | |||
| 573 | } | 573 | } |
| 574 | 574 | ||
| 575 | std::string phonemes = phoneme_data[2]; | 575 | std::string phonemes = phoneme_data[2]; |
| 576 | pronunciations_.emplace_back(phonemes); | 576 | if (pronunciationByPhonemes_.count(phonemes)) { |
| 577 | pronunciation& p = pronunciations_.back(); | 577 | pronunciation& p = *pronunciationByPhonemes_[phonemes]; |
| 578 | formByText_.at(canonical)->addPronunciation(p); | 578 | formByText_.at(canonical)->addPronunciation(p); |
| 579 | } else { | ||
| 580 | pronunciations_.emplace_back(phonemes); | ||
| 581 | pronunciation& p = pronunciations_.back(); | ||
| 582 | pronunciationByPhonemes_[phonemes] = &p; | ||
| 583 | formByText_.at(canonical)->addPronunciation(p); | ||
| 584 | } | ||
| 579 | } | 585 | } |
| 580 | } | 586 | } |
| 581 | } | 587 | } |
| diff --git a/generator/generator.h b/generator/generator.h index 1547b7c..2cd2ba9 100644 --- a/generator/generator.h +++ b/generator/generator.h | |||
| @@ -139,6 +139,7 @@ namespace verbly { | |||
| 139 | std::map<std::string, std::set<word*>> wordsByBaseForm_; | 139 | std::map<std::string, std::set<word*>> wordsByBaseForm_; |
| 140 | std::map<std::string, lemma*> lemmaByBaseForm_; | 140 | std::map<std::string, lemma*> lemmaByBaseForm_; |
| 141 | std::map<std::string, form*> formByText_; | 141 | std::map<std::string, form*> formByText_; |
| 142 | std::map<std::string, pronunciation*> pronunciationByPhonemes_; | ||
| 142 | 143 | ||
| 143 | // Caches | 144 | // Caches |
| 144 | 145 | ||
