summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Star Rauchenberger <fefferburbia@gmail.com> 2022-11-30 17:58:44 -0500
committer: Star Rauchenberger <fefferburbia@gmail.com> 2022-11-30 17:58:44 -0500
commit: 6816abc1e89fd955524d7c772477d6483d12cbf9 (patch)
tree: b8707bdb5e180ae7be9d2ddf0ccfbeb539f36361
parent: 38c17f093615a16a4b4ec6dc2b5d3edb5c1d3895 (diff)
download: verbly-6816abc1e89fd955524d7c772477d6483d12cbf9.tar.gz
verbly-6816abc1e89fd955524d7c772477d6483d12cbf9.tar.bz2
verbly-6816abc1e89fd955524d7c772477d6483d12cbf9.zip
De-duped pronunciations in generated database hkutil
Identical pronunciations will now share an ID and be re-used by multiple forms. This has a negligible effect on database size, but it's useful for writing queries looking for words with the exact same pronunciations.

This constitutes a minor database update, which we will call d1.2.
-rw-r--r-- generator/generator.cpp 12
-rw-r--r-- generator/generator.h 1
2 files changed, 10 insertions, 3 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 0d073be..ad665a2 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -573,9 +573,15 @@ namespace verbly {
573 } 573 }
574 574
575 std::string phonemes = phoneme_data[2]; 575 std::string phonemes = phoneme_data[2];
576 pronunciations_.emplace_back(phonemes); 576 if (pronunciationByPhonemes_.count(phonemes)) {
577 pronunciation& p = pronunciations_.back(); 577 pronunciation& p = *pronunciationByPhonemes_[phonemes];
578 formByText_.at(canonical)->addPronunciation(p); 578 formByText_.at(canonical)->addPronunciation(p);
579 } else {
580 pronunciations_.emplace_back(phonemes);
581 pronunciation& p = pronunciations_.back();
582 pronunciationByPhonemes_[phonemes] = &p;
583 formByText_.at(canonical)->addPronunciation(p);
584 }
579 } 585 }
580 } 586 }
581 } 587 }
diff --git a/generator/generator.h b/generator/generator.h
index 1547b7c..2cd2ba9 100644
--- a/generator/generator.h
+++ b/generator/generator.h
@@ -139,6 +139,7 @@ namespace verbly {
139 std::map<std::string, std::set<word*>> wordsByBaseForm_; 139 std::map<std::string, std::set<word*>> wordsByBaseForm_;
140 std::map<std::string, lemma*> lemmaByBaseForm_; 140 std::map<std::string, lemma*> lemmaByBaseForm_;
141 std::map<std::string, form*> formByText_; 141 std::map<std::string, form*> formByText_;
142 std::map<std::string, pronunciation*> pronunciationByPhonemes_;
142 143
143 // Caches 144 // Caches
144 145