From 6816abc1e89fd955524d7c772477d6483d12cbf9 Mon Sep 17 00:00:00 2001 From: Star Rauchenberger Date: Wed, 30 Nov 2022 17:58:44 -0500 Subject: De-duped pronunciations in generated database Identical pronunciations will now share an ID and be re-used by multiple forms. This has a negligible effect on database size, but it's useful for writing queries looking for words with the exact same pronunciations. This constitutes a minor database update, which we will call d1.2. --- generator/generator.cpp | 12 +++++++++--- generator/generator.h | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/generator/generator.cpp b/generator/generator.cpp index 0d073be..ad665a2 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -573,9 +573,15 @@ namespace verbly { } std::string phonemes = phoneme_data[2]; - pronunciations_.emplace_back(phonemes); - pronunciation& p = pronunciations_.back(); - formByText_.at(canonical)->addPronunciation(p); + if (pronunciationByPhonemes_.count(phonemes)) { + pronunciation& p = *pronunciationByPhonemes_[phonemes]; + formByText_.at(canonical)->addPronunciation(p); + } else { + pronunciations_.emplace_back(phonemes); + pronunciation& p = pronunciations_.back(); + pronunciationByPhonemes_[phonemes] = &p; + formByText_.at(canonical)->addPronunciation(p); + } } } } diff --git a/generator/generator.h b/generator/generator.h index 1547b7c..2cd2ba9 100644 --- a/generator/generator.h +++ b/generator/generator.h @@ -139,6 +139,7 @@ namespace verbly { std::map> wordsByBaseForm_; std::map lemmaByBaseForm_; std::map<std::string, form*> formByText_; + std::map<std::string, pronunciation*> pronunciationByPhonemes_; // Caches -- cgit 1.4.1