#include "generator.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include constexpr int MIN_FREQUENCY = 2000000; namespace { std::list readFile(std::string path, bool uniq = false) { std::ifstream file(path); if (!file) { throw std::invalid_argument("Could not find file " + path); } std::list lines; std::string line; while (std::getline(file, line)) { if (line.back() == '\r') { line.pop_back(); } lines.push_back(line); } if (uniq) { std::vector uniq(std::begin(lines), std::end(lines)); lines.clear(); std::sort(std::begin(uniq), std::end(uniq)); std::unique_copy(std::begin(uniq), std::end(uniq), std::back_inserter(lines)); } return lines; } } // namespace generator::generator(std::string agidPath, std::string wordNetPath, std::string cmudictPath, std::string wordfreqPath, std::string datadirPath, std::string outputPath) : agidPath_(agidPath), wordNetPath_(wordNetPath), cmudictPath_(cmudictPath), wordfreqPath_(wordfreqPath), datadirPath_(datadirPath), outputPath_(outputPath) { // Ensure AGID infl.txt exists if (!std::ifstream(agidPath_)) { throw std::invalid_argument("AGID infl.txt file not found"); } // Add directory separator to WordNet path if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) { wordNetPath_ += '/'; } // Ensure WordNet tables exist for (std::string table : {"s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax"}) { if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) { throw std::invalid_argument("WordNet " + table + " table not found"); } } // Ensure CMUDICT file exists if (!std::ifstream(cmudictPath_)) { throw std::invalid_argument("CMUDICT file not found"); } } void generator::run() { std::unordered_map word_frequencies; { std::list lines(readFile(wordfreqPath_)); hatkirby::progress ppgs("Reading word frequencies...", lines.size()); for (std::string line : lines) { ppgs.update(); std::regex freqline("([a-z]+),([0-9]+)"); std::smatch freqline_data; if (std::regex_search(line, freqline_data, freqline)) { std::string text = freqline_data[1]; std::string freqnumstr = freqline_data[2]; long long freqnumnum = std::atoll(freqnumstr.c_str()); word_frequencies[text] = freqnumnum > std::numeric_limits::max() ? std::numeric_limits::max() : freqnumnum; } } } std::unordered_set profane; { std::list lines(readFile(datadirPath_ / "profane.txt")); for (const std::string& line : lines) { profane.insert(line); } } { std::list lines(readFile(wordNetPath_ + "wn_s.pl")); hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); for (std::string line : lines) { ppgs.update(); std::regex relation( "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) { continue; } int synset_id = std::stoi(relation_data[1]); int wnum = std::stoi(relation_data[2]); std::string text = relation_data[3]; int tag_count = std::stoi(relation_data[4]); size_t word_it; while ((word_it = text.find("''")) != std::string::npos) { text.erase(word_it, 1); } // The word must be common enough. if (word_frequencies[text] < MIN_FREQUENCY) { continue; } // We are looking for single words. if (std::count(std::begin(text), std::end(text), ' ') > 0) { continue; } // This should filter our proper nouns. if (std::any_of(std::begin(text), std::end(text), ::isupper)) { continue; } // Ignore any profane words. if (profane.count(text)) { continue; } // The WordNet data does contain duplicates, so we need to check that we // haven't already created this word. std::pair lookup(synset_id, wnum); if (word_by_wnid_and_wnum_.count(lookup)) { continue; } size_t word_id = LookupOrCreateWord(text); word_by_wnid_and_wnum_[lookup] = word_id; AddWordToSynset(word_id, synset_id); } } { std::list lines(readFile(agidPath_)); hatkirby::progress ppgs("Reading inflections from AGID...", lines.size()); for (std::string line : lines) { ppgs.update(); int divider = line.find_first_of(" "); std::string infinitive = line.substr(0, divider); line = line.substr(divider + 1); char type = line[0]; if (line[1] == '?') { line.erase(0, 4); } else { line.erase(0, 3); } if (!words_by_base_.count(infinitive)) { continue; } auto inflWordList = hatkirby::split>(line, " | "); std::vector> agidForms; for (std::string inflForms : inflWordList) { auto inflFormList = hatkirby::split>(std::move(inflForms), ", "); std::list forms; for (std::string inflForm : inflFormList) { int sympos = inflForm.find_first_of("~> inflections; switch (type) { case 'V': { if (agidForms.size() == 4) { inflections.push_back(agidForms[0]); inflections.push_back(agidForms[1]); inflections.push_back(agidForms[2]); inflections.push_back(agidForms[3]); } else if (agidForms.size() == 3) { inflections.push_back(agidForms[0]); inflections.push_back(agidForms[1]); inflections.push_back(agidForms[2]); } else if (agidForms.size() == 8) { // As of AGID 2014.08.11, this is only "to be" inflections.push_back(agidForms[0]); inflections.push_back(agidForms[2]); inflections.push_back(agidForms[3]); inflections.push_back(agidForms[4]); } else { // Words that don't fit the cases above as of AGID 2014.08.11: // - may and shall do not conjugate the way we want them to // - methinks only has a past tense and is an outlier // - wit has five forms, and is archaic/obscure enough that we can // ignore it for now std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; } break; } case 'A': { if (agidForms.size() == 2) { inflections.push_back(agidForms[0]); inflections.push_back(agidForms[1]); } else { // As of AGID 2014.08.11, this is only "only", which has only the // form "onliest" std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl; } break; } case 'N': { if (agidForms.size() == 1) { inflections.push_back(agidForms[0]); } else { // As of AGID 2014.08.11, this is non-existent. std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl; } break; } } // Compile the forms we have mapped. for (size_t word_id : words_by_base_.at(infinitive)) { for (const std::list& infl_list : inflections) { for (const std::string& infl : infl_list) { if (!profane.count(infl)) { size_t form_id = LookupOrCreateForm(infl); AddFormToWord(form_id, word_id); } } } } } } word_frequencies.clear(); // Not needed anymore. { std::list lines(readFile(cmudictPath_)); hatkirby::progress ppgs("Reading pronunciations from CMUDICT...", lines.size()); for (std::string line : lines) { ppgs.update(); std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); std::smatch phoneme_data; if (std::regex_search(line, phoneme_data, phoneme)) { std::string canonical = hatkirby::lowercase(phoneme_data[1]); if (!form_by_text_.count(canonical)) { continue; } std::string phonemes = phoneme_data[2]; size_t pronunciation_id = LookupOrCreatePronunciation(phonemes); AddPronunciationToForm(pronunciation_id, form_by_text_[canonical]); } } } std::cout << "Words: " << words_.size() << std::endl; std::cout << "Forms: " << forms_.size() << std::endl; std::cout << "Pronunciations: " << pronunciations_.size() << std::endl; // White Top { hatkirby::progress ppgs("Generating white top puzzles...", forms_.size()); for (Form& form : forms_) { ppgs.update(); for (size_t p_id : form.pronunciation_ids) { const Pronunciation& pronunciation = pronunciations_.at(p_id); for (size_t other_form_id : pronunciation.form_ids) { if (other_form_id != form.id) { form.puzzles[kWhiteTop].insert(other_form_id); } } } } } // White Bottom { hatkirby::progress ppgs("Generating white bottom puzzles...", words_.size()); for (const Word& word : words_) { ppgs.update(); Form& form = forms_.at(word.base_form_id); for (size_t synset_id : word.synsets) { for (size_t other_word_id : synsets_.at(synset_id)) { if (other_word_id != word.id) { const Word& other_word = words_.at(other_word_id); form.puzzles[kWhiteBottom].insert(other_word.base_form_id); } } } } } // Yellow Top { hatkirby::progress ppgs("Generating yellow top puzzles...", anaphone_sets_.size()); for (const std::vector& anaphone_set : anaphone_sets_) { ppgs.update(); std::set all_forms; for (size_t p_id : anaphone_set) { const Pronunciation& pronunciation = pronunciations_.at(p_id); for (size_t form_id : pronunciation.form_ids) { all_forms.insert(form_id); } } for (size_t f_id1 : all_forms) { for (size_t f_id2 : all_forms) { if (f_id1 != f_id2) { Form& form = forms_.at(f_id1); Form& form2 = forms_.at(f_id2); if (form.anagram_set_id == form2.anagram_set_id) { continue; } form.puzzles[kYellowTop].insert(f_id2); } } } } } // Yellow Middle { hatkirby::progress ppgs("Generating yellow middle puzzles...", anagram_sets_.size()); for (const std::vector& anagram_set : anagram_sets_) { ppgs.update(); for (size_t f_id1 : anagram_set) { for (size_t f_id2 : anagram_set) { if (f_id1 != f_id2) { Form& form = forms_.at(f_id1); form.puzzles[kYellowMiddle].insert(f_id2); } } } } } // Black Top { hatkirby::progress ppgs("Generating black top puzzles...", pronunciations_.size()); for (const Pronunciation& pronunciation : pronunciations_) { ppgs.update(); auto reversed_list = hatkirby::split>( pronunciation.stressless_phonemes, " "); std::reverse(reversed_list.begin(), reversed_list.end()); std::string reversed_phonemes = hatkirby::implode(reversed_list.begin(), reversed_list.end(), " "); if (pronunciations_by_blank_phonemes_.count(reversed_phonemes)) { std::set all_forms; for (size_t p_id : pronunciations_by_blank_phonemes_.at(reversed_phonemes)) { const Pronunciation& other_pronunciation = pronunciations_.at(p_id); for (size_t form_id : other_pronunciation.form_ids) { all_forms.insert(form_id); } } for (size_t f_id1 : pronunciation.form_ids) { for (size_t f_id2 : all_forms) { Form& form = forms_.at(f_id1); if (form.reverse_form_id == f_id2) { continue; } form.puzzles[kBlackTop].insert(f_id2); } } } } } // Black Middle { hatkirby::progress ppgs("Generating black middle puzzles...", forms_.size()); for (Form& form : forms_) { ppgs.update(); std::string reversed_text = form.text; std::reverse(reversed_text.begin(), reversed_text.end()); if (form_by_text_.count(reversed_text)) { form.puzzles[kBlackMiddle].insert(form_by_text_.at(reversed_text)); } } } // Black Bottom std::unordered_map> antonyms; { std::list lines(readFile(wordNetPath_ + "wn_ant.pl", true)); hatkirby::progress ppgs("Generating black bottom puzzles...", lines.size()); for (const std::string& line : lines) { ppgs.update(); std::regex relation( "^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) { continue; } std::pair lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2])); std::pair lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4])); if (word_by_wnid_and_wnum_.count(lookup1) && word_by_wnid_and_wnum_.count(lookup2)) { const Word& word1 = words_.at(word_by_wnid_and_wnum_.at(lookup1)); const Word& word2 = words_.at(word_by_wnid_and_wnum_.at(lookup2)); Form& form1 = forms_.at(word1.base_form_id); form1.puzzles[kBlackBottom].insert(word2.base_form_id); antonyms[word1.id].insert(word2.id); } } } // Black Double Bottom { hatkirby::progress ppgs("Generating black double bottom puzzles...", antonyms.size()); for (const auto& [word1, ant_words] : antonyms) { ppgs.update(); for (size_t word2 : ant_words) { const Word& word2_obj = words_.at(word2); const Form& form2 = forms_.at(word2_obj.base_form_id); for (size_t word25 : form2.word_ids) { if (word25 == word2) { continue; } const auto& double_ant_words = antonyms[word25]; for (size_t word3 : double_ant_words) { const Word& word1_obj = words_.at(word1); const Word& word3_obj = words_.at(word3); bool synset_overlap = false; for (size_t synset1 : word1_obj.synsets) { for (size_t synset3 : word3_obj.synsets) { if (synset1 == synset3) { synset_overlap = true; break; } } if (synset_overlap) { break; } } if (!synset_overlap) { Form& form1 = forms_.at(word1_obj.base_form_id); form1.puzzles[kDoubleBlackBottom].insert(word3_obj.base_form_id); } } } } } } // Red/Blue Top { std::map, std::vector> tokenized; for (const auto& [phonemes, pronunciations] : pronunciations_by_blank_phonemes_) { tokenized[hatkirby::split>(phonemes, " ")] = pronunciations; } hatkirby::progress ppgs("Generating top red/blue puzzles...", tokenized.size()); for (const auto& [phonemes, pronunciations] : tokenized) { ppgs.update(); std::set> visited; for (int i = 0; i < phonemes.size(); i++) { for (int l = 2; l <= phonemes.size() - i; l++) { if (i == 0 && l == phonemes.size()) { continue; } std::list sublist; for (auto j = std::next(phonemes.begin(), i); j != std::next(phonemes.begin(), i + l); j++) { sublist.push_back(*j); } if (tokenized.count(sublist) && !visited.count(sublist)) { visited.insert(sublist); for (size_t holophone_id : pronunciations) { for (size_t merophone_id : tokenized[sublist]) { const Pronunciation& holophone = pronunciations_.at(holophone_id); const Pronunciation& merophone = pronunciations_.at(merophone_id); for (size_t holo_form_id : holophone.form_ids) { Form& holo_form = forms_.at(holo_form_id); for (size_t mero_form_id : merophone.form_ids) { Form& mero_form = forms_.at(mero_form_id); if (holo_form.text.find(mero_form.text) != std::string::npos) { // We don't want top puzzles that are also middle puzzles. continue; } bool word_overlap = false; for (size_t holo_word_id : holo_form.word_ids) { for (size_t mero_word_id : mero_form.word_ids) { if (holo_word_id == mero_word_id) { word_overlap = true; break; } } if (word_overlap) { break; } } if (!word_overlap) { if (holo_form.text.size() <= mero_form.text.size() + 5) { holo_form.puzzles[kBlueTop].insert(mero_form_id); } mero_form.puzzles[kRedTop].insert(holo_form_id); } } } } } } } } } } // Red/Blue Middle std::unordered_map> left_shorter_by_longer; std::unordered_map> left_longer_by_shorter; std::unordered_map> right_shorter_by_longer; std::unordered_map> right_longer_by_shorter; { hatkirby::progress ppgs("Generating red/blue middle puzzles...", form_by_text_.size()); for (const auto& [text, form_id] : form_by_text_) { ppgs.update(); Form& holograph = forms_.at(form_id); std::unordered_set visited; for (int i = 0; i < text.size(); i++) { for (int l = 3; l <= text.size() - i; l++) { if (i == 0 && l == text.size()) { continue; } std::string substr = text.substr(i, l); if (form_by_text_.count(substr) && !visited.count(substr)) { visited.insert(substr); Form& merograph = forms_.at(form_by_text_.at(substr)); bool word_overlap = false; for (size_t holo_word_id : holograph.word_ids) { for (size_t mero_word_id : merograph.word_ids) { if (holo_word_id == mero_word_id) { word_overlap = true; break; } } if (word_overlap) { break; } } if (!word_overlap) { if (holograph.text.size() <= merograph.text.size() + 4) { holograph.puzzles[kBlueMiddle].insert(merograph.id); if (i == 0) { left_shorter_by_longer[form_id].insert(merograph.id); left_longer_by_shorter[merograph.id].insert(form_id); } else if (i + l == text.size()) { right_shorter_by_longer[form_id].insert(merograph.id); right_longer_by_shorter[merograph.id].insert(form_id); } } merograph.puzzles[kRedMiddle].insert(form_id); } } } } } } // Purple Middle { hatkirby::progress ppgs( "Generating purple middle puzzles...", left_shorter_by_longer.size() + right_shorter_by_longer.size()); for (const auto& [holograph_id, merograph_ids] : left_shorter_by_longer) { ppgs.update(); Form& holograph = forms_.at(holograph_id); for (size_t merograph_id : merograph_ids) { const Form& merograph = forms_.at(merograph_id); for (size_t other_id : left_longer_by_shorter[merograph_id]) { if (other_id != holograph_id) { const Form& other_form = forms_.at(other_id); if (holograph.text[merograph.text.size()] != other_form.text[merograph.text.size()]) { holograph.puzzles[kPurpleMiddle].insert(other_id); } } } } } for (const auto& [holograph_id, merograph_ids] : right_shorter_by_longer) { ppgs.update(); Form& holograph = forms_.at(holograph_id); for (size_t merograph_id : merograph_ids) { const Form& merograph = forms_.at(merograph_id); for (size_t other_id : right_longer_by_shorter[merograph_id]) { if (other_id != holograph_id) { const Form& other_form = forms_.at(other_id); if (holograph .text[holograph.text.size() - merograph.text.size() - 1] != other_form .text[other_form.text.size() - merograph.text.size() - 1]) { holograph.puzzles[kPurpleMiddle].insert(other_id); } } } } } } // Red/Blue Bottom std::unordered_map> meronyms_by_holonym; { std::list lines(readFile(wordNetPath_ + "wn_mm.pl")); hatkirby::progress ppgs("Reading member meronymy...", lines.size()); for (auto line : lines) { ppgs.update(); std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) { continue; } int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); if (synset_by_wnid_.count(lookup1) && synset_by_wnid_.count(lookup2)) { for (size_t word_id1 : synsets_.at(synset_by_wnid_.at(lookup1))) { for (size_t word_id2 : synsets_.at(synset_by_wnid_.at(lookup2))) { meronyms_by_holonym[word_id1].insert(word_id2); } } } } } { std::list lines(readFile(wordNetPath_ + "wn_mp.pl")); hatkirby::progress ppgs("Reading part meronymy...", lines.size()); for (auto line : lines) { ppgs.update(); std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) { continue; } int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); if (synset_by_wnid_.count(lookup1) && synset_by_wnid_.count(lookup2)) { for (size_t word_id1 : synsets_.at(synset_by_wnid_.at(lookup1))) { for (size_t word_id2 : synsets_.at(synset_by_wnid_.at(lookup2))) { meronyms_by_holonym[word_id1].insert(word_id2); } } } } } { std::list lines(readFile(wordNetPath_ + "wn_ms.pl")); hatkirby::progress ppgs("Reading substance meronymy...", lines.size()); for (auto line : lines) { ppgs.update(); std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\."); std::smatch relation_data; if (!std::regex_search(line, relation_data, relation)) { continue; } int lookup1 = std::stoi(relation_data[1]); int lookup2 = std::stoi(relation_data[2]); if (synset_by_wnid_.count(lookup1) && synset_by_wnid_.count(lookup2)) { for (size_t word_id1 : synsets_.at(synset_by_wnid_.at(lookup1))) { for (size_t word_id2 : synsets_.at(synset_by_wnid_.at(lookup2))) { meronyms_by_holonym[word_id1].insert(word_id2); } } } } } { hatkirby::progress ppgs("Generating red/blue bottom puzzles...", meronyms_by_holonym.size()); for (const auto& [holonym_id, meronym_ids] : meronyms_by_holonym) { ppgs.update(); for (size_t meronym_id : meronym_ids) { const Word& holonym_word = words_.at(holonym_id); const Word& meronym_word = words_.at(meronym_id); Form& holonym_form = forms_.at(holonym_word.base_form_id); Form& meronym_form = forms_.at(meronym_word.base_form_id); holonym_form.puzzles[kBlueBottom].insert(meronym_form.id); meronym_form.puzzles[kRedBottom].insert(holonym_form.id); } } } // Purple Top { hatkirby::progress ppgs("Generating purple top puzzles...", pronunciations_by_rhyme_.size()); for (const auto& [rhyme, pronunciation_ids] : pronunciations_by_rhyme_) { ppgs.update(); for (size_t p_id1 : pronunciation_ids) { const Pronunciation& p1 = pronunciations_.at(p_id1); if (p1.prerhyme.empty()) { continue; } for (size_t p_id2 : pronunciation_ids) { const Pronunciation& p2 = pronunciations_.at(p_id2); if (p2.prerhyme.empty()) { continue; } if (p1.prerhyme != p2.prerhyme) { for (size_t f_id1 : p1.form_ids) { for (size_t f_id2 : p2.form_ids) { if (f_id1 != f_id2) { Form& form1 = forms_.at(f_id1); const Form& form2 = forms_.at(f_id2); if (std::abs(static_cast(form1.text.size()) - static_cast(form2.text.size())) <= 4) { form1.puzzles[kPurpleTop].insert(f_id2); } } } } } } } } } // Count up all of the generated puzzles. int total_puzzles = 0; int reusable_words = 0; std::unordered_map per_puzzle_type; for (const Form& form : forms_) { for (const auto& [puzzle_type, puzzles] : form.puzzles) { total_puzzles += puzzles.size(); per_puzzle_type[puzzle_type]++; } if (form.puzzles.size() > 1) { reusable_words++; } } std::cout << "Puzzles: " << total_puzzles << std::endl; std::cout << "Reusable words: " << reusable_words << std::endl; std::cout << "White tops: " << per_puzzle_type[kWhiteTop] << std::endl; std::cout << "White bottom: " << per_puzzle_type[kWhiteBottom] << std::endl; std::cout << "Yellow tops: " << per_puzzle_type[kYellowTop] << std::endl; std::cout << "Yellow middles: " << per_puzzle_type[kYellowMiddle] << std::endl; std::cout << "Black tops: " << per_puzzle_type[kBlackTop] << std::endl; std::cout << "Black middles: " << per_puzzle_type[kBlackMiddle] << std::endl; std::cout << "Black bottoms: " << per_puzzle_type[kBlackBottom] << std::endl; std::cout << "Black double bottoms: " << per_puzzle_type[kDoubleBlackBottom] << std::endl; std::cout << "Red tops: " << per_puzzle_type[kRedTop] << std::endl; std::cout << "Red middles: " << per_puzzle_type[kRedMiddle] << std::endl; std::cout << "Red bottoms: " << per_puzzle_type[kRedBottom] << std::endl; std::cout << "Blue tops: " << per_puzzle_type[kBlueTop] << std::endl; std::cout << "Blue middles: " << per_puzzle_type[kBlueMiddle] << std::endl; std::cout << "Blue bottoms: " << per_puzzle_type[kBlueBottom] << std::endl; std::cout << "Purple tops: " << per_puzzle_type[kPurpleTop] << std::endl; std::cout << "Purple middles: " << per_puzzle_type[kPurpleMiddle] << std::endl; std::vector form_entry; form_entry.reserve(forms_.size()); for (const Form& form : forms_) { if (form.puzzles.empty()) { form_entry.push_back(fmt::format("\"{}\"", form.text)); } else { std::vector entry_per_type; for (const auto& [puzzle_type, puzzles] : form.puzzles) { std::vector entry_per_puzzle; for (size_t puzzle : puzzles) { entry_per_puzzle.push_back(std::to_string(puzzle)); } entry_per_type.push_back( fmt::format("{}:[{}]", static_cast(puzzle_type), hatkirby::implode(entry_per_puzzle, ","))); } form_entry.push_back(fmt::format("[\"{}\",{{{}}}]", form.text, hatkirby::implode(entry_per_type, ","))); } } std::ofstream output_file(outputPath_); output_file << "extends Node\n\nvar forms = [" << hatkirby::implode(form_entry, ",") << "]" << std::endl; } size_t generator::LookupOrCreatePronunciation(const std::string& phonemes) { if (pronunciation_by_phonemes_.count(phonemes)) { return pronunciation_by_phonemes_[phonemes]; } else { size_t pronunciation_id = pronunciations_.size(); auto phonemeList = hatkirby::split>(phonemes, " "); std::list::iterator rhymeStart = std::find_if(std::begin(phonemeList), std::end(phonemeList), [](std::string phoneme) { return phoneme.find("1") != std::string::npos; }); // Rhyme detection std::string prerhyme = ""; std::string rhyme = ""; if (rhymeStart != std::end(phonemeList)) { std::list rhymePhonemes; std::transform( rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), [](std::string phoneme) { std::string naked; std::remove_copy_if(std::begin(phoneme), std::end(phoneme), std::back_inserter(naked), [](char ch) { return std::isdigit(ch); }); return naked; }); rhyme = hatkirby::implode(std::begin(rhymePhonemes), std::end(rhymePhonemes), " "); if (rhymeStart != std::begin(phonemeList)) { prerhyme = *std::prev(rhymeStart); } pronunciations_by_rhyme_[rhyme].push_back(pronunciation_id); } std::string stressless; for (int i = 0; i < phonemes.size(); i++) { if (!std::isdigit(phonemes[i])) { stressless.push_back(phonemes[i]); } } auto stresslessList = hatkirby::split>(stressless, " "); std::string stresslessPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); std::sort(stresslessList.begin(), stresslessList.end()); std::string sortedPhonemes = hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); pronunciations_.push_back({.id = pronunciation_id, .phonemes = phonemes, .prerhyme = prerhyme, .rhyme = rhyme, .stressless_phonemes = stresslessPhonemes}); AddPronunciationToAnaphoneSet(pronunciation_id, sortedPhonemes); pronunciation_by_phonemes_[phonemes] = pronunciation_id; pronunciations_by_blank_phonemes_[stresslessPhonemes].push_back( pronunciation_id); return pronunciation_id; } } size_t generator::LookupOrCreateForm(const std::string& word) { if (form_by_text_.count(word)) { return form_by_text_[word]; } else { size_t form_id = forms_.size(); form_by_text_[word] = form_id; forms_.push_back({.id = form_id, .text = word}); std::string sortedText = word; std::sort(sortedText.begin(), sortedText.end()); AddFormToAnagramSet(form_id, sortedText); return form_id; } } size_t generator::LookupOrCreateWord(const std::string& word) { size_t word_id = words_.size(); words_by_base_[word].push_back(word_id); size_t form_id = LookupOrCreateForm(word); words_.push_back({.id = word_id, .base_form_id = form_id}); AddFormToWord(form_id, word_id); return word_id; } void generator::AddPronunciationToForm(size_t pronunciation_id, size_t form_id) { pronunciations_[pronunciation_id].form_ids.push_back(form_id); forms_[form_id].pronunciation_ids.push_back(pronunciation_id); } void generator::AddFormToWord(size_t form_id, size_t word_id) { words_[word_id].form_ids.push_back(form_id); forms_[form_id].word_ids.push_back(word_id); } void generator::AddWordToSynset(size_t word_id, int wnid) { if (!synset_by_wnid_.count(wnid)) { synset_by_wnid_[wnid] = synsets_.size(); synsets_.push_back({word_id}); words_[word_id].synsets.push_back(synsets_.size() - 1); } else { size_t synset_id = synset_by_wnid_[wnid]; synsets_[synset_id].push_back(word_id); words_[word_id].synsets.push_back(synset_id); } } void generator::AddFormToAnagramSet(size_t form_id, const std::string& sorted_letters) { if (!anagram_set_by_sorted_letters_.count(sorted_letters)) { anagram_set_by_sorted_letters_[sorted_letters] = anagram_sets_.size(); anagram_sets_.push_back({form_id}); forms_[form_id].anagram_set_id = anagram_sets_.size() - 1; } else { size_t anagram_set_id = anagram_set_by_sorted_letters_[sorted_letters]; anagram_sets_[anagram_set_id].push_back(form_id); forms_[form_id].anagram_set_id = anagram_set_id; } } void generator::AddPronunciationToAnaphoneSet( size_t pronunciation_id, const std::string& sorted_phonemes) { if (!anaphone_set_by_sorted_phonemes_.count(sorted_phonemes)) { anaphone_set_by_sorted_phonemes_[sorted_phonemes] = anaphone_sets_.size(); anaphone_sets_.push_back({pronunciation_id}); pronunciations_[pronunciation_id].anaphone_set_id = anaphone_sets_.size() - 1; } else { size_t anaphone_set_id = anaphone_set_by_sorted_phonemes_[sorted_phonemes]; anaphone_sets_[anaphone_set_id].push_back(pronunciation_id); pronunciations_[pronunciation_id].anaphone_set_id = anaphone_set_id; } }