#include "generator.h"

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <list>
#include <regex>
#include <set>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// NOTE(review): hatkirby::progress / split / implode / lowercase are used
// below; confirm the correct hkutil include paths for this project.
#include <hkutil/progress.h>
#include <hkutil/string.h>

// Minimum corpus frequency for a word to be considered common enough to
// include in the generated dataset.
constexpr int MIN_FREQUENCY = 2000000;

namespace {

// Reads a text file into a list of lines, stripping a trailing '\r' (so CRLF
// files behave like LF files). If uniq is true, the returned lines are sorted
// and deduplicated. Throws std::invalid_argument if the file cannot be opened.
std::list<std::string> readFile(std::string path, bool uniq = false) {
  std::ifstream file(path);
  if (!file) {
    throw std::invalid_argument("Could not find file " + path);
  }

  std::list<std::string> lines;
  std::string line;
  while (std::getline(file, line)) {
    // Guard against empty lines: calling back() on an empty string is UB.
    if (!line.empty() && line.back() == '\r') {
      line.pop_back();
    }
    lines.push_back(line);
  }

  if (uniq) {
    std::vector<std::string> sorted(std::begin(lines), std::end(lines));
    lines.clear();
    std::sort(std::begin(sorted), std::end(sorted));
    std::unique_copy(std::begin(sorted), std::end(sorted),
                     std::back_inserter(lines));
  }

  return lines;
}

}  // namespace

// Validates that all input data files exist up front, so a bad path fails
// fast instead of partway through a long run. Throws std::invalid_argument
// on any missing file.
generator::generator(std::string agidPath, std::string wordNetPath,
                     std::string cmudictPath, std::string wordfreqPath,
                     std::string outputPath)
    : agidPath_(agidPath),
      wordNetPath_(wordNetPath),
      cmudictPath_(cmudictPath),
      wordfreqPath_(wordfreqPath),
      outputPath_(outputPath) {
  // Ensure AGID infl.txt exists
  if (!std::ifstream(agidPath_)) {
    throw std::invalid_argument("AGID infl.txt file not found");
  }

  // Add directory separator to WordNet path (empty check avoids UB on back()).
  if (wordNetPath_.empty() ||
      ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\'))) {
    wordNetPath_ += '/';
  }

  // Ensure WordNet tables exist
  for (std::string table : {"s", "sk", "ant", "at", "cls", "hyp", "ins", "mm",
                            "mp", "ms", "per", "sa", "sim", "syntax"}) {
    if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) {
      throw std::invalid_argument("WordNet " + table + " table not found");
    }
  }

  // Ensure CMUDICT file exists
  if (!std::ifstream(cmudictPath_)) {
    throw std::invalid_argument("CMUDICT file not found");
  }
}

// Main pipeline: ingest word frequencies, WordNet synsets, AGID inflections,
// and CMUDICT pronunciations; then derive the per-form puzzle sets and print
// summary statistics.
void generator::run() {
  std::unordered_map<std::string, int> word_frequencies;

  {
    std::list<std::string> lines(readFile(wordfreqPath_));
    hatkirby::progress ppgs("Reading word frequencies...", lines.size());

    // Hoisted out of the loop: constructing a std::regex per line is very
    // expensive and the pattern never changes.
    std::regex freqline("([a-z]+),([0-9]+)");

    for (std::string line : lines) {
      ppgs.update();

      std::smatch freqline_data;
      if (std::regex_search(line, freqline_data, freqline)) {
        std::string text = freqline_data[1];
        std::string freqnumstr = freqline_data[2];
        long long freqnumnum = std::atoll(freqnumstr.c_str());

        // Clamp to int range: raw counts in the frequency file can overflow.
        word_frequencies[text] =
            freqnumnum > std::numeric_limits<int>::max()
                ? std::numeric_limits<int>::max()
                : static_cast<int>(freqnumnum);
      }
    }
  }

  {
    std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
    hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size());

    std::set<std::pair<int, int>> wnid_and_wnum;

    // Matches WordNet Prolog s(synset_id, w_num, 'word', ss_type, sense_num,
    // tag_count) facts. Hoisted out of the loop for performance.
    std::regex relation(
        "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");

    for (std::string line : lines) {
      ppgs.update();

      std::smatch relation_data;
      if (!std::regex_search(line, relation_data, relation)) {
        continue;
      }

      int synset_id = std::stoi(relation_data[1]);
      int wnum = std::stoi(relation_data[2]);
      std::string text = relation_data[3];
      [[maybe_unused]] int tag_count = std::stoi(relation_data[4]);

      // Collapse the Prolog escape for apostrophes ('' -> ').
      size_t word_it;
      while ((word_it = text.find("''")) != std::string::npos) {
        text.erase(word_it, 1);
      }

      // The word must be common enough.
      if (word_frequencies[text] < MIN_FREQUENCY) {
        continue;
      }

      // We are looking for single words.
      if (std::count(std::begin(text), std::end(text), ' ') > 0) {
        continue;
      }

      // This should filter our proper nouns. (Cast avoids UB for chars
      // outside the unsigned-char range.)
      if (std::any_of(std::begin(text), std::end(text), [](char ch) {
            return std::isupper(static_cast<unsigned char>(ch));
          })) {
        continue;
      }

      // The WordNet data does contain duplicates, so we need to check that we
      // haven't already created this word.
      std::pair<int, int> lookup(synset_id, wnum);
      if (wnid_and_wnum.count(lookup)) {
        continue;
      }
      wnid_and_wnum.insert(lookup);

      size_t word_id = LookupOrCreateWord(text);
      AddWordToSynset(word_id, synset_id);
    }
  }

  {
    std::list<std::string> lines(readFile(agidPath_));
    hatkirby::progress ppgs("Reading inflections from AGID...", lines.size());

    for (std::string line : lines) {
      ppgs.update();

      // AGID line format: "<word> <type>[?]: <forms>", where type is
      // V (verb), A (adjective/adverb), or N (noun).
      size_t divider = line.find_first_of(" ");
      std::string infinitive = line.substr(0, divider);
      line = line.substr(divider + 1);
      char type = line[0];

      if (line[1] == '?') {
        line.erase(0, 4);
      } else {
        line.erase(0, 3);
      }

      // Only keep inflections for words we already know, or for verbs that
      // are themselves common enough to introduce.
      if (!word_by_base_.count(infinitive) &&
          !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) {
        continue;
      }

      size_t word_id = LookupOrCreateWord(infinitive);

      // Form groups are separated by " | "; alternatives within a group by
      // ", ".
      auto inflWordList = hatkirby::split<std::list<std::string>>(line, " | ");

      std::vector<std::list<std::string>> agidForms;
      for (std::string inflForms : inflWordList) {
        auto inflFormList =
            hatkirby::split<std::list<std::string>>(std::move(inflForms), ", ");

        std::list<std::string> forms;
        for (std::string inflForm : inflFormList) {
          // Strip AGID annotation markers (~ < ! ?) and anything after them.
          size_t sympos = inflForm.find_first_of("~<!? ");
          if (sympos != std::string::npos) {
            inflForm = inflForm.substr(0, sympos);
          }

          forms.push_back(std::move(inflForm));
        }

        agidForms.push_back(std::move(forms));
      }

      std::list<std::list<std::string>> inflections;
      switch (type) {
        case 'V': {
          if (agidForms.size() == 4) {
            inflections.push_back(agidForms[0]);
            inflections.push_back(agidForms[1]);
            inflections.push_back(agidForms[2]);
            inflections.push_back(agidForms[3]);
          } else if (agidForms.size() == 3) {
            inflections.push_back(agidForms[0]);
            inflections.push_back(agidForms[1]);
            inflections.push_back(agidForms[2]);
          } else if (agidForms.size() == 8) {
            // As of AGID 2014.08.11, this is only "to be"
            inflections.push_back(agidForms[0]);
            inflections.push_back(agidForms[2]);
            inflections.push_back(agidForms[3]);
            inflections.push_back(agidForms[4]);
          } else {
            // Words that don't fit the cases above as of AGID 2014.08.11:
            // - may and shall do not conjugate the way we want them to
            // - methinks only has a past tense and is an outlier
            // - wit has five forms, and is archaic/obscure enough that we can
            //   ignore it for now
            std::cout << " Ignoring verb \"" << infinitive
                      << "\" due to non-standard number of forms."
                      << std::endl;
          }

          break;
        }

        case 'A': {
          if (agidForms.size() == 2) {
            inflections.push_back(agidForms[0]);
            inflections.push_back(agidForms[1]);
          } else {
            // As of AGID 2014.08.11, this is only "only", which has only the
            // form "onliest"
            std::cout << " Ignoring adjective/adverb \"" << infinitive
                      << "\" due to non-standard number of forms."
                      << std::endl;
          }

          break;
        }

        case 'N': {
          if (agidForms.size() == 1) {
            inflections.push_back(agidForms[0]);
          } else {
            // As of AGID 2014.08.11, this is non-existent.
            std::cout << " Ignoring noun \"" << infinitive
                      << "\" due to non-standard number of forms."
                      << std::endl;
          }

          break;
        }
      }

      // Compile the forms we have mapped.
      for (const std::list<std::string>& infl_list : inflections) {
        for (const std::string& infl : infl_list) {
          size_t form_id = LookupOrCreateForm(infl);
          AddFormToWord(form_id, word_id);
        }
      }
    }
  }

  word_frequencies.clear();  // Not needed anymore.

  {
    std::list<std::string> lines(readFile(cmudictPath_));
    hatkirby::progress ppgs("Reading pronunciations from CMUDICT...",
                            lines.size());

    // CMUDICT format: "WORD(variant)  PH0 NE1 MES" — the word and the phoneme
    // list are separated by two spaces. Hoisted out of the loop.
    std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))?  ([A-Z 0-9]+)");

    for (std::string line : lines) {
      ppgs.update();

      std::smatch phoneme_data;
      if (std::regex_search(line, phoneme_data, phoneme)) {
        std::string canonical = hatkirby::lowercase(phoneme_data[1]);

        // Only attach pronunciations to forms we already created.
        if (!form_by_text_.count(canonical)) {
          continue;
        }

        std::string phonemes = phoneme_data[2];
        size_t pronunciation_id = LookupOrCreatePronunciation(phonemes);
        AddPronunciationToForm(pronunciation_id, form_by_text_[canonical]);
      }
    }
  }

  std::cout << "Words: " << words_.size() << std::endl;
  std::cout << "Forms: " << forms_.size() << std::endl;
  std::cout << "Pronunciations: " << pronunciations_.size() << std::endl;

  // White Top: forms that sound identical (homophones share a pronunciation).
  {
    hatkirby::progress ppgs("Generating white top puzzles...", forms_.size());
    for (Form& form : forms_) {
      ppgs.update();

      for (size_t p_id : form.pronunciation_ids) {
        const Pronunciation& pronunciation = pronunciations_.at(p_id);
        for (size_t other_form_id : pronunciation.form_ids) {
          if (other_form_id != form.id) {
            form.puzzles[kWhiteTop].insert(other_form_id);
          }
        }
      }
    }
  }

  // White Bottom: words that share a WordNet synset (synonyms).
  {
    hatkirby::progress ppgs("Generating white bottom puzzles...",
                            words_.size());
    for (const Word& word : words_) {
      ppgs.update();

      Form& form = forms_.at(word.base_form_id);
      for (size_t synset_id : word.synsets) {
        for (size_t other_word_id : synsets_.at(synset_id)) {
          if (other_word_id != word.id) {
            const Word& other_word = words_.at(other_word_id);
            form.puzzles[kWhiteBottom].insert(other_word.base_form_id);
          }
        }
      }
    }
  }

  // Yellow Top: forms whose pronunciations are anaphones (same multiset of
  // phonemes).
  {
    hatkirby::progress ppgs("Generating yellow top puzzles...",
                            anaphone_sets_.size());
    for (const std::vector<size_t>& anaphone_set : anaphone_sets_) {
      ppgs.update();

      std::set<size_t> all_forms;
      for (size_t p_id : anaphone_set) {
        const Pronunciation& pronunciation = pronunciations_.at(p_id);
        for (size_t form_id : pronunciation.form_ids) {
          all_forms.insert(form_id);
        }
      }

      for (size_t f_id1 : all_forms) {
        for (size_t f_id2 : all_forms) {
          if (f_id1 != f_id2) {
            Form& form = forms_.at(f_id1);
            form.puzzles[kYellowTop].insert(f_id2);
          }
        }
      }
    }
  }

  // Yellow Middle: forms that are anagrams of each other.
  {
    hatkirby::progress ppgs("Generating yellow middle puzzles...",
                            anagram_sets_.size());
    for (const std::vector<size_t>& anagram_set : anagram_sets_) {
      ppgs.update();

      for (size_t f_id1 : anagram_set) {
        for (size_t f_id2 : anagram_set) {
          if (f_id1 != f_id2) {
            Form& form = forms_.at(f_id1);
            form.puzzles[kYellowMiddle].insert(f_id2);
          }
        }
      }
    }
  }

  // Black Top: forms whose phoneme sequences are reverses of each other.
  {
    hatkirby::progress ppgs("Generating black top puzzles...",
                            pronunciations_.size());
    for (const Pronunciation& pronunciation : pronunciations_) {
      ppgs.update();

      auto reversed_list = hatkirby::split<std::vector<std::string>>(
          pronunciation.stressless_phonemes, " ");
      std::reverse(reversed_list.begin(), reversed_list.end());
      std::string reversed_phonemes =
          hatkirby::implode(reversed_list.begin(), reversed_list.end(), " ");

      if (pronunciations_by_blank_phonemes_.count(reversed_phonemes)) {
        std::set<size_t> all_forms;
        for (size_t p_id :
             pronunciations_by_blank_phonemes_.at(reversed_phonemes)) {
          const Pronunciation& other_pronunciation = pronunciations_.at(p_id);
          for (size_t form_id : other_pronunciation.form_ids) {
            all_forms.insert(form_id);
          }
        }

        for (size_t f_id1 : pronunciation.form_ids) {
          for (size_t f_id2 : all_forms) {
            Form& form = forms_.at(f_id1);
            form.puzzles[kBlackTop].insert(f_id2);
          }
        }
      }
    }
  }

  // Black Middle: forms whose spellings are reverses of each other.
  {
    hatkirby::progress ppgs("Generating black middle puzzles...",
                            forms_.size());
    for (Form& form : forms_) {
      ppgs.update();

      std::string reversed_text = form.text;
      std::reverse(reversed_text.begin(), reversed_text.end());

      if (form_by_text_.count(reversed_text)) {
        form.puzzles[kBlackMiddle].insert(form_by_text_.at(reversed_text));
      }
    }
  }

  // Count up all of the generated puzzles.
  int total_puzzles = 0;
  int reusable_words = 0;
  // Keyed by the puzzle-type constants (kWhiteTop etc.); decltype keeps this
  // correct whether they are a plain enum or an enum class.
  // NOTE(review): per_puzzle_type counts FORMS that have at least one puzzle
  // of each type (incremented once per form), not total puzzles of that type
  // — confirm that is the intended statistic.
  std::map<decltype(kWhiteTop), int> per_puzzle_type;
  for (const Form& form : forms_) {
    for (const auto& [puzzle_type, puzzles] : form.puzzles) {
      total_puzzles += puzzles.size();
      per_puzzle_type[puzzle_type]++;
    }
    if (form.puzzles.size() > 1) {
      reusable_words++;
    }
  }

  std::cout << "Puzzles: " << total_puzzles << std::endl;
  std::cout << "Reusable words: " << reusable_words << std::endl;
  std::cout << "White tops: " << per_puzzle_type[kWhiteTop] << std::endl;
  std::cout << "White bottom: " << per_puzzle_type[kWhiteBottom] << std::endl;
  std::cout << "Yellow tops: " << per_puzzle_type[kYellowTop] << std::endl;
  std::cout << "Yellow middles: " << per_puzzle_type[kYellowMiddle]
            << std::endl;
  std::cout << "Black tops: " << per_puzzle_type[kBlackTop] << std::endl;
  std::cout << "Black middles: " << per_puzzle_type[kBlackMiddle] << std::endl;
}

// Returns the id of the pronunciation with the given phoneme string, creating
// it (and indexing its rhyme, stressless, and sorted-phoneme keys) if new.
size_t generator::LookupOrCreatePronunciation(const std::string& phonemes) {
  if (pronunciation_by_phonemes_.count(phonemes)) {
    return pronunciation_by_phonemes_[phonemes];
  } else {
    size_t pronunciation_id = pronunciations_.size();

    auto phonemeList = hatkirby::split<std::list<std::string>>(phonemes, " ");

    // The rhyme starts at the vowel carrying primary stress (marked "1").
    std::list<std::string>::iterator rhymeStart =
        std::find_if(std::begin(phonemeList), std::end(phonemeList),
                     [](std::string phoneme) {
                       return phoneme.find("1") != std::string::npos;
                     });

    // Rhyme detection
    std::string prerhyme = "";
    std::string rhyme = "";
    if (rhymeStart != std::end(phonemeList)) {
      std::list<std::string> rhymePhonemes;

      // Copy the tail of the pronunciation with stress digits removed.
      std::transform(
          rhymeStart, std::end(phonemeList),
          std::back_inserter(rhymePhonemes), [](std::string phoneme) {
            std::string naked;
            std::remove_copy_if(
                std::begin(phoneme), std::end(phoneme),
                std::back_inserter(naked), [](char ch) {
                  return std::isdigit(static_cast<unsigned char>(ch));
                });
            return naked;
          });

      rhyme = hatkirby::implode(std::begin(rhymePhonemes),
                                std::end(rhymePhonemes), " ");

      // The phoneme just before the rhyme disambiguates e.g. "-ing" rhymes.
      if (rhymeStart != std::begin(phonemeList)) {
        prerhyme = *std::prev(rhymeStart);
      }

      pronunciations_by_rhyme_[rhyme].push_back(pronunciation_id);
    }

    // Strip stress digits to get the "blank" phoneme string.
    std::string stressless;
    for (size_t i = 0; i < phonemes.size(); i++) {
      if (!std::isdigit(static_cast<unsigned char>(phonemes[i]))) {
        stressless.push_back(phonemes[i]);
      }
    }

    auto stresslessList =
        hatkirby::split<std::vector<std::string>>(stressless, " ");
    std::string stresslessPhonemes =
        hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
    // Sorted phonemes are the key for anaphone (phoneme-anagram) sets.
    std::sort(stresslessList.begin(), stresslessList.end());
    std::string sortedPhonemes =
        hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");

    pronunciations_.push_back({.id = pronunciation_id,
                               .phonemes = phonemes,
                               .prerhyme = prerhyme,
                               .rhyme = rhyme,
                               .stressless_phonemes = stresslessPhonemes});

    AddPronunciationToAnaphoneSet(pronunciation_id, sortedPhonemes);

    pronunciation_by_phonemes_[phonemes] = pronunciation_id;
    pronunciations_by_blank_phonemes_[stresslessPhonemes].push_back(
        pronunciation_id);

    return pronunciation_id;
  }
}

// Returns the id of the form with the given text, creating it (and indexing
// it into its anagram set) if new.
size_t generator::LookupOrCreateForm(const std::string& word) {
  if (form_by_text_.count(word)) {
    return form_by_text_[word];
  } else {
    size_t form_id = forms_.size();
    form_by_text_[word] = form_id;
    forms_.push_back({.id = form_id, .text = word});

    // Sorted letters are the key for anagram sets.
    std::string sortedText = word;
    std::sort(sortedText.begin(), sortedText.end());
    AddFormToAnagramSet(form_id, sortedText);

    return form_id;
  }
}

// Returns the id of the word with the given base text, creating the word and
// its base form if new.
size_t generator::LookupOrCreateWord(const std::string& word) {
  if (word_by_base_.count(word)) {
    return word_by_base_[word];
  } else {
    size_t word_id = words_.size();
    word_by_base_[word] = words_.size();

    size_t form_id = LookupOrCreateForm(word);
    words_.push_back({.id = word_id, .base_form_id = form_id});
    AddFormToWord(form_id, word_id);

    return word_id;
  }
}

// Links a pronunciation and a form bidirectionally.
void generator::AddPronunciationToForm(size_t pronunciation_id,
                                       size_t form_id) {
  pronunciations_[pronunciation_id].form_ids.push_back(form_id);
  forms_[form_id].pronunciation_ids.push_back(pronunciation_id);
}

// Links a form and a word bidirectionally.
void generator::AddFormToWord(size_t form_id, size_t word_id) {
  words_[word_id].form_ids.push_back(form_id);
  forms_[form_id].word_ids.push_back(word_id);
}

// Adds a word to the synset identified by WordNet id wnid, creating the
// synset on first sight.
void generator::AddWordToSynset(size_t word_id, int wnid) {
  if (!synset_by_wnid_.count(wnid)) {
    synset_by_wnid_[wnid] = synsets_.size();
    synsets_.push_back({word_id});
    words_[word_id].synsets.push_back(synsets_.size() - 1);
  } else {
    size_t synset_id = synset_by_wnid_[wnid];
    synsets_[synset_id].push_back(word_id);
    words_[word_id].synsets.push_back(synset_id);
  }
}

// Adds a form to the anagram set keyed by its sorted letters, creating the
// set on first sight.
void generator::AddFormToAnagramSet(size_t form_id,
                                    const std::string& sorted_letters) {
  if (!anagram_set_by_sorted_letters_.count(sorted_letters)) {
    anagram_set_by_sorted_letters_[sorted_letters] = anagram_sets_.size();
    anagram_sets_.push_back({form_id});
    forms_[form_id].anagram_set_id = anagram_sets_.size() - 1;
  } else {
    size_t anagram_set_id = anagram_set_by_sorted_letters_[sorted_letters];
    anagram_sets_[anagram_set_id].push_back(form_id);
    forms_[form_id].anagram_set_id = anagram_set_id;
  }
}

// Adds a pronunciation to the anaphone set keyed by its sorted stressless
// phonemes, creating the set on first sight.
void generator::AddPronunciationToAnaphoneSet(
    size_t pronunciation_id, const std::string& sorted_phonemes) {
  if (!anaphone_set_by_sorted_phonemes_.count(sorted_phonemes)) {
    anaphone_set_by_sorted_phonemes_[sorted_phonemes] = anaphone_sets_.size();
    anaphone_sets_.push_back({pronunciation_id});
    pronunciations_[pronunciation_id].anaphone_set_id =
        anaphone_sets_.size() - 1;
  } else {
    size_t anaphone_set_id = anaphone_set_by_sorted_phonemes_[sorted_phonemes];
    anaphone_sets_[anaphone_set_id].push_back(pronunciation_id);
    pronunciations_[pronunciation_id].anaphone_set_id = anaphone_set_id;
  }
}