From 90fbd47ca02f1a723134302ca978a7f9ef0eac04 Mon Sep 17 00:00:00 2001 From: Star Rauchenberger Date: Sat, 2 Dec 2023 17:03:54 -0500 Subject: Starting to get some puzzles randomized --- generator/.clang-format | 2 + generator/CMakeLists.txt | 9 + generator/generator.cpp | 611 +++++++++++++++++++++++++++++++++++++++++++++++ generator/generator.h | 114 +++++++++ generator/main.cpp | 35 +++ 5 files changed, 771 insertions(+) create mode 100644 generator/.clang-format create mode 100644 generator/CMakeLists.txt create mode 100644 generator/generator.cpp create mode 100644 generator/generator.h create mode 100644 generator/main.cpp (limited to 'generator') diff --git a/generator/.clang-format b/generator/.clang-format new file mode 100644 index 0000000..8de7fe6 --- /dev/null +++ b/generator/.clang-format @@ -0,0 +1,2 @@ +--- + BasedOnStyle: Google diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt new file mode 100644 index 0000000..343e5cb --- /dev/null +++ b/generator/CMakeLists.txt @@ -0,0 +1,9 @@ +cmake_minimum_required (VERSION 3.1) +project (generator) + +include_directories(../vendor/hkutil) + +add_executable(generator generator.cpp main.cpp) +set_property(TARGET generator PROPERTY CXX_STANDARD 17) +set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) +#target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) diff --git a/generator/generator.cpp b/generator/generator.cpp new file mode 100644 index 0000000..7ab69b5 --- /dev/null +++ b/generator/generator.cpp @@ -0,0 +1,611 @@ +#include "generator.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +constexpr int MIN_FREQUENCY = 2000000; + +namespace { + +std::list readFile(std::string path, bool uniq = false) { + std::ifstream file(path); + if (!file) { + throw std::invalid_argument("Could not find file " + path); + } + + std::list lines; + std::string line; + while (std::getline(file, line)) { + if (line.back() == '\r') { + line.pop_back(); + } + + lines.push_back(line); + } + + if (uniq) { + std::vector uniq(std::begin(lines), std::end(lines)); + lines.clear(); + + std::sort(std::begin(uniq), std::end(uniq)); + std::unique_copy(std::begin(uniq), std::end(uniq), + std::back_inserter(lines)); + } + + return lines; +} + +} // namespace + +generator::generator(std::string agidPath, std::string wordNetPath, + std::string cmudictPath, std::string wordfreqPath, + std::string outputPath) + : agidPath_(agidPath), + wordNetPath_(wordNetPath), + cmudictPath_(cmudictPath), + wordfreqPath_(wordfreqPath), + outputPath_(outputPath) { + // Ensure AGID infl.txt exists + if (!std::ifstream(agidPath_)) { + throw std::invalid_argument("AGID infl.txt file not found"); + } + + // Add directory separator to WordNet path + if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) { + wordNetPath_ += '/'; + } + + // Ensure WordNet tables exist + for (std::string table : {"s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", + "mp", "ms", "per", "sa", "sim", "syntax"}) { + if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) { + throw std::invalid_argument("WordNet " + table + " table not found"); + } + } + + // Ensure CMUDICT file exists + if (!std::ifstream(cmudictPath_)) { + throw std::invalid_argument("CMUDICT file not found"); + } +} + +void generator::run() { + std::unordered_map word_frequencies; + { + std::list lines(readFile(wordfreqPath_)); + + hatkirby::progress ppgs("Reading word frequencies...", lines.size()); + + for (std::string line : lines) { + ppgs.update(); + + std::regex freqline("([a-z]+),([0-9]+)"); + std::smatch freqline_data; + if (std::regex_search(line, freqline_data, freqline)) { + std::string text = freqline_data[1]; + std::string freqnumstr = freqline_data[2]; + long long freqnumnum = std::atoll(freqnumstr.c_str()); + word_frequencies[text] = freqnumnum > std::numeric_limits::max() + ? std::numeric_limits::max() + : freqnumnum; + } + } + } + + { + std::list lines(readFile(wordNetPath_ + "wn_s.pl")); + hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); + + std::set> wnid_and_wnum; + for (std::string line : lines) { + ppgs.update(); + + std::regex relation( + "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); + + std::smatch relation_data; + if (!std::regex_search(line, relation_data, relation)) { + continue; + } + + int synset_id = std::stoi(relation_data[1]); + int wnum = std::stoi(relation_data[2]); + std::string text = relation_data[3]; + int tag_count = std::stoi(relation_data[4]); + size_t word_it; + while ((word_it = text.find("''")) != std::string::npos) { + text.erase(word_it, 1); + } + + // The word must be common enough. + if (word_frequencies[text] < MIN_FREQUENCY) { + continue; + } + + // We are looking for single words. + if (std::count(std::begin(text), std::end(text), ' ') > 0) { + continue; + } + + // This should filter our proper nouns. + if (std::any_of(std::begin(text), std::end(text), ::isupper)) { + continue; + } + + // The WordNet data does contain duplicates, so we need to check that we + // haven't already created this word. + std::pair lookup(synset_id, wnum); + if (wnid_and_wnum.count(lookup)) { + continue; + } + + wnid_and_wnum.insert(lookup); + + size_t word_id = LookupOrCreateWord(text); + AddWordToSynset(word_id, synset_id); + } + } + + { + std::list lines(readFile(agidPath_)); + hatkirby::progress ppgs("Reading inflections from AGID...", lines.size()); + + for (std::string line : lines) { + ppgs.update(); + + int divider = line.find_first_of(" "); + std::string infinitive = line.substr(0, divider); + line = line.substr(divider + 1); + char type = line[0]; + + if (line[1] == '?') { + line.erase(0, 4); + } else { + line.erase(0, 3); + } + + if (!word_by_base_.count(infinitive) && + !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { + continue; + } + + size_t word_id = LookupOrCreateWord(infinitive); + + auto inflWordList = hatkirby::split>(line, " | "); + + std::vector> agidForms; + for (std::string inflForms : inflWordList) { + auto inflFormList = + hatkirby::split>(std::move(inflForms), ", "); + + std::list forms; + for (std::string inflForm : inflFormList) { + int sympos = inflForm.find_first_of("~> inflections; + switch (type) { + case 'V': { + if (agidForms.size() == 4) { + inflections.push_back(agidForms[0]); + inflections.push_back(agidForms[1]); + inflections.push_back(agidForms[2]); + inflections.push_back(agidForms[3]); + } else if (agidForms.size() == 3) { + inflections.push_back(agidForms[0]); + inflections.push_back(agidForms[1]); + inflections.push_back(agidForms[2]); + } else if (agidForms.size() == 8) { + // As of AGID 2014.08.11, this is only "to be" + inflections.push_back(agidForms[0]); + inflections.push_back(agidForms[2]); + inflections.push_back(agidForms[3]); + inflections.push_back(agidForms[4]); + } else { + // Words that don't fit the cases above as of AGID 2014.08.11: + // - may and shall do not conjugate the way we want them to + // - methinks only has a past tense and is an outlier + // - wit has five forms, and is archaic/obscure enough that we can + // ignore it for now + std::cout << " Ignoring verb \"" << infinitive + << "\" due to non-standard number of forms." << std::endl; + } + + break; + } + + case 'A': { + if (agidForms.size() == 2) { + inflections.push_back(agidForms[0]); + inflections.push_back(agidForms[1]); + } else { + // As of AGID 2014.08.11, this is only "only", which has only the + // form "onliest" + std::cout << " Ignoring adjective/adverb \"" << infinitive + << "\" due to non-standard number of forms." << std::endl; + } + + break; + } + + case 'N': { + if (agidForms.size() == 1) { + inflections.push_back(agidForms[0]); + } else { + // As of AGID 2014.08.11, this is non-existent. + std::cout << " Ignoring noun \"" << infinitive + << "\" due to non-standard number of forms." << std::endl; + } + + break; + } + } + + // Compile the forms we have mapped. + for (const std::list& infl_list : inflections) { + for (const std::string& infl : infl_list) { + size_t form_id = LookupOrCreateForm(infl); + AddFormToWord(form_id, word_id); + } + } + } + } + + word_frequencies.clear(); // Not needed anymore. + + { + std::list lines(readFile(cmudictPath_)); + + hatkirby::progress ppgs("Reading pronunciations from CMUDICT...", + lines.size()); + + for (std::string line : lines) { + ppgs.update(); + + std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); + std::smatch phoneme_data; + if (std::regex_search(line, phoneme_data, phoneme)) { + std::string canonical = hatkirby::lowercase(phoneme_data[1]); + + if (!form_by_text_.count(canonical)) { + continue; + } + + std::string phonemes = phoneme_data[2]; + size_t pronunciation_id = LookupOrCreatePronunciation(phonemes); + AddPronunciationToForm(pronunciation_id, form_by_text_[canonical]); + } + } + } + + std::cout << "Words: " << words_.size() << std::endl; + std::cout << "Forms: " << forms_.size() << std::endl; + std::cout << "Pronunciations: " << pronunciations_.size() << std::endl; + + // White Top + { + hatkirby::progress ppgs("Generating white top puzzles...", forms_.size()); + + for (Form& form : forms_) { + ppgs.update(); + + for (size_t p_id : form.pronunciation_ids) { + const Pronunciation& pronunciation = pronunciations_.at(p_id); + for (size_t other_form_id : pronunciation.form_ids) { + if (other_form_id != form.id) { + form.puzzles[kWhiteTop].insert(other_form_id); + } + } + } + } + } + + // White Bottom + { + hatkirby::progress ppgs("Generating white bottom puzzles...", + words_.size()); + + for (const Word& word : words_) { + ppgs.update(); + + Form& form = forms_.at(word.base_form_id); + for (size_t synset_id : word.synsets) { + for (size_t other_word_id : synsets_.at(synset_id)) { + if (other_word_id != word.id) { + const Word& other_word = words_.at(other_word_id); + form.puzzles[kWhiteBottom].insert(other_word.base_form_id); + } + } + } + } + } + + // Yellow Top + { + hatkirby::progress ppgs("Generating yellow top puzzles...", + anaphone_sets_.size()); + + for (const std::vector& anaphone_set : anaphone_sets_) { + ppgs.update(); + + std::set all_forms; + for (size_t p_id : anaphone_set) { + const Pronunciation& pronunciation = pronunciations_.at(p_id); + for (size_t form_id : pronunciation.form_ids) { + all_forms.insert(form_id); + } + } + + for (size_t f_id1 : all_forms) { + for (size_t f_id2 : all_forms) { + if (f_id1 != f_id2) { + Form& form = forms_.at(f_id1); + form.puzzles[kYellowTop].insert(f_id2); + } + } + } + } + } + + // Yellow Middle + { + hatkirby::progress ppgs("Generating yellow middle puzzles...", + anagram_sets_.size()); + + for (const std::vector& anagram_set : anagram_sets_) { + ppgs.update(); + + for (size_t f_id1 : anagram_set) { + for (size_t f_id2 : anagram_set) { + if (f_id1 != f_id2) { + Form& form = forms_.at(f_id1); + form.puzzles[kYellowMiddle].insert(f_id2); + } + } + } + } + } + + // Black Top + { + hatkirby::progress ppgs("Generating black top puzzles...", + pronunciations_.size()); + + for (const Pronunciation& pronunciation : pronunciations_) { + ppgs.update(); + + auto reversed_list = hatkirby::split>( + pronunciation.stressless_phonemes, " "); + std::reverse(reversed_list.begin(), reversed_list.end()); + std::string reversed_phonemes = + hatkirby::implode(reversed_list.begin(), reversed_list.end(), " "); + if (pronunciations_by_blank_phonemes_.count(reversed_phonemes)) { + std::set all_forms; + + for (size_t p_id : + pronunciations_by_blank_phonemes_.at(reversed_phonemes)) { + const Pronunciation& other_pronunciation = pronunciations_.at(p_id); + for (size_t form_id : other_pronunciation.form_ids) { + all_forms.insert(form_id); + } + } + + for (size_t f_id1 : pronunciation.form_ids) { + for (size_t f_id2 : all_forms) { + Form& form = forms_.at(f_id1); + form.puzzles[kBlackTop].insert(f_id2); + } + } + } + } + } + + // Black Middle + { + hatkirby::progress ppgs("Generating black middle puzzles...", + forms_.size()); + + for (Form& form : forms_) { + ppgs.update(); + + std::string reversed_text = form.text; + std::reverse(reversed_text.begin(), reversed_text.end()); + + if (form_by_text_.count(reversed_text)) { + form.puzzles[kBlackMiddle].insert(form_by_text_.at(reversed_text)); + } + } + } + + // Count up all of the generated puzzles. + int total_puzzles = 0; + int reusable_words = 0; + std::unordered_map per_puzzle_type; + for (const Form& form : forms_) { + for (const auto& [puzzle_type, puzzles] : form.puzzles) { + total_puzzles += puzzles.size(); + per_puzzle_type[puzzle_type]++; + } + if (form.puzzles.size() > 1) { + reusable_words++; + } + } + std::cout << "Puzzles: " << total_puzzles << std::endl; + std::cout << "Reusable words: " << reusable_words << std::endl; + std::cout << "White tops: " << per_puzzle_type[kWhiteTop] << std::endl; + std::cout << "White bottom: " << per_puzzle_type[kWhiteBottom] << std::endl; + std::cout << "Yellow tops: " << per_puzzle_type[kYellowTop] << std::endl; + std::cout << "Yellow middles: " << per_puzzle_type[kYellowMiddle] + << std::endl; + std::cout << "Black tops: " << per_puzzle_type[kBlackTop] << std::endl; + std::cout << "Black middles: " << per_puzzle_type[kBlackMiddle] << std::endl; +} + +size_t generator::LookupOrCreatePronunciation(const std::string& phonemes) { + if (pronunciation_by_phonemes_.count(phonemes)) { + return pronunciation_by_phonemes_[phonemes]; + } else { + size_t pronunciation_id = pronunciations_.size(); + + auto phonemeList = hatkirby::split>(phonemes, " "); + + std::list::iterator rhymeStart = + std::find_if(std::begin(phonemeList), std::end(phonemeList), + [](std::string phoneme) { + return phoneme.find("1") != std::string::npos; + }); + + // Rhyme detection + std::string prerhyme = ""; + std::string rhyme = ""; + if (rhymeStart != std::end(phonemeList)) { + std::list rhymePhonemes; + + std::transform( + rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), + [](std::string phoneme) { + std::string naked; + + std::remove_copy_if(std::begin(phoneme), std::end(phoneme), + std::back_inserter(naked), + [](char ch) { return std::isdigit(ch); }); + + return naked; + }); + + rhyme = hatkirby::implode(std::begin(rhymePhonemes), + std::end(rhymePhonemes), " "); + + if (rhymeStart != std::begin(phonemeList)) { + prerhyme = *std::prev(rhymeStart); + } + + pronunciations_by_rhyme_[rhyme].push_back(pronunciation_id); + } + + std::string stressless; + for (int i = 0; i < phonemes.size(); i++) { + if (!std::isdigit(phonemes[i])) { + stressless.push_back(phonemes[i]); + } + } + auto stresslessList = + hatkirby::split>(stressless, " "); + std::string stresslessPhonemes = + hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); + std::sort(stresslessList.begin(), stresslessList.end()); + std::string sortedPhonemes = + hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); + + pronunciations_.push_back({.id = pronunciation_id, + .phonemes = phonemes, + .prerhyme = prerhyme, + .rhyme = rhyme, + .stressless_phonemes = stresslessPhonemes}); + + AddPronunciationToAnaphoneSet(pronunciation_id, sortedPhonemes); + + pronunciation_by_phonemes_[phonemes] = pronunciation_id; + pronunciations_by_blank_phonemes_[stresslessPhonemes].push_back( + pronunciation_id); + + return pronunciation_id; + } +} + +size_t generator::LookupOrCreateForm(const std::string& word) { + if (form_by_text_.count(word)) { + return form_by_text_[word]; + } else { + size_t form_id = forms_.size(); + form_by_text_[word] = form_id; + forms_.push_back({.id = form_id, .text = word}); + + std::string sortedText = word; + std::sort(sortedText.begin(), sortedText.end()); + AddFormToAnagramSet(form_id, sortedText); + + return form_id; + } +} + +size_t generator::LookupOrCreateWord(const std::string& word) { + if (word_by_base_.count(word)) { + return word_by_base_[word]; + } else { + size_t word_id = words_.size(); + word_by_base_[word] = words_.size(); + size_t form_id = LookupOrCreateForm(word); + words_.push_back({.id = word_id, .base_form_id = form_id}); + AddFormToWord(form_id, word_id); + return word_id; + } +} + +void generator::AddPronunciationToForm(size_t pronunciation_id, + size_t form_id) { + pronunciations_[pronunciation_id].form_ids.push_back(form_id); + forms_[form_id].pronunciation_ids.push_back(pronunciation_id); +} + +void generator::AddFormToWord(size_t form_id, size_t word_id) { + words_[word_id].form_ids.push_back(form_id); + forms_[form_id].word_ids.push_back(word_id); +} + +void generator::AddWordToSynset(size_t word_id, int wnid) { + if (!synset_by_wnid_.count(wnid)) { + synset_by_wnid_[wnid] = synsets_.size(); + synsets_.push_back({word_id}); + words_[word_id].synsets.push_back(synsets_.size() - 1); + } else { + size_t synset_id = synset_by_wnid_[wnid]; + synsets_[synset_id].push_back(word_id); + words_[word_id].synsets.push_back(synset_id); + } +} + +void generator::AddFormToAnagramSet(size_t form_id, + const std::string& sorted_letters) { + if (!anagram_set_by_sorted_letters_.count(sorted_letters)) { + anagram_set_by_sorted_letters_[sorted_letters] = anagram_sets_.size(); + anagram_sets_.push_back({form_id}); + forms_[form_id].anagram_set_id = anagram_sets_.size() - 1; + } else { + size_t anagram_set_id = anagram_set_by_sorted_letters_[sorted_letters]; + anagram_sets_[anagram_set_id].push_back(form_id); + forms_[form_id].anagram_set_id = anagram_set_id; + } +} + +void generator::AddPronunciationToAnaphoneSet( + size_t pronunciation_id, const std::string& sorted_phonemes) { + if (!anaphone_set_by_sorted_phonemes_.count(sorted_phonemes)) { + anaphone_set_by_sorted_phonemes_[sorted_phonemes] = anaphone_sets_.size(); + anaphone_sets_.push_back({pronunciation_id}); + pronunciations_[pronunciation_id].anaphone_set_id = + anaphone_sets_.size() - 1; + } else { + size_t anaphone_set_id = anaphone_set_by_sorted_phonemes_[sorted_phonemes]; + anaphone_sets_[anaphone_set_id].push_back(pronunciation_id); + pronunciations_[pronunciation_id].anaphone_set_id = anaphone_set_id; + } +} diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..a97b0b0 --- /dev/null +++ b/generator/generator.h @@ -0,0 +1,114 @@ +#ifndef GENERATOR_H_D5C6A724 +#define GENERATOR_H_D5C6A724 + +#include +#include +#include +#include +#include + +enum PuzzleType { + kWhiteTop = 0, + kWhiteBottom = 1, + kYellowTop = 2, + kYellowMiddle = 3, + kBlackTop = 4, + kBlackMiddle = 5, +}; + +class generator { + public: + // Constructor + + generator(std::string agidPath, std::string wordNetPath, + std::string cmudictPath, std::string wordfreqPath, + std::string outputPath); + + // Action + + void run(); + + private: + // Helpers + + size_t LookupOrCreatePronunciation(const std::string& phonemes); + + size_t LookupOrCreateForm(const std::string& form); + + size_t LookupOrCreateWord(const std::string& word); + + void AddPronunciationToForm(size_t pronunciation_id, size_t form_id); + + void AddFormToWord(size_t form_id, size_t word_id); + + void AddWordToSynset(size_t word_id, int wnid); + + void AddFormToAnagramSet(size_t form_id, const std::string& sorted_letters); + + void AddPronunciationToAnaphoneSet(size_t pronunciation_id, + const std::string& sorted_phonemes); + + // Input + + std::string agidPath_; + std::string wordNetPath_; + std::string cmudictPath_; + std::string wordfreqPath_; + + // Output + + std::string outputPath_; + + // Indexes + + struct Pronunciation { + size_t id; + std::string phonemes; + std::string prerhyme; + std::string rhyme; + std::vector form_ids; + std::optional anaphone_set_id; + std::string stressless_phonemes; + }; + + struct Form { + size_t id; + std::string text; + std::vector word_ids; + std::vector pronunciation_ids; + std::optional anagram_set_id; + std::optional reverse_form_id; + + std::unordered_map> puzzles; + }; + + struct Word { + size_t id; + size_t base_form_id; + std::vector form_ids; + std::vector synsets; + }; + + std::vector pronunciations_; + std::unordered_map pronunciation_by_phonemes_; + std::unordered_map> pronunciations_by_rhyme_; + std::unordered_map> + pronunciations_by_blank_phonemes_; + + std::vector> anaphone_sets_; + std::unordered_map anaphone_set_by_sorted_phonemes_; + + std::vector
forms_; + std::unordered_map form_by_text_; + + std::vector> anagram_sets_; + std::unordered_map anagram_set_by_sorted_letters_; + + std::vector words_; + std::unordered_map word_by_base_; + + std::vector> synsets_; + std::unordered_map synset_by_wnid_; +}; + +#endif /* end of include guard: GENERATOR_H_D5C6A724 */ \ No newline at end of file diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..c958421 --- /dev/null +++ b/generator/main.cpp @@ -0,0 +1,35 @@ +#include +#include + +#include "generator.h" + +void printUsage() { + std::cout << "usage: generator agid wordnet cmudict wordfreq output" + << std::endl; + std::cout << "agid :: path to an AGID infl.txt file" << std::endl; + std::cout << "wordnet :: path to a WordNet prolog data directory" + << std::endl; + std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; + std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; + std::cout << "output :: datafile output path" << std::endl; +} + +int main(int argc, char** argv) { + if (argc == 6) { + try { + generator app(argv[1], argv[2], argv[3], argv[4], argv[5]); + + try { + app.run(); + } catch (const std::exception& e) { + std::cout << e.what() << std::endl; + } + } catch (const std::exception& e) { + std::cout << e.what() << std::endl; + printUsage(); + } + } else { + std::cout << "lingo randomizer generator" << std::endl; + printUsage(); + } +} -- cgit 1.4.1