From 90fbd47ca02f1a723134302ca978a7f9ef0eac04 Mon Sep 17 00:00:00 2001
From: Star Rauchenberger <fefferburbia@gmail.com>
Date: Sat, 2 Dec 2023 17:03:54 -0500
Subject: Starting to get some puzzles randomized

---
 generator/generator.cpp | 611 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 611 insertions(+)
 create mode 100644 generator/generator.cpp

(limited to 'generator/generator.cpp')
diff --git a/generator/generator.cpp b/generator/generator.cpp
new file mode 100644
index 0000000..7ab69b5
--- /dev/null
+++ b/generator/generator.cpp
@@ -0,0 +1,611 @@
+#include "generator.h"
+
+#include <hkutil/progress.h>
+#include <hkutil/string.h>
+
+#include <algorithm>
+#include <fstream>
+#include <list>
+#include <regex>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+constexpr int MIN_FREQUENCY = 2000000;
+
+namespace {
+
+std::list<std::string> readFile(std::string path, bool uniq = false) {
+  std::ifstream file(path);
+  if (!file) {
+    throw std::invalid_argument("Could not find file " + path);
+  }
+
+  std::list<std::string> lines;
+  std::string line;
+  while (std::getline(file, line)) {
+    if (line.back() == '\r') {
+      line.pop_back();
+    }
+
+    lines.push_back(line);
+  }
+
+  if (uniq) {
+    std::vector<std::string> uniq(std::begin(lines), std::end(lines));
+    lines.clear();
+
+    std::sort(std::begin(uniq), std::end(uniq));
+    std::unique_copy(std::begin(uniq), std::end(uniq),
+                     std::back_inserter(lines));
+  }
+
+  return lines;
+}
+
+}  // namespace
+
+generator::generator(std::string agidPath, std::string wordNetPath,
+                     std::string cmudictPath, std::string wordfreqPath,
+                     std::string outputPath)
+    : agidPath_(agidPath),
+      wordNetPath_(wordNetPath),
+      cmudictPath_(cmudictPath),
+      wordfreqPath_(wordfreqPath),
+      outputPath_(outputPath) {
+  // Ensure AGID infl.txt exists
+  if (!std::ifstream(agidPath_)) {
+    throw std::invalid_argument("AGID infl.txt file not found");
+  }
+
+  // Add directory separator to WordNet path
+  if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) {
+    wordNetPath_ += '/';
+  }
+
+  // Ensure WordNet tables exist
+  for (std::string table : {"s", "sk", "ant", "at", "cls", "hyp", "ins", "mm",
+                            "mp", "ms", "per", "sa", "sim", "syntax"}) {
+    if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) {
+      throw std::invalid_argument("WordNet " + table + " table not found");
+    }
+  }
+
+  // Ensure CMUDICT file exists
+  if (!std::ifstream(cmudictPath_)) {
+    throw std::invalid_argument("CMUDICT file not found");
+  }
+}
+
+void generator::run() {
+  std::unordered_map<std::string, int> word_frequencies;
+  {
+    std::list<std::string> lines(readFile(wordfreqPath_));
+
+    hatkirby::progress ppgs("Reading word frequencies...", lines.size());
+
+    for (std::string line : lines) {
+      ppgs.update();
+
+      std::regex freqline("([a-z]+),([0-9]+)");
+      std::smatch freqline_data;
+      if (std::regex_search(line, freqline_data, freqline)) {
+        std::string text = freqline_data[1];
+        std::string freqnumstr = freqline_data[2];
+        long long freqnumnum = std::atoll(freqnumstr.c_str());
+        word_frequencies[text] = freqnumnum > std::numeric_limits<int>::max()
+                                     ? std::numeric_limits<int>::max()
+                                     : freqnumnum;
+      }
+    }
+  }
+
+  {
+    std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
+    hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size());
+
+    std::set<std::pair<int, int>> wnid_and_wnum;
+    for (std::string line : lines) {
+      ppgs.update();
+
+      std::regex relation(
+          "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");
+
+      std::smatch relation_data;
+      if (!std::regex_search(line, relation_data, relation)) {
+        continue;
+      }
+
+      int synset_id = std::stoi(relation_data[1]);
+      int wnum = std::stoi(relation_data[2]);
+      std::string text = relation_data[3];
+      int tag_count = std::stoi(relation_data[4]);
+      size_t word_it;
+      while ((word_it = text.find("''")) != std::string::npos) {
+        text.erase(word_it, 1);
+      }
+
+      // The word must be common enough.
+      if (word_frequencies[text] < MIN_FREQUENCY) {
+        continue;
+      }
+
+      // We are looking for single words.
+      if (std::count(std::begin(text), std::end(text), ' ') > 0) {
+        continue;
+      }
+
+      // This should filter our proper nouns.
+      if (std::any_of(std::begin(text), std::end(text), ::isupper)) {
+        continue;
+      }
+
+      // The WordNet data does contain duplicates, so we need to check that we
+      // haven't already created this word.
+      std::pair<int, int> lookup(synset_id, wnum);
+      if (wnid_and_wnum.count(lookup)) {
+        continue;
+      }
+
+      wnid_and_wnum.insert(lookup);
+
+      size_t word_id = LookupOrCreateWord(text);
+      AddWordToSynset(word_id, synset_id);
+    }
+  }
+
+  {
+    std::list<std::string> lines(readFile(agidPath_));
+    hatkirby::progress ppgs("Reading inflections from AGID...", lines.size());
+
+    for (std::string line : lines) {
+      ppgs.update();
+
+      int divider = line.find_first_of(" ");
+      std::string infinitive = line.substr(0, divider);
+      line = line.substr(divider + 1);
+      char type = line[0];
+
+      if (line[1] == '?') {
+        line.erase(0, 4);
+      } else {
+        line.erase(0, 3);
+      }
+
+      if (!word_by_base_.count(infinitive) &&
+          !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) {
+        continue;
+      }
+
+      size_t word_id = LookupOrCreateWord(infinitive);
+
+      auto inflWordList = hatkirby::split<std::list<std::string>>(line, " | ");
+
+      std::vector<std::list<std::string>> agidForms;
+      for (std::string inflForms : inflWordList) {
+        auto inflFormList =
+            hatkirby::split<std::list<std::string>>(std::move(inflForms), ", ");
+
+        std::list<std::string> forms;
+        for (std::string inflForm : inflFormList) {
+          int sympos = inflForm.find_first_of("~<!? ");
+          if (sympos != std::string::npos) {
+            inflForm = inflForm.substr(0, sympos);
+          }
+
+          forms.push_back(std::move(inflForm));
+        }
+
+        agidForms.push_back(std::move(forms));
+      }
+
+      std::vector<std::list<std::string>> inflections;
+      switch (type) {
+        case 'V': {
+          if (agidForms.size() == 4) {
+            inflections.push_back(agidForms[0]);
+            inflections.push_back(agidForms[1]);
+            inflections.push_back(agidForms[2]);
+            inflections.push_back(agidForms[3]);
+          } else if (agidForms.size() == 3) {
+            inflections.push_back(agidForms[0]);
+            inflections.push_back(agidForms[1]);
+            inflections.push_back(agidForms[2]);
+          } else if (agidForms.size() == 8) {
+            // As of AGID 2014.08.11, this is only "to be"
+            inflections.push_back(agidForms[0]);
+            inflections.push_back(agidForms[2]);
+            inflections.push_back(agidForms[3]);
+            inflections.push_back(agidForms[4]);
+          } else {
+            // Words that don't fit the cases above as of AGID 2014.08.11:
+            // - may and shall do not conjugate the way we want them to
+            // - methinks only has a past tense and is an outlier
+            // - wit has five forms, and is archaic/obscure enough that we can
+            // ignore it for now
+            std::cout << " Ignoring verb \"" << infinitive
+                      << "\" due to non-standard number of forms." << std::endl;
+          }
+
+          break;
+        }
+
+        case 'A': {
+          if (agidForms.size() == 2) {
+            inflections.push_back(agidForms[0]);
+            inflections.push_back(agidForms[1]);
+          } else {
+            // As of AGID 2014.08.11, this is only "only", which has only the
+            // form "onliest"
+            std::cout << " Ignoring adjective/adverb \"" << infinitive
+                      << "\" due to non-standard number of forms." << std::endl;
+          }
+
+          break;
+        }
+
+        case 'N': {
+          if (agidForms.size() == 1) {
+            inflections.push_back(agidForms[0]);
+          } else {
+            // As of AGID 2014.08.11, this is non-existent.
+            std::cout << " Ignoring noun \"" << infinitive
+                      << "\" due to non-standard number of forms." << std::endl;
+          }
+
+          break;
+        }
+      }
+
+      // Compile the forms we have mapped.
+      for (const std::list<std::string>& infl_list : inflections) {
+        for (const std::string& infl : infl_list) {
+          size_t form_id = LookupOrCreateForm(infl);
+          AddFormToWord(form_id, word_id);
+        }
+      }
+    }
+  }
+
+  word_frequencies.clear();  // Not needed anymore.
+
+  {
+    std::list<std::string> lines(readFile(cmudictPath_));
+
+    hatkirby::progress ppgs("Reading pronunciations from CMUDICT...",
+                            lines.size());
+
+    for (std::string line : lines) {
+      ppgs.update();
+
+      std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))?  ([A-Z 0-9]+)");
+      std::smatch phoneme_data;
+      if (std::regex_search(line, phoneme_data, phoneme)) {
+        std::string canonical = hatkirby::lowercase(phoneme_data[1]);
+
+        if (!form_by_text_.count(canonical)) {
+          continue;
+        }
+
+        std::string phonemes = phoneme_data[2];
+        size_t pronunciation_id = LookupOrCreatePronunciation(phonemes);
+        AddPronunciationToForm(pronunciation_id, form_by_text_[canonical]);
+      }
+    }
+  }
+
+  std::cout << "Words: " << words_.size() << std::endl;
+  std::cout << "Forms: " << forms_.size() << std::endl;
+  std::cout << "Pronunciations: " << pronunciations_.size() << std::endl;
+
+  // White Top
+  {
+    hatkirby::progress ppgs("Generating white top puzzles...", forms_.size());
+
+    for (Form& form : forms_) {
+      ppgs.update();
+
+      for (size_t p_id : form.pronunciation_ids) {
+        const Pronunciation& pronunciation = pronunciations_.at(p_id);
+        for (size_t other_form_id : pronunciation.form_ids) {
+          if (other_form_id != form.id) {
+            form.puzzles[kWhiteTop].insert(other_form_id);
+          }
+        }
+      }
+    }
+  }
+
+  // White Bottom
+  {
+    hatkirby::progress ppgs("Generating white bottom puzzles...",
+                            words_.size());
+
+    for (const Word& word : words_) {
+      ppgs.update();
+
+      Form& form = forms_.at(word.base_form_id);
+      for (size_t synset_id : word.synsets) {
+        for (size_t other_word_id : synsets_.at(synset_id)) {
+          if (other_word_id != word.id) {
+            const Word& other_word = words_.at(other_word_id);
+            form.puzzles[kWhiteBottom].insert(other_word.base_form_id);
+          }
+        }
+      }
+    }
+  }
+
+  // Yellow Top
+  {
+    hatkirby::progress ppgs("Generating yellow top puzzles...",
+                            anaphone_sets_.size());
+
+    for (const std::vector<size_t>& anaphone_set : anaphone_sets_) {
+      ppgs.update();
+
+      std::set<size_t> all_forms;
+      for (size_t p_id : anaphone_set) {
+        const Pronunciation& pronunciation = pronunciations_.at(p_id);
+        for (size_t form_id : pronunciation.form_ids) {
+          all_forms.insert(form_id);
+        }
+      }
+
+      for (size_t f_id1 : all_forms) {
+        for (size_t f_id2 : all_forms) {
+          if (f_id1 != f_id2) {
+            Form& form = forms_.at(f_id1);
+            form.puzzles[kYellowTop].insert(f_id2);
+          }
+        }
+      }
+    }
+  }
+
+  // Yellow Middle
+  {
+    hatkirby::progress ppgs("Generating yellow middle puzzles...",
+                            anagram_sets_.size());
+
+    for (const std::vector<size_t>& anagram_set : anagram_sets_) {
+      ppgs.update();
+
+      for (size_t f_id1 : anagram_set) {
+        for (size_t f_id2 : anagram_set) {
+          if (f_id1 != f_id2) {
+            Form& form = forms_.at(f_id1);
+            form.puzzles[kYellowMiddle].insert(f_id2);
+          }
+        }
+      }
+    }
+  }
+
+  // Black Top
+  {
+    hatkirby::progress ppgs("Generating black top puzzles...",
+                            pronunciations_.size());
+
+    for (const Pronunciation& pronunciation : pronunciations_) {
+      ppgs.update();
+
+      auto reversed_list = hatkirby::split<std::vector<std::string>>(
+          pronunciation.stressless_phonemes, " ");
+      std::reverse(reversed_list.begin(), reversed_list.end());
+      std::string reversed_phonemes =
+          hatkirby::implode(reversed_list.begin(), reversed_list.end(), " ");
+      if (pronunciations_by_blank_phonemes_.count(reversed_phonemes)) {
+        std::set<size_t> all_forms;
+
+        for (size_t p_id :
+             pronunciations_by_blank_phonemes_.at(reversed_phonemes)) {
+          const Pronunciation& other_pronunciation = pronunciations_.at(p_id);
+          for (size_t form_id : other_pronunciation.form_ids) {
+            all_forms.insert(form_id);
+          }
+        }
+
+        for (size_t f_id1 : pronunciation.form_ids) {
+          for (size_t f_id2 : all_forms) {
+            Form& form = forms_.at(f_id1);
+            form.puzzles[kBlackTop].insert(f_id2);
+          }
+        }
+      }
+    }
+  }
+
+  // Black Middle
+  {
+    hatkirby::progress ppgs("Generating black middle puzzles...",
+                            forms_.size());
+
+    for (Form& form : forms_) {
+      ppgs.update();
+
+      std::string reversed_text = form.text;
+      std::reverse(reversed_text.begin(), reversed_text.end());
+
+      if (form_by_text_.count(reversed_text)) {
+        form.puzzles[kBlackMiddle].insert(form_by_text_.at(reversed_text));
+      }
+    }
+  }
+
+  // Count up all of the generated puzzles.
+  int total_puzzles = 0;
+  int reusable_words = 0;
+  std::unordered_map<PuzzleType, int> per_puzzle_type;
+  for (const Form& form : forms_) {
+    for (const auto& [puzzle_type, puzzles] : form.puzzles) {
+      total_puzzles += puzzles.size();
+      per_puzzle_type[puzzle_type]++;
+    }
+    if (form.puzzles.size() > 1) {
+      reusable_words++;
+    }
+  }
+  std::cout << "Puzzles: " << total_puzzles << std::endl;
+  std::cout << "Reusable words: " << reusable_words << std::endl;
+  std::cout << "White tops: " << per_puzzle_type[kWhiteTop] << std::endl;
+  std::cout << "White bottom: " << per_puzzle_type[kWhiteBottom] << std::endl;
+  std::cout << "Yellow tops: " << per_puzzle_type[kYellowTop] << std::endl;
+  std::cout << "Yellow middles: " << per_puzzle_type[kYellowMiddle]
+            << std::endl;
+  std::cout << "Black tops: " << per_puzzle_type[kBlackTop] << std::endl;
+  std::cout << "Black middles: " << per_puzzle_type[kBlackMiddle] << std::endl;
+}
+
+size_t generator::LookupOrCreatePronunciation(const std::string& phonemes) {
+  if (pronunciation_by_phonemes_.count(phonemes)) {
+    return pronunciation_by_phonemes_[phonemes];
+  } else {
+    size_t pronunciation_id = pronunciations_.size();
+
+    auto phonemeList = hatkirby::split<std::list<std::string>>(phonemes, " ");
+
+    std::list<std::string>::iterator rhymeStart =
+        std::find_if(std::begin(phonemeList), std::end(phonemeList),
+                     [](std::string phoneme) {
+                       return phoneme.find("1") != std::string::npos;
+                     });
+
+    // Rhyme detection
+    std::string prerhyme = "";
+    std::string rhyme = "";
+    if (rhymeStart != std::end(phonemeList)) {
+      std::list<std::string> rhymePhonemes;
+
+      std::transform(
+          rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes),
+          [](std::string phoneme) {
+            std::string naked;
+
+            std::remove_copy_if(std::begin(phoneme), std::end(phoneme),
+                                std::back_inserter(naked),
+                                [](char ch) { return std::isdigit(ch); });
+
+            return naked;
+          });
+
+      rhyme = hatkirby::implode(std::begin(rhymePhonemes),
+                                std::end(rhymePhonemes), " ");
+
+      if (rhymeStart != std::begin(phonemeList)) {
+        prerhyme = *std::prev(rhymeStart);
+      }
+
+      pronunciations_by_rhyme_[rhyme].push_back(pronunciation_id);
+    }
+
+    std::string stressless;
+    for (int i = 0; i < phonemes.size(); i++) {
+      if (!std::isdigit(phonemes[i])) {
+        stressless.push_back(phonemes[i]);
+      }
+    }
+    auto stresslessList =
+        hatkirby::split<std::vector<std::string>>(stressless, " ");
+    std::string stresslessPhonemes =
+        hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
+    std::sort(stresslessList.begin(), stresslessList.end());
+    std::string sortedPhonemes =
+        hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
+
+    pronunciations_.push_back({.id = pronunciation_id,
+                               .phonemes = phonemes,
+                               .prerhyme = prerhyme,
+                               .rhyme = rhyme,
+                               .stressless_phonemes = stresslessPhonemes});
+
+    AddPronunciationToAnaphoneSet(pronunciation_id, sortedPhonemes);
+
+    pronunciation_by_phonemes_[phonemes] = pronunciation_id;
+    pronunciations_by_blank_phonemes_[stresslessPhonemes].push_back(
+        pronunciation_id);
+
+    return pronunciation_id;
+  }
+}
+
+size_t generator::LookupOrCreateForm(const std::string& word) {
+  if (form_by_text_.count(word)) {
+    return form_by_text_[word];
+  } else {
+    size_t form_id = forms_.size();
+    form_by_text_[word] = form_id;
+    forms_.push_back({.id = form_id, .text = word});
+
+    std::string sortedText = word;
+    std::sort(sortedText.begin(), sortedText.end());
+    AddFormToAnagramSet(form_id, sortedText);
+
+    return form_id;
+  }
+}
+
+size_t generator::LookupOrCreateWord(const std::string& word) {
+  if (word_by_base_.count(word)) {
+    return word_by_base_[word];
+  } else {
+    size_t word_id = words_.size();
+    word_by_base_[word] = words_.size();
+    size_t form_id = LookupOrCreateForm(word);
+    words_.push_back({.id = word_id, .base_form_id = form_id});
+    AddFormToWord(form_id, word_id);
+    return word_id;
+  }
+}
+
+void generator::AddPronunciationToForm(size_t pronunciation_id,
+                                       size_t form_id) {
+  pronunciations_[pronunciation_id].form_ids.push_back(form_id);
+  forms_[form_id].pronunciation_ids.push_back(pronunciation_id);
+}
+
+void generator::AddFormToWord(size_t form_id, size_t word_id) {
+  words_[word_id].form_ids.push_back(form_id);
+  forms_[form_id].word_ids.push_back(word_id);
+}
+
+void generator::AddWordToSynset(size_t word_id, int wnid) {
+  if (!synset_by_wnid_.count(wnid)) {
+    synset_by_wnid_[wnid] = synsets_.size();
+    synsets_.push_back({word_id});
+    words_[word_id].synsets.push_back(synsets_.size() - 1);
+  } else {
+    size_t synset_id = synset_by_wnid_[wnid];
+    synsets_[synset_id].push_back(word_id);
+    words_[word_id].synsets.push_back(synset_id);
+  }
+}
+
+void generator::AddFormToAnagramSet(size_t form_id,
+                                    const std::string& sorted_letters) {
+  if (!anagram_set_by_sorted_letters_.count(sorted_letters)) {
+    anagram_set_by_sorted_letters_[sorted_letters] = anagram_sets_.size();
+    anagram_sets_.push_back({form_id});
+    forms_[form_id].anagram_set_id = anagram_sets_.size() - 1;
+  } else {
+    size_t anagram_set_id = anagram_set_by_sorted_letters_[sorted_letters];
+    anagram_sets_[anagram_set_id].push_back(form_id);
+    forms_[form_id].anagram_set_id = anagram_set_id;
+  }
+}
+
+void generator::AddPronunciationToAnaphoneSet(
+    size_t pronunciation_id, const std::string& sorted_phonemes) {
+  if (!anaphone_set_by_sorted_phonemes_.count(sorted_phonemes)) {
+    anaphone_set_by_sorted_phonemes_[sorted_phonemes] = anaphone_sets_.size();
+    anaphone_sets_.push_back({pronunciation_id});
+    pronunciations_[pronunciation_id].anaphone_set_id =
+        anaphone_sets_.size() - 1;
+  } else {
+    size_t anaphone_set_id = anaphone_set_by_sorted_phonemes_[sorted_phonemes];
+    anaphone_sets_[anaphone_set_id].push_back(pronunciation_id);
+    pronunciations_[pronunciation_id].anaphone_set_id = anaphone_set_id;
+  }
+}
-- 
cgit 1.4.1