diff options
Diffstat (limited to 'generator')
| -rw-r--r-- | generator/.clang-format | 2 | ||||
| -rw-r--r-- | generator/CMakeLists.txt | 9 | ||||
| -rw-r--r-- | generator/generator.cpp | 611 | ||||
| -rw-r--r-- | generator/generator.h | 114 | ||||
| -rw-r--r-- | generator/main.cpp | 35 | 
5 files changed, 771 insertions, 0 deletions
| diff --git a/generator/.clang-format b/generator/.clang-format new file mode 100644 index 0000000..8de7fe6 --- /dev/null +++ b/generator/.clang-format | |||
| @@ -0,0 +1,2 @@ | |||
| 1 | --- | ||
| 2 | BasedOnStyle: Google | ||
| diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt new file mode 100644 index 0000000..343e5cb --- /dev/null +++ b/generator/CMakeLists.txt | |||
| @@ -0,0 +1,9 @@ | |||
| 1 | cmake_minimum_required (VERSION 3.1) | ||
| 2 | project (generator) | ||
| 3 | |||
| 4 | include_directories(../vendor/hkutil) | ||
| 5 | |||
| 6 | add_executable(generator generator.cpp main.cpp) | ||
| 7 | set_property(TARGET generator PROPERTY CXX_STANDARD 17) | ||
| 8 | set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON) | ||
| 9 | #target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES}) | ||
| diff --git a/generator/generator.cpp b/generator/generator.cpp new file mode 100644 index 0000000..7ab69b5 --- /dev/null +++ b/generator/generator.cpp | |||
| @@ -0,0 +1,611 @@ | |||
| 1 | #include "generator.h" | ||
| 2 | |||
| 3 | #include <hkutil/progress.h> | ||
| 4 | #include <hkutil/string.h> | ||
| 5 | |||
| 6 | #include <algorithm> | ||
| 7 | #include <fstream> | ||
| 8 | #include <list> | ||
| 9 | #include <regex> | ||
| 10 | #include <set> | ||
| 11 | #include <stdexcept> | ||
| 12 | #include <string> | ||
| 13 | #include <unordered_map> | ||
| 14 | #include <vector> | ||
| 15 | |||
| 16 | constexpr int MIN_FREQUENCY = 2000000; | ||
| 17 | |||
| 18 | namespace { | ||
| 19 | |||
| 20 | std::list<std::string> readFile(std::string path, bool uniq = false) { | ||
| 21 | std::ifstream file(path); | ||
| 22 | if (!file) { | ||
| 23 | throw std::invalid_argument("Could not find file " + path); | ||
| 24 | } | ||
| 25 | |||
| 26 | std::list<std::string> lines; | ||
| 27 | std::string line; | ||
| 28 | while (std::getline(file, line)) { | ||
| 29 | if (line.back() == '\r') { | ||
| 30 | line.pop_back(); | ||
| 31 | } | ||
| 32 | |||
| 33 | lines.push_back(line); | ||
| 34 | } | ||
| 35 | |||
| 36 | if (uniq) { | ||
| 37 | std::vector<std::string> uniq(std::begin(lines), std::end(lines)); | ||
| 38 | lines.clear(); | ||
| 39 | |||
| 40 | std::sort(std::begin(uniq), std::end(uniq)); | ||
| 41 | std::unique_copy(std::begin(uniq), std::end(uniq), | ||
| 42 | std::back_inserter(lines)); | ||
| 43 | } | ||
| 44 | |||
| 45 | return lines; | ||
| 46 | } | ||
| 47 | |||
| 48 | } // namespace | ||
| 49 | |||
| 50 | generator::generator(std::string agidPath, std::string wordNetPath, | ||
| 51 | std::string cmudictPath, std::string wordfreqPath, | ||
| 52 | std::string outputPath) | ||
| 53 | : agidPath_(agidPath), | ||
| 54 | wordNetPath_(wordNetPath), | ||
| 55 | cmudictPath_(cmudictPath), | ||
| 56 | wordfreqPath_(wordfreqPath), | ||
| 57 | outputPath_(outputPath) { | ||
| 58 | // Ensure AGID infl.txt exists | ||
| 59 | if (!std::ifstream(agidPath_)) { | ||
| 60 | throw std::invalid_argument("AGID infl.txt file not found"); | ||
| 61 | } | ||
| 62 | |||
| 63 | // Add directory separator to WordNet path | ||
| 64 | if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) { | ||
| 65 | wordNetPath_ += '/'; | ||
| 66 | } | ||
| 67 | |||
| 68 | // Ensure WordNet tables exist | ||
| 69 | for (std::string table : {"s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", | ||
| 70 | "mp", "ms", "per", "sa", "sim", "syntax"}) { | ||
| 71 | if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) { | ||
| 72 | throw std::invalid_argument("WordNet " + table + " table not found"); | ||
| 73 | } | ||
| 74 | } | ||
| 75 | |||
| 76 | // Ensure CMUDICT file exists | ||
| 77 | if (!std::ifstream(cmudictPath_)) { | ||
| 78 | throw std::invalid_argument("CMUDICT file not found"); | ||
| 79 | } | ||
| 80 | } | ||
| 81 | |||
| 82 | void generator::run() { | ||
| 83 | std::unordered_map<std::string, int> word_frequencies; | ||
| 84 | { | ||
| 85 | std::list<std::string> lines(readFile(wordfreqPath_)); | ||
| 86 | |||
| 87 | hatkirby::progress ppgs("Reading word frequencies...", lines.size()); | ||
| 88 | |||
| 89 | for (std::string line : lines) { | ||
| 90 | ppgs.update(); | ||
| 91 | |||
| 92 | std::regex freqline("([a-z]+),([0-9]+)"); | ||
| 93 | std::smatch freqline_data; | ||
| 94 | if (std::regex_search(line, freqline_data, freqline)) { | ||
| 95 | std::string text = freqline_data[1]; | ||
| 96 | std::string freqnumstr = freqline_data[2]; | ||
| 97 | long long freqnumnum = std::atoll(freqnumstr.c_str()); | ||
| 98 | word_frequencies[text] = freqnumnum > std::numeric_limits<int>::max() | ||
| 99 | ? std::numeric_limits<int>::max() | ||
| 100 | : freqnumnum; | ||
| 101 | } | ||
| 102 | } | ||
| 103 | } | ||
| 104 | |||
| 105 | { | ||
| 106 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); | ||
| 107 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); | ||
| 108 | |||
| 109 | std::set<std::pair<int, int>> wnid_and_wnum; | ||
| 110 | for (std::string line : lines) { | ||
| 111 | ppgs.update(); | ||
| 112 | |||
| 113 | std::regex relation( | ||
| 114 | "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); | ||
| 115 | |||
| 116 | std::smatch relation_data; | ||
| 117 | if (!std::regex_search(line, relation_data, relation)) { | ||
| 118 | continue; | ||
| 119 | } | ||
| 120 | |||
| 121 | int synset_id = std::stoi(relation_data[1]); | ||
| 122 | int wnum = std::stoi(relation_data[2]); | ||
| 123 | std::string text = relation_data[3]; | ||
| 124 | int tag_count = std::stoi(relation_data[4]); | ||
| 125 | size_t word_it; | ||
| 126 | while ((word_it = text.find("''")) != std::string::npos) { | ||
| 127 | text.erase(word_it, 1); | ||
| 128 | } | ||
| 129 | |||
| 130 | // The word must be common enough. | ||
| 131 | if (word_frequencies[text] < MIN_FREQUENCY) { | ||
| 132 | continue; | ||
| 133 | } | ||
| 134 | |||
| 135 | // We are looking for single words. | ||
| 136 | if (std::count(std::begin(text), std::end(text), ' ') > 0) { | ||
| 137 | continue; | ||
| 138 | } | ||
| 139 | |||
| 140 | // This should filter our proper nouns. | ||
| 141 | if (std::any_of(std::begin(text), std::end(text), ::isupper)) { | ||
| 142 | continue; | ||
| 143 | } | ||
| 144 | |||
| 145 | // The WordNet data does contain duplicates, so we need to check that we | ||
| 146 | // haven't already created this word. | ||
| 147 | std::pair<int, int> lookup(synset_id, wnum); | ||
| 148 | if (wnid_and_wnum.count(lookup)) { | ||
| 149 | continue; | ||
| 150 | } | ||
| 151 | |||
| 152 | wnid_and_wnum.insert(lookup); | ||
| 153 | |||
| 154 | size_t word_id = LookupOrCreateWord(text); | ||
| 155 | AddWordToSynset(word_id, synset_id); | ||
| 156 | } | ||
| 157 | } | ||
| 158 | |||
| 159 | { | ||
| 160 | std::list<std::string> lines(readFile(agidPath_)); | ||
| 161 | hatkirby::progress ppgs("Reading inflections from AGID...", lines.size()); | ||
| 162 | |||
| 163 | for (std::string line : lines) { | ||
| 164 | ppgs.update(); | ||
| 165 | |||
| 166 | int divider = line.find_first_of(" "); | ||
| 167 | std::string infinitive = line.substr(0, divider); | ||
| 168 | line = line.substr(divider + 1); | ||
| 169 | char type = line[0]; | ||
| 170 | |||
| 171 | if (line[1] == '?') { | ||
| 172 | line.erase(0, 4); | ||
| 173 | } else { | ||
| 174 | line.erase(0, 3); | ||
| 175 | } | ||
| 176 | |||
| 177 | if (!word_by_base_.count(infinitive) && | ||
| 178 | !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { | ||
| 179 | continue; | ||
| 180 | } | ||
| 181 | |||
| 182 | size_t word_id = LookupOrCreateWord(infinitive); | ||
| 183 | |||
| 184 | auto inflWordList = hatkirby::split<std::list<std::string>>(line, " | "); | ||
| 185 | |||
| 186 | std::vector<std::list<std::string>> agidForms; | ||
| 187 | for (std::string inflForms : inflWordList) { | ||
| 188 | auto inflFormList = | ||
| 189 | hatkirby::split<std::list<std::string>>(std::move(inflForms), ", "); | ||
| 190 | |||
| 191 | std::list<std::string> forms; | ||
| 192 | for (std::string inflForm : inflFormList) { | ||
| 193 | int sympos = inflForm.find_first_of("~<!? "); | ||
| 194 | if (sympos != std::string::npos) { | ||
| 195 | inflForm = inflForm.substr(0, sympos); | ||
| 196 | } | ||
| 197 | |||
| 198 | forms.push_back(std::move(inflForm)); | ||
| 199 | } | ||
| 200 | |||
| 201 | agidForms.push_back(std::move(forms)); | ||
| 202 | } | ||
| 203 | |||
| 204 | std::vector<std::list<std::string>> inflections; | ||
| 205 | switch (type) { | ||
| 206 | case 'V': { | ||
| 207 | if (agidForms.size() == 4) { | ||
| 208 | inflections.push_back(agidForms[0]); | ||
| 209 | inflections.push_back(agidForms[1]); | ||
| 210 | inflections.push_back(agidForms[2]); | ||
| 211 | inflections.push_back(agidForms[3]); | ||
| 212 | } else if (agidForms.size() == 3) { | ||
| 213 | inflections.push_back(agidForms[0]); | ||
| 214 | inflections.push_back(agidForms[1]); | ||
| 215 | inflections.push_back(agidForms[2]); | ||
| 216 | } else if (agidForms.size() == 8) { | ||
| 217 | // As of AGID 2014.08.11, this is only "to be" | ||
| 218 | inflections.push_back(agidForms[0]); | ||
| 219 | inflections.push_back(agidForms[2]); | ||
| 220 | inflections.push_back(agidForms[3]); | ||
| 221 | inflections.push_back(agidForms[4]); | ||
| 222 | } else { | ||
| 223 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
| 224 | // - may and shall do not conjugate the way we want them to | ||
| 225 | // - methinks only has a past tense and is an outlier | ||
| 226 | // - wit has five forms, and is archaic/obscure enough that we can | ||
| 227 | // ignore it for now | ||
| 228 | std::cout << " Ignoring verb \"" << infinitive | ||
| 229 | << "\" due to non-standard number of forms." << std::endl; | ||
| 230 | } | ||
| 231 | |||
| 232 | break; | ||
| 233 | } | ||
| 234 | |||
| 235 | case 'A': { | ||
| 236 | if (agidForms.size() == 2) { | ||
| 237 | inflections.push_back(agidForms[0]); | ||
| 238 | inflections.push_back(agidForms[1]); | ||
| 239 | } else { | ||
| 240 | // As of AGID 2014.08.11, this is only "only", which has only the | ||
| 241 | // form "onliest" | ||
| 242 | std::cout << " Ignoring adjective/adverb \"" << infinitive | ||
| 243 | << "\" due to non-standard number of forms." << std::endl; | ||
| 244 | } | ||
| 245 | |||
| 246 | break; | ||
| 247 | } | ||
| 248 | |||
| 249 | case 'N': { | ||
| 250 | if (agidForms.size() == 1) { | ||
| 251 | inflections.push_back(agidForms[0]); | ||
| 252 | } else { | ||
| 253 | // As of AGID 2014.08.11, this is non-existent. | ||
| 254 | std::cout << " Ignoring noun \"" << infinitive | ||
| 255 | << "\" due to non-standard number of forms." << std::endl; | ||
| 256 | } | ||
| 257 | |||
| 258 | break; | ||
| 259 | } | ||
| 260 | } | ||
| 261 | |||
| 262 | // Compile the forms we have mapped. | ||
| 263 | for (const std::list<std::string>& infl_list : inflections) { | ||
| 264 | for (const std::string& infl : infl_list) { | ||
| 265 | size_t form_id = LookupOrCreateForm(infl); | ||
| 266 | AddFormToWord(form_id, word_id); | ||
| 267 | } | ||
| 268 | } | ||
| 269 | } | ||
| 270 | } | ||
| 271 | |||
| 272 | word_frequencies.clear(); // Not needed anymore. | ||
| 273 | |||
| 274 | { | ||
| 275 | std::list<std::string> lines(readFile(cmudictPath_)); | ||
| 276 | |||
| 277 | hatkirby::progress ppgs("Reading pronunciations from CMUDICT...", | ||
| 278 | lines.size()); | ||
| 279 | |||
| 280 | for (std::string line : lines) { | ||
| 281 | ppgs.update(); | ||
| 282 | |||
| 283 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | ||
| 284 | std::smatch phoneme_data; | ||
| 285 | if (std::regex_search(line, phoneme_data, phoneme)) { | ||
| 286 | std::string canonical = hatkirby::lowercase(phoneme_data[1]); | ||
| 287 | |||
| 288 | if (!form_by_text_.count(canonical)) { | ||
| 289 | continue; | ||
| 290 | } | ||
| 291 | |||
| 292 | std::string phonemes = phoneme_data[2]; | ||
| 293 | size_t pronunciation_id = LookupOrCreatePronunciation(phonemes); | ||
| 294 | AddPronunciationToForm(pronunciation_id, form_by_text_[canonical]); | ||
| 295 | } | ||
| 296 | } | ||
| 297 | } | ||
| 298 | |||
| 299 | std::cout << "Words: " << words_.size() << std::endl; | ||
| 300 | std::cout << "Forms: " << forms_.size() << std::endl; | ||
| 301 | std::cout << "Pronunciations: " << pronunciations_.size() << std::endl; | ||
| 302 | |||
| 303 | // White Top | ||
| 304 | { | ||
| 305 | hatkirby::progress ppgs("Generating white top puzzles...", forms_.size()); | ||
| 306 | |||
| 307 | for (Form& form : forms_) { | ||
| 308 | ppgs.update(); | ||
| 309 | |||
| 310 | for (size_t p_id : form.pronunciation_ids) { | ||
| 311 | const Pronunciation& pronunciation = pronunciations_.at(p_id); | ||
| 312 | for (size_t other_form_id : pronunciation.form_ids) { | ||
| 313 | if (other_form_id != form.id) { | ||
| 314 | form.puzzles[kWhiteTop].insert(other_form_id); | ||
| 315 | } | ||
| 316 | } | ||
| 317 | } | ||
| 318 | } | ||
| 319 | } | ||
| 320 | |||
| 321 | // White Bottom | ||
| 322 | { | ||
| 323 | hatkirby::progress ppgs("Generating white bottom puzzles...", | ||
| 324 | words_.size()); | ||
| 325 | |||
| 326 | for (const Word& word : words_) { | ||
| 327 | ppgs.update(); | ||
| 328 | |||
| 329 | Form& form = forms_.at(word.base_form_id); | ||
| 330 | for (size_t synset_id : word.synsets) { | ||
| 331 | for (size_t other_word_id : synsets_.at(synset_id)) { | ||
| 332 | if (other_word_id != word.id) { | ||
| 333 | const Word& other_word = words_.at(other_word_id); | ||
| 334 | form.puzzles[kWhiteBottom].insert(other_word.base_form_id); | ||
| 335 | } | ||
| 336 | } | ||
| 337 | } | ||
| 338 | } | ||
| 339 | } | ||
| 340 | |||
| 341 | // Yellow Top | ||
| 342 | { | ||
| 343 | hatkirby::progress ppgs("Generating yellow top puzzles...", | ||
| 344 | anaphone_sets_.size()); | ||
| 345 | |||
| 346 | for (const std::vector<size_t>& anaphone_set : anaphone_sets_) { | ||
| 347 | ppgs.update(); | ||
| 348 | |||
| 349 | std::set<size_t> all_forms; | ||
| 350 | for (size_t p_id : anaphone_set) { | ||
| 351 | const Pronunciation& pronunciation = pronunciations_.at(p_id); | ||
| 352 | for (size_t form_id : pronunciation.form_ids) { | ||
| 353 | all_forms.insert(form_id); | ||
| 354 | } | ||
| 355 | } | ||
| 356 | |||
| 357 | for (size_t f_id1 : all_forms) { | ||
| 358 | for (size_t f_id2 : all_forms) { | ||
| 359 | if (f_id1 != f_id2) { | ||
| 360 | Form& form = forms_.at(f_id1); | ||
| 361 | form.puzzles[kYellowTop].insert(f_id2); | ||
| 362 | } | ||
| 363 | } | ||
| 364 | } | ||
| 365 | } | ||
| 366 | } | ||
| 367 | |||
| 368 | // Yellow Middle | ||
| 369 | { | ||
| 370 | hatkirby::progress ppgs("Generating yellow middle puzzles...", | ||
| 371 | anagram_sets_.size()); | ||
| 372 | |||
| 373 | for (const std::vector<size_t>& anagram_set : anagram_sets_) { | ||
| 374 | ppgs.update(); | ||
| 375 | |||
| 376 | for (size_t f_id1 : anagram_set) { | ||
| 377 | for (size_t f_id2 : anagram_set) { | ||
| 378 | if (f_id1 != f_id2) { | ||
| 379 | Form& form = forms_.at(f_id1); | ||
| 380 | form.puzzles[kYellowMiddle].insert(f_id2); | ||
| 381 | } | ||
| 382 | } | ||
| 383 | } | ||
| 384 | } | ||
| 385 | } | ||
| 386 | |||
| 387 | // Black Top | ||
| 388 | { | ||
| 389 | hatkirby::progress ppgs("Generating black top puzzles...", | ||
| 390 | pronunciations_.size()); | ||
| 391 | |||
| 392 | for (const Pronunciation& pronunciation : pronunciations_) { | ||
| 393 | ppgs.update(); | ||
| 394 | |||
| 395 | auto reversed_list = hatkirby::split<std::vector<std::string>>( | ||
| 396 | pronunciation.stressless_phonemes, " "); | ||
| 397 | std::reverse(reversed_list.begin(), reversed_list.end()); | ||
| 398 | std::string reversed_phonemes = | ||
| 399 | hatkirby::implode(reversed_list.begin(), reversed_list.end(), " "); | ||
| 400 | if (pronunciations_by_blank_phonemes_.count(reversed_phonemes)) { | ||
| 401 | std::set<size_t> all_forms; | ||
| 402 | |||
| 403 | for (size_t p_id : | ||
| 404 | pronunciations_by_blank_phonemes_.at(reversed_phonemes)) { | ||
| 405 | const Pronunciation& other_pronunciation = pronunciations_.at(p_id); | ||
| 406 | for (size_t form_id : other_pronunciation.form_ids) { | ||
| 407 | all_forms.insert(form_id); | ||
| 408 | } | ||
| 409 | } | ||
| 410 | |||
| 411 | for (size_t f_id1 : pronunciation.form_ids) { | ||
| 412 | for (size_t f_id2 : all_forms) { | ||
| 413 | Form& form = forms_.at(f_id1); | ||
| 414 | form.puzzles[kBlackTop].insert(f_id2); | ||
| 415 | } | ||
| 416 | } | ||
| 417 | } | ||
| 418 | } | ||
| 419 | } | ||
| 420 | |||
| 421 | // Black Middle | ||
| 422 | { | ||
| 423 | hatkirby::progress ppgs("Generating black middle puzzles...", | ||
| 424 | forms_.size()); | ||
| 425 | |||
| 426 | for (Form& form : forms_) { | ||
| 427 | ppgs.update(); | ||
| 428 | |||
| 429 | std::string reversed_text = form.text; | ||
| 430 | std::reverse(reversed_text.begin(), reversed_text.end()); | ||
| 431 | |||
| 432 | if (form_by_text_.count(reversed_text)) { | ||
| 433 | form.puzzles[kBlackMiddle].insert(form_by_text_.at(reversed_text)); | ||
| 434 | } | ||
| 435 | } | ||
| 436 | } | ||
| 437 | |||
| 438 | // Count up all of the generated puzzles. | ||
| 439 | int total_puzzles = 0; | ||
| 440 | int reusable_words = 0; | ||
| 441 | std::unordered_map<PuzzleType, int> per_puzzle_type; | ||
| 442 | for (const Form& form : forms_) { | ||
| 443 | for (const auto& [puzzle_type, puzzles] : form.puzzles) { | ||
| 444 | total_puzzles += puzzles.size(); | ||
| 445 | per_puzzle_type[puzzle_type]++; | ||
| 446 | } | ||
| 447 | if (form.puzzles.size() > 1) { | ||
| 448 | reusable_words++; | ||
| 449 | } | ||
| 450 | } | ||
| 451 | std::cout << "Puzzles: " << total_puzzles << std::endl; | ||
| 452 | std::cout << "Reusable words: " << reusable_words << std::endl; | ||
| 453 | std::cout << "White tops: " << per_puzzle_type[kWhiteTop] << std::endl; | ||
| 454 | std::cout << "White bottom: " << per_puzzle_type[kWhiteBottom] << std::endl; | ||
| 455 | std::cout << "Yellow tops: " << per_puzzle_type[kYellowTop] << std::endl; | ||
| 456 | std::cout << "Yellow middles: " << per_puzzle_type[kYellowMiddle] | ||
| 457 | << std::endl; | ||
| 458 | std::cout << "Black tops: " << per_puzzle_type[kBlackTop] << std::endl; | ||
| 459 | std::cout << "Black middles: " << per_puzzle_type[kBlackMiddle] << std::endl; | ||
| 460 | } | ||
| 461 | |||
| 462 | size_t generator::LookupOrCreatePronunciation(const std::string& phonemes) { | ||
| 463 | if (pronunciation_by_phonemes_.count(phonemes)) { | ||
| 464 | return pronunciation_by_phonemes_[phonemes]; | ||
| 465 | } else { | ||
| 466 | size_t pronunciation_id = pronunciations_.size(); | ||
| 467 | |||
| 468 | auto phonemeList = hatkirby::split<std::list<std::string>>(phonemes, " "); | ||
| 469 | |||
| 470 | std::list<std::string>::iterator rhymeStart = | ||
| 471 | std::find_if(std::begin(phonemeList), std::end(phonemeList), | ||
| 472 | [](std::string phoneme) { | ||
| 473 | return phoneme.find("1") != std::string::npos; | ||
| 474 | }); | ||
| 475 | |||
| 476 | // Rhyme detection | ||
| 477 | std::string prerhyme = ""; | ||
| 478 | std::string rhyme = ""; | ||
| 479 | if (rhymeStart != std::end(phonemeList)) { | ||
| 480 | std::list<std::string> rhymePhonemes; | ||
| 481 | |||
| 482 | std::transform( | ||
| 483 | rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), | ||
| 484 | [](std::string phoneme) { | ||
| 485 | std::string naked; | ||
| 486 | |||
| 487 | std::remove_copy_if(std::begin(phoneme), std::end(phoneme), | ||
| 488 | std::back_inserter(naked), | ||
| 489 | [](char ch) { return std::isdigit(ch); }); | ||
| 490 | |||
| 491 | return naked; | ||
| 492 | }); | ||
| 493 | |||
| 494 | rhyme = hatkirby::implode(std::begin(rhymePhonemes), | ||
| 495 | std::end(rhymePhonemes), " "); | ||
| 496 | |||
| 497 | if (rhymeStart != std::begin(phonemeList)) { | ||
| 498 | prerhyme = *std::prev(rhymeStart); | ||
| 499 | } | ||
| 500 | |||
| 501 | pronunciations_by_rhyme_[rhyme].push_back(pronunciation_id); | ||
| 502 | } | ||
| 503 | |||
| 504 | std::string stressless; | ||
| 505 | for (int i = 0; i < phonemes.size(); i++) { | ||
| 506 | if (!std::isdigit(phonemes[i])) { | ||
| 507 | stressless.push_back(phonemes[i]); | ||
| 508 | } | ||
| 509 | } | ||
| 510 | auto stresslessList = | ||
| 511 | hatkirby::split<std::vector<std::string>>(stressless, " "); | ||
| 512 | std::string stresslessPhonemes = | ||
| 513 | hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
| 514 | std::sort(stresslessList.begin(), stresslessList.end()); | ||
| 515 | std::string sortedPhonemes = | ||
| 516 | hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
| 517 | |||
| 518 | pronunciations_.push_back({.id = pronunciation_id, | ||
| 519 | .phonemes = phonemes, | ||
| 520 | .prerhyme = prerhyme, | ||
| 521 | .rhyme = rhyme, | ||
| 522 | .stressless_phonemes = stresslessPhonemes}); | ||
| 523 | |||
| 524 | AddPronunciationToAnaphoneSet(pronunciation_id, sortedPhonemes); | ||
| 525 | |||
| 526 | pronunciation_by_phonemes_[phonemes] = pronunciation_id; | ||
| 527 | pronunciations_by_blank_phonemes_[stresslessPhonemes].push_back( | ||
| 528 | pronunciation_id); | ||
| 529 | |||
| 530 | return pronunciation_id; | ||
| 531 | } | ||
| 532 | } | ||
| 533 | |||
| 534 | size_t generator::LookupOrCreateForm(const std::string& word) { | ||
| 535 | if (form_by_text_.count(word)) { | ||
| 536 | return form_by_text_[word]; | ||
| 537 | } else { | ||
| 538 | size_t form_id = forms_.size(); | ||
| 539 | form_by_text_[word] = form_id; | ||
| 540 | forms_.push_back({.id = form_id, .text = word}); | ||
| 541 | |||
| 542 | std::string sortedText = word; | ||
| 543 | std::sort(sortedText.begin(), sortedText.end()); | ||
| 544 | AddFormToAnagramSet(form_id, sortedText); | ||
| 545 | |||
| 546 | return form_id; | ||
| 547 | } | ||
| 548 | } | ||
| 549 | |||
| 550 | size_t generator::LookupOrCreateWord(const std::string& word) { | ||
| 551 | if (word_by_base_.count(word)) { | ||
| 552 | return word_by_base_[word]; | ||
| 553 | } else { | ||
| 554 | size_t word_id = words_.size(); | ||
| 555 | word_by_base_[word] = words_.size(); | ||
| 556 | size_t form_id = LookupOrCreateForm(word); | ||
| 557 | words_.push_back({.id = word_id, .base_form_id = form_id}); | ||
| 558 | AddFormToWord(form_id, word_id); | ||
| 559 | return word_id; | ||
| 560 | } | ||
| 561 | } | ||
| 562 | |||
| 563 | void generator::AddPronunciationToForm(size_t pronunciation_id, | ||
| 564 | size_t form_id) { | ||
| 565 | pronunciations_[pronunciation_id].form_ids.push_back(form_id); | ||
| 566 | forms_[form_id].pronunciation_ids.push_back(pronunciation_id); | ||
| 567 | } | ||
| 568 | |||
| 569 | void generator::AddFormToWord(size_t form_id, size_t word_id) { | ||
| 570 | words_[word_id].form_ids.push_back(form_id); | ||
| 571 | forms_[form_id].word_ids.push_back(word_id); | ||
| 572 | } | ||
| 573 | |||
| 574 | void generator::AddWordToSynset(size_t word_id, int wnid) { | ||
| 575 | if (!synset_by_wnid_.count(wnid)) { | ||
| 576 | synset_by_wnid_[wnid] = synsets_.size(); | ||
| 577 | synsets_.push_back({word_id}); | ||
| 578 | words_[word_id].synsets.push_back(synsets_.size() - 1); | ||
| 579 | } else { | ||
| 580 | size_t synset_id = synset_by_wnid_[wnid]; | ||
| 581 | synsets_[synset_id].push_back(word_id); | ||
| 582 | words_[word_id].synsets.push_back(synset_id); | ||
| 583 | } | ||
| 584 | } | ||
| 585 | |||
| 586 | void generator::AddFormToAnagramSet(size_t form_id, | ||
| 587 | const std::string& sorted_letters) { | ||
| 588 | if (!anagram_set_by_sorted_letters_.count(sorted_letters)) { | ||
| 589 | anagram_set_by_sorted_letters_[sorted_letters] = anagram_sets_.size(); | ||
| 590 | anagram_sets_.push_back({form_id}); | ||
| 591 | forms_[form_id].anagram_set_id = anagram_sets_.size() - 1; | ||
| 592 | } else { | ||
| 593 | size_t anagram_set_id = anagram_set_by_sorted_letters_[sorted_letters]; | ||
| 594 | anagram_sets_[anagram_set_id].push_back(form_id); | ||
| 595 | forms_[form_id].anagram_set_id = anagram_set_id; | ||
| 596 | } | ||
| 597 | } | ||
| 598 | |||
| 599 | void generator::AddPronunciationToAnaphoneSet( | ||
| 600 | size_t pronunciation_id, const std::string& sorted_phonemes) { | ||
| 601 | if (!anaphone_set_by_sorted_phonemes_.count(sorted_phonemes)) { | ||
| 602 | anaphone_set_by_sorted_phonemes_[sorted_phonemes] = anaphone_sets_.size(); | ||
| 603 | anaphone_sets_.push_back({pronunciation_id}); | ||
| 604 | pronunciations_[pronunciation_id].anaphone_set_id = | ||
| 605 | anaphone_sets_.size() - 1; | ||
| 606 | } else { | ||
| 607 | size_t anaphone_set_id = anaphone_set_by_sorted_phonemes_[sorted_phonemes]; | ||
| 608 | anaphone_sets_[anaphone_set_id].push_back(pronunciation_id); | ||
| 609 | pronunciations_[pronunciation_id].anaphone_set_id = anaphone_set_id; | ||
| 610 | } | ||
| 611 | } | ||
| diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..a97b0b0 --- /dev/null +++ b/generator/generator.h | |||
| @@ -0,0 +1,114 @@ | |||
| 1 | #ifndef GENERATOR_H_D5C6A724 | ||
| 2 | #define GENERATOR_H_D5C6A724 | ||
| 3 | |||
| 4 | #include <optional> | ||
| 5 | #include <set> | ||
| 6 | #include <string> | ||
| 7 | #include <unordered_map> | ||
| 8 | #include <vector> | ||
| 9 | |||
| 10 | enum PuzzleType { | ||
| 11 | kWhiteTop = 0, | ||
| 12 | kWhiteBottom = 1, | ||
| 13 | kYellowTop = 2, | ||
| 14 | kYellowMiddle = 3, | ||
| 15 | kBlackTop = 4, | ||
| 16 | kBlackMiddle = 5, | ||
| 17 | }; | ||
| 18 | |||
| 19 | class generator { | ||
| 20 | public: | ||
| 21 | // Constructor | ||
| 22 | |||
| 23 | generator(std::string agidPath, std::string wordNetPath, | ||
| 24 | std::string cmudictPath, std::string wordfreqPath, | ||
| 25 | std::string outputPath); | ||
| 26 | |||
| 27 | // Action | ||
| 28 | |||
| 29 | void run(); | ||
| 30 | |||
| 31 | private: | ||
| 32 | // Helpers | ||
| 33 | |||
| 34 | size_t LookupOrCreatePronunciation(const std::string& phonemes); | ||
| 35 | |||
| 36 | size_t LookupOrCreateForm(const std::string& form); | ||
| 37 | |||
| 38 | size_t LookupOrCreateWord(const std::string& word); | ||
| 39 | |||
| 40 | void AddPronunciationToForm(size_t pronunciation_id, size_t form_id); | ||
| 41 | |||
| 42 | void AddFormToWord(size_t form_id, size_t word_id); | ||
| 43 | |||
| 44 | void AddWordToSynset(size_t word_id, int wnid); | ||
| 45 | |||
| 46 | void AddFormToAnagramSet(size_t form_id, const std::string& sorted_letters); | ||
| 47 | |||
| 48 | void AddPronunciationToAnaphoneSet(size_t pronunciation_id, | ||
| 49 | const std::string& sorted_phonemes); | ||
| 50 | |||
| 51 | // Input | ||
| 52 | |||
| 53 | std::string agidPath_; | ||
| 54 | std::string wordNetPath_; | ||
| 55 | std::string cmudictPath_; | ||
| 56 | std::string wordfreqPath_; | ||
| 57 | |||
| 58 | // Output | ||
| 59 | |||
| 60 | std::string outputPath_; | ||
| 61 | |||
| 62 | // Indexes | ||
| 63 | |||
| 64 | struct Pronunciation { | ||
| 65 | size_t id; | ||
| 66 | std::string phonemes; | ||
| 67 | std::string prerhyme; | ||
| 68 | std::string rhyme; | ||
| 69 | std::vector<size_t> form_ids; | ||
| 70 | std::optional<size_t> anaphone_set_id; | ||
| 71 | std::string stressless_phonemes; | ||
| 72 | }; | ||
| 73 | |||
| 74 | struct Form { | ||
| 75 | size_t id; | ||
| 76 | std::string text; | ||
| 77 | std::vector<size_t> word_ids; | ||
| 78 | std::vector<size_t> pronunciation_ids; | ||
| 79 | std::optional<size_t> anagram_set_id; | ||
| 80 | std::optional<size_t> reverse_form_id; | ||
| 81 | |||
| 82 | std::unordered_map<PuzzleType, std::set<size_t>> puzzles; | ||
| 83 | }; | ||
| 84 | |||
| 85 | struct Word { | ||
| 86 | size_t id; | ||
| 87 | size_t base_form_id; | ||
| 88 | std::vector<size_t> form_ids; | ||
| 89 | std::vector<size_t> synsets; | ||
| 90 | }; | ||
| 91 | |||
| 92 | std::vector<Pronunciation> pronunciations_; | ||
| 93 | std::unordered_map<std::string, size_t> pronunciation_by_phonemes_; | ||
| 94 | std::unordered_map<std::string, std::vector<size_t>> pronunciations_by_rhyme_; | ||
| 95 | std::unordered_map<std::string, std::vector<size_t>> | ||
| 96 | pronunciations_by_blank_phonemes_; | ||
| 97 | |||
| 98 | std::vector<std::vector<size_t>> anaphone_sets_; | ||
| 99 | std::unordered_map<std::string, size_t> anaphone_set_by_sorted_phonemes_; | ||
| 100 | |||
| 101 | std::vector<Form> forms_; | ||
| 102 | std::unordered_map<std::string, size_t> form_by_text_; | ||
| 103 | |||
| 104 | std::vector<std::vector<size_t>> anagram_sets_; | ||
| 105 | std::unordered_map<std::string, size_t> anagram_set_by_sorted_letters_; | ||
| 106 | |||
| 107 | std::vector<Word> words_; | ||
| 108 | std::unordered_map<std::string, size_t> word_by_base_; | ||
| 109 | |||
| 110 | std::vector<std::vector<size_t>> synsets_; | ||
| 111 | std::unordered_map<int, size_t> synset_by_wnid_; | ||
| 112 | }; | ||
| 113 | |||
| 114 | #endif /* end of include guard: GENERATOR_H_D5C6A724 */ \ No newline at end of file | ||
| diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..c958421 --- /dev/null +++ b/generator/main.cpp | |||
| @@ -0,0 +1,35 @@ | |||
| 1 | #include <exception> | ||
| 2 | #include <iostream> | ||
| 3 | |||
| 4 | #include "generator.h" | ||
| 5 | |||
| 6 | void printUsage() { | ||
| 7 | std::cout << "usage: generator agid wordnet cmudict wordfreq output" | ||
| 8 | << std::endl; | ||
| 9 | std::cout << "agid :: path to an AGID infl.txt file" << std::endl; | ||
| 10 | std::cout << "wordnet :: path to a WordNet prolog data directory" | ||
| 11 | << std::endl; | ||
| 12 | std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; | ||
| 13 | std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; | ||
| 14 | std::cout << "output :: datafile output path" << std::endl; | ||
| 15 | } | ||
| 16 | |||
| 17 | int main(int argc, char** argv) { | ||
| 18 | if (argc == 6) { | ||
| 19 | try { | ||
| 20 | generator app(argv[1], argv[2], argv[3], argv[4], argv[5]); | ||
| 21 | |||
| 22 | try { | ||
| 23 | app.run(); | ||
| 24 | } catch (const std::exception& e) { | ||
| 25 | std::cout << e.what() << std::endl; | ||
| 26 | } | ||
| 27 | } catch (const std::exception& e) { | ||
| 28 | std::cout << e.what() << std::endl; | ||
| 29 | printUsage(); | ||
| 30 | } | ||
| 31 | } else { | ||
| 32 | std::cout << "lingo randomizer generator" << std::endl; | ||
| 33 | printUsage(); | ||
| 34 | } | ||
| 35 | } | ||
