summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r--generator/generator.cpp611
1 files changed, 611 insertions, 0 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp new file mode 100644 index 0000000..7ab69b5 --- /dev/null +++ b/generator/generator.cpp
@@ -0,0 +1,611 @@
#include "generator.h"

#include <hkutil/progress.h>
#include <hkutil/string.h>

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <list>
#include <regex>
#include <set>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
// A word must occur at least this many times in the frequency corpus to be
// considered common enough for inclusion in the generated puzzle data.
constexpr int MIN_FREQUENCY = 2000000;
17
18namespace {
19
// Reads a text file into a list of lines. Trailing carriage returns are
// stripped so CRLF files behave like LF files. If uniq is true, the returned
// lines are sorted and deduplicated.
//
// Throws std::invalid_argument if the file cannot be opened.
std::list<std::string> readFile(std::string path, bool uniq = false) {
  std::ifstream file(path);
  if (!file) {
    throw std::invalid_argument("Could not find file " + path);
  }

  std::list<std::string> lines;
  std::string line;
  while (std::getline(file, line)) {
    // Guard against empty lines: calling back() on an empty string is
    // undefined behavior.
    if (!line.empty() && line.back() == '\r') {
      line.pop_back();
    }

    lines.push_back(line);
  }

  if (uniq) {
    // Sort in a vector (lists sort slowly), then copy back the unique runs.
    std::vector<std::string> sorted(std::begin(lines), std::end(lines));
    lines.clear();

    std::sort(std::begin(sorted), std::end(sorted));
    std::unique_copy(std::begin(sorted), std::end(sorted),
                     std::back_inserter(lines));
  }

  return lines;
}
47
48} // namespace
49
50generator::generator(std::string agidPath, std::string wordNetPath,
51 std::string cmudictPath, std::string wordfreqPath,
52 std::string outputPath)
53 : agidPath_(agidPath),
54 wordNetPath_(wordNetPath),
55 cmudictPath_(cmudictPath),
56 wordfreqPath_(wordfreqPath),
57 outputPath_(outputPath) {
58 // Ensure AGID infl.txt exists
59 if (!std::ifstream(agidPath_)) {
60 throw std::invalid_argument("AGID infl.txt file not found");
61 }
62
63 // Add directory separator to WordNet path
64 if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) {
65 wordNetPath_ += '/';
66 }
67
68 // Ensure WordNet tables exist
69 for (std::string table : {"s", "sk", "ant", "at", "cls", "hyp", "ins", "mm",
70 "mp", "ms", "per", "sa", "sim", "syntax"}) {
71 if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) {
72 throw std::invalid_argument("WordNet " + table + " table not found");
73 }
74 }
75
76 // Ensure CMUDICT file exists
77 if (!std::ifstream(cmudictPath_)) {
78 throw std::invalid_argument("CMUDICT file not found");
79 }
80}
81
82void generator::run() {
83 std::unordered_map<std::string, int> word_frequencies;
84 {
85 std::list<std::string> lines(readFile(wordfreqPath_));
86
87 hatkirby::progress ppgs("Reading word frequencies...", lines.size());
88
89 for (std::string line : lines) {
90 ppgs.update();
91
92 std::regex freqline("([a-z]+),([0-9]+)");
93 std::smatch freqline_data;
94 if (std::regex_search(line, freqline_data, freqline)) {
95 std::string text = freqline_data[1];
96 std::string freqnumstr = freqline_data[2];
97 long long freqnumnum = std::atoll(freqnumstr.c_str());
98 word_frequencies[text] = freqnumnum > std::numeric_limits<int>::max()
99 ? std::numeric_limits<int>::max()
100 : freqnumnum;
101 }
102 }
103 }
104
105 {
106 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
107 hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size());
108
109 std::set<std::pair<int, int>> wnid_and_wnum;
110 for (std::string line : lines) {
111 ppgs.update();
112
113 std::regex relation(
114 "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");
115
116 std::smatch relation_data;
117 if (!std::regex_search(line, relation_data, relation)) {
118 continue;
119 }
120
121 int synset_id = std::stoi(relation_data[1]);
122 int wnum = std::stoi(relation_data[2]);
123 std::string text = relation_data[3];
124 int tag_count = std::stoi(relation_data[4]);
125 size_t word_it;
126 while ((word_it = text.find("''")) != std::string::npos) {
127 text.erase(word_it, 1);
128 }
129
130 // The word must be common enough.
131 if (word_frequencies[text] < MIN_FREQUENCY) {
132 continue;
133 }
134
135 // We are looking for single words.
136 if (std::count(std::begin(text), std::end(text), ' ') > 0) {
137 continue;
138 }
139
140 // This should filter our proper nouns.
141 if (std::any_of(std::begin(text), std::end(text), ::isupper)) {
142 continue;
143 }
144
145 // The WordNet data does contain duplicates, so we need to check that we
146 // haven't already created this word.
147 std::pair<int, int> lookup(synset_id, wnum);
148 if (wnid_and_wnum.count(lookup)) {
149 continue;
150 }
151
152 wnid_and_wnum.insert(lookup);
153
154 size_t word_id = LookupOrCreateWord(text);
155 AddWordToSynset(word_id, synset_id);
156 }
157 }
158
159 {
160 std::list<std::string> lines(readFile(agidPath_));
161 hatkirby::progress ppgs("Reading inflections from AGID...", lines.size());
162
163 for (std::string line : lines) {
164 ppgs.update();
165
166 int divider = line.find_first_of(" ");
167 std::string infinitive = line.substr(0, divider);
168 line = line.substr(divider + 1);
169 char type = line[0];
170
171 if (line[1] == '?') {
172 line.erase(0, 4);
173 } else {
174 line.erase(0, 3);
175 }
176
177 if (!word_by_base_.count(infinitive) &&
178 !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) {
179 continue;
180 }
181
182 size_t word_id = LookupOrCreateWord(infinitive);
183
184 auto inflWordList = hatkirby::split<std::list<std::string>>(line, " | ");
185
186 std::vector<std::list<std::string>> agidForms;
187 for (std::string inflForms : inflWordList) {
188 auto inflFormList =
189 hatkirby::split<std::list<std::string>>(std::move(inflForms), ", ");
190
191 std::list<std::string> forms;
192 for (std::string inflForm : inflFormList) {
193 int sympos = inflForm.find_first_of("~<!? ");
194 if (sympos != std::string::npos) {
195 inflForm = inflForm.substr(0, sympos);
196 }
197
198 forms.push_back(std::move(inflForm));
199 }
200
201 agidForms.push_back(std::move(forms));
202 }
203
204 std::vector<std::list<std::string>> inflections;
205 switch (type) {
206 case 'V': {
207 if (agidForms.size() == 4) {
208 inflections.push_back(agidForms[0]);
209 inflections.push_back(agidForms[1]);
210 inflections.push_back(agidForms[2]);
211 inflections.push_back(agidForms[3]);
212 } else if (agidForms.size() == 3) {
213 inflections.push_back(agidForms[0]);
214 inflections.push_back(agidForms[1]);
215 inflections.push_back(agidForms[2]);
216 } else if (agidForms.size() == 8) {
217 // As of AGID 2014.08.11, this is only "to be"
218 inflections.push_back(agidForms[0]);
219 inflections.push_back(agidForms[2]);
220 inflections.push_back(agidForms[3]);
221 inflections.push_back(agidForms[4]);
222 } else {
223 // Words that don't fit the cases above as of AGID 2014.08.11:
224 // - may and shall do not conjugate the way we want them to
225 // - methinks only has a past tense and is an outlier
226 // - wit has five forms, and is archaic/obscure enough that we can
227 // ignore it for now
228 std::cout << " Ignoring verb \"" << infinitive
229 << "\" due to non-standard number of forms." << std::endl;
230 }
231
232 break;
233 }
234
235 case 'A': {
236 if (agidForms.size() == 2) {
237 inflections.push_back(agidForms[0]);
238 inflections.push_back(agidForms[1]);
239 } else {
240 // As of AGID 2014.08.11, this is only "only", which has only the
241 // form "onliest"
242 std::cout << " Ignoring adjective/adverb \"" << infinitive
243 << "\" due to non-standard number of forms." << std::endl;
244 }
245
246 break;
247 }
248
249 case 'N': {
250 if (agidForms.size() == 1) {
251 inflections.push_back(agidForms[0]);
252 } else {
253 // As of AGID 2014.08.11, this is non-existent.
254 std::cout << " Ignoring noun \"" << infinitive
255 << "\" due to non-standard number of forms." << std::endl;
256 }
257
258 break;
259 }
260 }
261
262 // Compile the forms we have mapped.
263 for (const std::list<std::string>& infl_list : inflections) {
264 for (const std::string& infl : infl_list) {
265 size_t form_id = LookupOrCreateForm(infl);
266 AddFormToWord(form_id, word_id);
267 }
268 }
269 }
270 }
271
272 word_frequencies.clear(); // Not needed anymore.
273
274 {
275 std::list<std::string> lines(readFile(cmudictPath_));
276
277 hatkirby::progress ppgs("Reading pronunciations from CMUDICT...",
278 lines.size());
279
280 for (std::string line : lines) {
281 ppgs.update();
282
283 std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)");
284 std::smatch phoneme_data;
285 if (std::regex_search(line, phoneme_data, phoneme)) {
286 std::string canonical = hatkirby::lowercase(phoneme_data[1]);
287
288 if (!form_by_text_.count(canonical)) {
289 continue;
290 }
291
292 std::string phonemes = phoneme_data[2];
293 size_t pronunciation_id = LookupOrCreatePronunciation(phonemes);
294 AddPronunciationToForm(pronunciation_id, form_by_text_[canonical]);
295 }
296 }
297 }
298
299 std::cout << "Words: " << words_.size() << std::endl;
300 std::cout << "Forms: " << forms_.size() << std::endl;
301 std::cout << "Pronunciations: " << pronunciations_.size() << std::endl;
302
303 // White Top
304 {
305 hatkirby::progress ppgs("Generating white top puzzles...", forms_.size());
306
307 for (Form& form : forms_) {
308 ppgs.update();
309
310 for (size_t p_id : form.pronunciation_ids) {
311 const Pronunciation& pronunciation = pronunciations_.at(p_id);
312 for (size_t other_form_id : pronunciation.form_ids) {
313 if (other_form_id != form.id) {
314 form.puzzles[kWhiteTop].insert(other_form_id);
315 }
316 }
317 }
318 }
319 }
320
321 // White Bottom
322 {
323 hatkirby::progress ppgs("Generating white bottom puzzles...",
324 words_.size());
325
326 for (const Word& word : words_) {
327 ppgs.update();
328
329 Form& form = forms_.at(word.base_form_id);
330 for (size_t synset_id : word.synsets) {
331 for (size_t other_word_id : synsets_.at(synset_id)) {
332 if (other_word_id != word.id) {
333 const Word& other_word = words_.at(other_word_id);
334 form.puzzles[kWhiteBottom].insert(other_word.base_form_id);
335 }
336 }
337 }
338 }
339 }
340
341 // Yellow Top
342 {
343 hatkirby::progress ppgs("Generating yellow top puzzles...",
344 anaphone_sets_.size());
345
346 for (const std::vector<size_t>& anaphone_set : anaphone_sets_) {
347 ppgs.update();
348
349 std::set<size_t> all_forms;
350 for (size_t p_id : anaphone_set) {
351 const Pronunciation& pronunciation = pronunciations_.at(p_id);
352 for (size_t form_id : pronunciation.form_ids) {
353 all_forms.insert(form_id);
354 }
355 }
356
357 for (size_t f_id1 : all_forms) {
358 for (size_t f_id2 : all_forms) {
359 if (f_id1 != f_id2) {
360 Form& form = forms_.at(f_id1);
361 form.puzzles[kYellowTop].insert(f_id2);
362 }
363 }
364 }
365 }
366 }
367
368 // Yellow Middle
369 {
370 hatkirby::progress ppgs("Generating yellow middle puzzles...",
371 anagram_sets_.size());
372
373 for (const std::vector<size_t>& anagram_set : anagram_sets_) {
374 ppgs.update();
375
376 for (size_t f_id1 : anagram_set) {
377 for (size_t f_id2 : anagram_set) {
378 if (f_id1 != f_id2) {
379 Form& form = forms_.at(f_id1);
380 form.puzzles[kYellowMiddle].insert(f_id2);
381 }
382 }
383 }
384 }
385 }
386
387 // Black Top
388 {
389 hatkirby::progress ppgs("Generating black top puzzles...",
390 pronunciations_.size());
391
392 for (const Pronunciation& pronunciation : pronunciations_) {
393 ppgs.update();
394
395 auto reversed_list = hatkirby::split<std::vector<std::string>>(
396 pronunciation.stressless_phonemes, " ");
397 std::reverse(reversed_list.begin(), reversed_list.end());
398 std::string reversed_phonemes =
399 hatkirby::implode(reversed_list.begin(), reversed_list.end(), " ");
400 if (pronunciations_by_blank_phonemes_.count(reversed_phonemes)) {
401 std::set<size_t> all_forms;
402
403 for (size_t p_id :
404 pronunciations_by_blank_phonemes_.at(reversed_phonemes)) {
405 const Pronunciation& other_pronunciation = pronunciations_.at(p_id);
406 for (size_t form_id : other_pronunciation.form_ids) {
407 all_forms.insert(form_id);
408 }
409 }
410
411 for (size_t f_id1 : pronunciation.form_ids) {
412 for (size_t f_id2 : all_forms) {
413 Form& form = forms_.at(f_id1);
414 form.puzzles[kBlackTop].insert(f_id2);
415 }
416 }
417 }
418 }
419 }
420
421 // Black Middle
422 {
423 hatkirby::progress ppgs("Generating black middle puzzles...",
424 forms_.size());
425
426 for (Form& form : forms_) {
427 ppgs.update();
428
429 std::string reversed_text = form.text;
430 std::reverse(reversed_text.begin(), reversed_text.end());
431
432 if (form_by_text_.count(reversed_text)) {
433 form.puzzles[kBlackMiddle].insert(form_by_text_.at(reversed_text));
434 }
435 }
436 }
437
438 // Count up all of the generated puzzles.
439 int total_puzzles = 0;
440 int reusable_words = 0;
441 std::unordered_map<PuzzleType, int> per_puzzle_type;
442 for (const Form& form : forms_) {
443 for (const auto& [puzzle_type, puzzles] : form.puzzles) {
444 total_puzzles += puzzles.size();
445 per_puzzle_type[puzzle_type]++;
446 }
447 if (form.puzzles.size() > 1) {
448 reusable_words++;
449 }
450 }
451 std::cout << "Puzzles: " << total_puzzles << std::endl;
452 std::cout << "Reusable words: " << reusable_words << std::endl;
453 std::cout << "White tops: " << per_puzzle_type[kWhiteTop] << std::endl;
454 std::cout << "White bottom: " << per_puzzle_type[kWhiteBottom] << std::endl;
455 std::cout << "Yellow tops: " << per_puzzle_type[kYellowTop] << std::endl;
456 std::cout << "Yellow middles: " << per_puzzle_type[kYellowMiddle]
457 << std::endl;
458 std::cout << "Black tops: " << per_puzzle_type[kBlackTop] << std::endl;
459 std::cout << "Black middles: " << per_puzzle_type[kBlackMiddle] << std::endl;
460}
461
462size_t generator::LookupOrCreatePronunciation(const std::string& phonemes) {
463 if (pronunciation_by_phonemes_.count(phonemes)) {
464 return pronunciation_by_phonemes_[phonemes];
465 } else {
466 size_t pronunciation_id = pronunciations_.size();
467
468 auto phonemeList = hatkirby::split<std::list<std::string>>(phonemes, " ");
469
470 std::list<std::string>::iterator rhymeStart =
471 std::find_if(std::begin(phonemeList), std::end(phonemeList),
472 [](std::string phoneme) {
473 return phoneme.find("1") != std::string::npos;
474 });
475
476 // Rhyme detection
477 std::string prerhyme = "";
478 std::string rhyme = "";
479 if (rhymeStart != std::end(phonemeList)) {
480 std::list<std::string> rhymePhonemes;
481
482 std::transform(
483 rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes),
484 [](std::string phoneme) {
485 std::string naked;
486
487 std::remove_copy_if(std::begin(phoneme), std::end(phoneme),
488 std::back_inserter(naked),
489 [](char ch) { return std::isdigit(ch); });
490
491 return naked;
492 });
493
494 rhyme = hatkirby::implode(std::begin(rhymePhonemes),
495 std::end(rhymePhonemes), " ");
496
497 if (rhymeStart != std::begin(phonemeList)) {
498 prerhyme = *std::prev(rhymeStart);
499 }
500
501 pronunciations_by_rhyme_[rhyme].push_back(pronunciation_id);
502 }
503
504 std::string stressless;
505 for (int i = 0; i < phonemes.size(); i++) {
506 if (!std::isdigit(phonemes[i])) {
507 stressless.push_back(phonemes[i]);
508 }
509 }
510 auto stresslessList =
511 hatkirby::split<std::vector<std::string>>(stressless, " ");
512 std::string stresslessPhonemes =
513 hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
514 std::sort(stresslessList.begin(), stresslessList.end());
515 std::string sortedPhonemes =
516 hatkirby::implode(stresslessList.begin(), stresslessList.end(), " ");
517
518 pronunciations_.push_back({.id = pronunciation_id,
519 .phonemes = phonemes,
520 .prerhyme = prerhyme,
521 .rhyme = rhyme,
522 .stressless_phonemes = stresslessPhonemes});
523
524 AddPronunciationToAnaphoneSet(pronunciation_id, sortedPhonemes);
525
526 pronunciation_by_phonemes_[phonemes] = pronunciation_id;
527 pronunciations_by_blank_phonemes_[stresslessPhonemes].push_back(
528 pronunciation_id);
529
530 return pronunciation_id;
531 }
532}
533
534size_t generator::LookupOrCreateForm(const std::string& word) {
535 if (form_by_text_.count(word)) {
536 return form_by_text_[word];
537 } else {
538 size_t form_id = forms_.size();
539 form_by_text_[word] = form_id;
540 forms_.push_back({.id = form_id, .text = word});
541
542 std::string sortedText = word;
543 std::sort(sortedText.begin(), sortedText.end());
544 AddFormToAnagramSet(form_id, sortedText);
545
546 return form_id;
547 }
548}
549
550size_t generator::LookupOrCreateWord(const std::string& word) {
551 if (word_by_base_.count(word)) {
552 return word_by_base_[word];
553 } else {
554 size_t word_id = words_.size();
555 word_by_base_[word] = words_.size();
556 size_t form_id = LookupOrCreateForm(word);
557 words_.push_back({.id = word_id, .base_form_id = form_id});
558 AddFormToWord(form_id, word_id);
559 return word_id;
560 }
561}
562
563void generator::AddPronunciationToForm(size_t pronunciation_id,
564 size_t form_id) {
565 pronunciations_[pronunciation_id].form_ids.push_back(form_id);
566 forms_[form_id].pronunciation_ids.push_back(pronunciation_id);
567}
568
569void generator::AddFormToWord(size_t form_id, size_t word_id) {
570 words_[word_id].form_ids.push_back(form_id);
571 forms_[form_id].word_ids.push_back(word_id);
572}
573
574void generator::AddWordToSynset(size_t word_id, int wnid) {
575 if (!synset_by_wnid_.count(wnid)) {
576 synset_by_wnid_[wnid] = synsets_.size();
577 synsets_.push_back({word_id});
578 words_[word_id].synsets.push_back(synsets_.size() - 1);
579 } else {
580 size_t synset_id = synset_by_wnid_[wnid];
581 synsets_[synset_id].push_back(word_id);
582 words_[word_id].synsets.push_back(synset_id);
583 }
584}
585
586void generator::AddFormToAnagramSet(size_t form_id,
587 const std::string& sorted_letters) {
588 if (!anagram_set_by_sorted_letters_.count(sorted_letters)) {
589 anagram_set_by_sorted_letters_[sorted_letters] = anagram_sets_.size();
590 anagram_sets_.push_back({form_id});
591 forms_[form_id].anagram_set_id = anagram_sets_.size() - 1;
592 } else {
593 size_t anagram_set_id = anagram_set_by_sorted_letters_[sorted_letters];
594 anagram_sets_[anagram_set_id].push_back(form_id);
595 forms_[form_id].anagram_set_id = anagram_set_id;
596 }
597}
598
599void generator::AddPronunciationToAnaphoneSet(
600 size_t pronunciation_id, const std::string& sorted_phonemes) {
601 if (!anaphone_set_by_sorted_phonemes_.count(sorted_phonemes)) {
602 anaphone_set_by_sorted_phonemes_[sorted_phonemes] = anaphone_sets_.size();
603 anaphone_sets_.push_back({pronunciation_id});
604 pronunciations_[pronunciation_id].anaphone_set_id =
605 anaphone_sets_.size() - 1;
606 } else {
607 size_t anaphone_set_id = anaphone_set_by_sorted_phonemes_[sorted_phonemes];
608 anaphone_sets_[anaphone_set_id].push_back(pronunciation_id);
609 pronunciations_[pronunciation_id].anaphone_set_id = anaphone_set_id;
610 }
611}