diff options
author | Star Rauchenberger <fefferburbia@gmail.com> | 2023-12-02 17:03:54 -0500 |
---|---|---|
committer | Star Rauchenberger <fefferburbia@gmail.com> | 2023-12-02 17:03:54 -0500 |
commit | 90fbd47ca02f1a723134302ca978a7f9ef0eac04 (patch) | |
tree | 759a8d906ae903dbe088e078ac7bd71a080fdad5 /generator | |
parent | 691e37d46f55c36d85f82d261f002bb03b82f11b (diff) | |
download | lingo-randomizer-90fbd47ca02f1a723134302ca978a7f9ef0eac04.tar.gz lingo-randomizer-90fbd47ca02f1a723134302ca978a7f9ef0eac04.tar.bz2 lingo-randomizer-90fbd47ca02f1a723134302ca978a7f9ef0eac04.zip |
Starting to get some puzzles randomized
Diffstat (limited to 'generator')
-rw-r--r-- | generator/.clang-format | 2 | ||||
-rw-r--r-- | generator/CMakeLists.txt | 9 | ||||
-rw-r--r-- | generator/generator.cpp | 611 | ||||
-rw-r--r-- | generator/generator.h | 114 | ||||
-rw-r--r-- | generator/main.cpp | 35 |
5 files changed, 771 insertions, 0 deletions
diff --git a/generator/.clang-format b/generator/.clang-format new file mode 100644 index 0000000..8de7fe6 --- /dev/null +++ b/generator/.clang-format | |||
@@ -0,0 +1,2 @@ | |||
1 | --- | ||
2 | BasedOnStyle: Google | ||
diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt new file mode 100644 index 0000000..343e5cb --- /dev/null +++ b/generator/CMakeLists.txt | |||
@@ -0,0 +1,9 @@ | |||
# Build script for the puzzle data generator executable.
#
# The CXX_STANDARD value 17 requested below is only recognised by CMake 3.8
# and newer, so that is the lowest version this script can honestly claim.
cmake_minimum_required (VERSION 3.8)
project (generator)

# hkutil is consumed as a header-only dependency from the vendor directory.
include_directories(../vendor/hkutil)

add_executable(generator generator.cpp main.cpp)
set_property(TARGET generator PROPERTY CXX_STANDARD 17)
set_property(TARGET generator PROPERTY CXX_STANDARD_REQUIRED ON)
#target_link_libraries(generator ${sqlite3_LIBRARIES} ${LIBXML2_LIBRARIES})
diff --git a/generator/generator.cpp b/generator/generator.cpp new file mode 100644 index 0000000..7ab69b5 --- /dev/null +++ b/generator/generator.cpp | |||
@@ -0,0 +1,611 @@ | |||
#include "generator.h"

#include <hkutil/progress.h>
#include <hkutil/string.h>

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <limits>
#include <list>
#include <regex>
#include <set>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
15 | |||
// Lowest count a word may have in the word frequency list and still be treated
// as common enough to be turned into puzzle material.
constexpr int MIN_FREQUENCY = 2'000'000;
17 | |||
18 | namespace { | ||
19 | |||
// Reads the text file at `path` and returns its lines.
//
// A trailing carriage return is stripped from every line so that files with
// CRLF line endings yield the same strings as files with LF endings.
//
// When `uniq` is true the lines are returned sorted with duplicates removed;
// otherwise they are returned in file order.
//
// Throws std::invalid_argument if the file cannot be opened.
std::list<std::string> readFile(std::string path, bool uniq = false) {
  std::ifstream file(path);
  if (!file) {
    throw std::invalid_argument("Could not find file " + path);
  }

  std::list<std::string> lines;
  std::string line;
  while (std::getline(file, line)) {
    // Blank lines are legal input; calling back() on an empty string is
    // undefined behaviour, so check for emptiness first.
    if (!line.empty() && line.back() == '\r') {
      line.pop_back();
    }

    lines.push_back(line);
  }

  if (uniq) {
    // std::list provides sort/unique directly, so no temporary vector (and no
    // local shadowing the `uniq` parameter) is needed.
    lines.sort();
    lines.unique();
  }

  return lines;
}
47 | |||
48 | } // namespace | ||
49 | |||
50 | generator::generator(std::string agidPath, std::string wordNetPath, | ||
51 | std::string cmudictPath, std::string wordfreqPath, | ||
52 | std::string outputPath) | ||
53 | : agidPath_(agidPath), | ||
54 | wordNetPath_(wordNetPath), | ||
55 | cmudictPath_(cmudictPath), | ||
56 | wordfreqPath_(wordfreqPath), | ||
57 | outputPath_(outputPath) { | ||
58 | // Ensure AGID infl.txt exists | ||
59 | if (!std::ifstream(agidPath_)) { | ||
60 | throw std::invalid_argument("AGID infl.txt file not found"); | ||
61 | } | ||
62 | |||
63 | // Add directory separator to WordNet path | ||
64 | if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\')) { | ||
65 | wordNetPath_ += '/'; | ||
66 | } | ||
67 | |||
68 | // Ensure WordNet tables exist | ||
69 | for (std::string table : {"s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", | ||
70 | "mp", "ms", "per", "sa", "sim", "syntax"}) { | ||
71 | if (!std::ifstream(wordNetPath_ + "wn_" + table + ".pl")) { | ||
72 | throw std::invalid_argument("WordNet " + table + " table not found"); | ||
73 | } | ||
74 | } | ||
75 | |||
76 | // Ensure CMUDICT file exists | ||
77 | if (!std::ifstream(cmudictPath_)) { | ||
78 | throw std::invalid_argument("CMUDICT file not found"); | ||
79 | } | ||
80 | } | ||
81 | |||
82 | void generator::run() { | ||
83 | std::unordered_map<std::string, int> word_frequencies; | ||
84 | { | ||
85 | std::list<std::string> lines(readFile(wordfreqPath_)); | ||
86 | |||
87 | hatkirby::progress ppgs("Reading word frequencies...", lines.size()); | ||
88 | |||
89 | for (std::string line : lines) { | ||
90 | ppgs.update(); | ||
91 | |||
92 | std::regex freqline("([a-z]+),([0-9]+)"); | ||
93 | std::smatch freqline_data; | ||
94 | if (std::regex_search(line, freqline_data, freqline)) { | ||
95 | std::string text = freqline_data[1]; | ||
96 | std::string freqnumstr = freqline_data[2]; | ||
97 | long long freqnumnum = std::atoll(freqnumstr.c_str()); | ||
98 | word_frequencies[text] = freqnumnum > std::numeric_limits<int>::max() | ||
99 | ? std::numeric_limits<int>::max() | ||
100 | : freqnumnum; | ||
101 | } | ||
102 | } | ||
103 | } | ||
104 | |||
105 | { | ||
106 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); | ||
107 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); | ||
108 | |||
109 | std::set<std::pair<int, int>> wnid_and_wnum; | ||
110 | for (std::string line : lines) { | ||
111 | ppgs.update(); | ||
112 | |||
113 | std::regex relation( | ||
114 | "^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$"); | ||
115 | |||
116 | std::smatch relation_data; | ||
117 | if (!std::regex_search(line, relation_data, relation)) { | ||
118 | continue; | ||
119 | } | ||
120 | |||
121 | int synset_id = std::stoi(relation_data[1]); | ||
122 | int wnum = std::stoi(relation_data[2]); | ||
123 | std::string text = relation_data[3]; | ||
124 | int tag_count = std::stoi(relation_data[4]); | ||
125 | size_t word_it; | ||
126 | while ((word_it = text.find("''")) != std::string::npos) { | ||
127 | text.erase(word_it, 1); | ||
128 | } | ||
129 | |||
130 | // The word must be common enough. | ||
131 | if (word_frequencies[text] < MIN_FREQUENCY) { | ||
132 | continue; | ||
133 | } | ||
134 | |||
135 | // We are looking for single words. | ||
136 | if (std::count(std::begin(text), std::end(text), ' ') > 0) { | ||
137 | continue; | ||
138 | } | ||
139 | |||
140 | // This should filter our proper nouns. | ||
141 | if (std::any_of(std::begin(text), std::end(text), ::isupper)) { | ||
142 | continue; | ||
143 | } | ||
144 | |||
145 | // The WordNet data does contain duplicates, so we need to check that we | ||
146 | // haven't already created this word. | ||
147 | std::pair<int, int> lookup(synset_id, wnum); | ||
148 | if (wnid_and_wnum.count(lookup)) { | ||
149 | continue; | ||
150 | } | ||
151 | |||
152 | wnid_and_wnum.insert(lookup); | ||
153 | |||
154 | size_t word_id = LookupOrCreateWord(text); | ||
155 | AddWordToSynset(word_id, synset_id); | ||
156 | } | ||
157 | } | ||
158 | |||
159 | { | ||
160 | std::list<std::string> lines(readFile(agidPath_)); | ||
161 | hatkirby::progress ppgs("Reading inflections from AGID...", lines.size()); | ||
162 | |||
163 | for (std::string line : lines) { | ||
164 | ppgs.update(); | ||
165 | |||
166 | int divider = line.find_first_of(" "); | ||
167 | std::string infinitive = line.substr(0, divider); | ||
168 | line = line.substr(divider + 1); | ||
169 | char type = line[0]; | ||
170 | |||
171 | if (line[1] == '?') { | ||
172 | line.erase(0, 4); | ||
173 | } else { | ||
174 | line.erase(0, 3); | ||
175 | } | ||
176 | |||
177 | if (!word_by_base_.count(infinitive) && | ||
178 | !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { | ||
179 | continue; | ||
180 | } | ||
181 | |||
182 | size_t word_id = LookupOrCreateWord(infinitive); | ||
183 | |||
184 | auto inflWordList = hatkirby::split<std::list<std::string>>(line, " | "); | ||
185 | |||
186 | std::vector<std::list<std::string>> agidForms; | ||
187 | for (std::string inflForms : inflWordList) { | ||
188 | auto inflFormList = | ||
189 | hatkirby::split<std::list<std::string>>(std::move(inflForms), ", "); | ||
190 | |||
191 | std::list<std::string> forms; | ||
192 | for (std::string inflForm : inflFormList) { | ||
193 | int sympos = inflForm.find_first_of("~<!? "); | ||
194 | if (sympos != std::string::npos) { | ||
195 | inflForm = inflForm.substr(0, sympos); | ||
196 | } | ||
197 | |||
198 | forms.push_back(std::move(inflForm)); | ||
199 | } | ||
200 | |||
201 | agidForms.push_back(std::move(forms)); | ||
202 | } | ||
203 | |||
204 | std::vector<std::list<std::string>> inflections; | ||
205 | switch (type) { | ||
206 | case 'V': { | ||
207 | if (agidForms.size() == 4) { | ||
208 | inflections.push_back(agidForms[0]); | ||
209 | inflections.push_back(agidForms[1]); | ||
210 | inflections.push_back(agidForms[2]); | ||
211 | inflections.push_back(agidForms[3]); | ||
212 | } else if (agidForms.size() == 3) { | ||
213 | inflections.push_back(agidForms[0]); | ||
214 | inflections.push_back(agidForms[1]); | ||
215 | inflections.push_back(agidForms[2]); | ||
216 | } else if (agidForms.size() == 8) { | ||
217 | // As of AGID 2014.08.11, this is only "to be" | ||
218 | inflections.push_back(agidForms[0]); | ||
219 | inflections.push_back(agidForms[2]); | ||
220 | inflections.push_back(agidForms[3]); | ||
221 | inflections.push_back(agidForms[4]); | ||
222 | } else { | ||
223 | // Words that don't fit the cases above as of AGID 2014.08.11: | ||
224 | // - may and shall do not conjugate the way we want them to | ||
225 | // - methinks only has a past tense and is an outlier | ||
226 | // - wit has five forms, and is archaic/obscure enough that we can | ||
227 | // ignore it for now | ||
228 | std::cout << " Ignoring verb \"" << infinitive | ||
229 | << "\" due to non-standard number of forms." << std::endl; | ||
230 | } | ||
231 | |||
232 | break; | ||
233 | } | ||
234 | |||
235 | case 'A': { | ||
236 | if (agidForms.size() == 2) { | ||
237 | inflections.push_back(agidForms[0]); | ||
238 | inflections.push_back(agidForms[1]); | ||
239 | } else { | ||
240 | // As of AGID 2014.08.11, this is only "only", which has only the | ||
241 | // form "onliest" | ||
242 | std::cout << " Ignoring adjective/adverb \"" << infinitive | ||
243 | << "\" due to non-standard number of forms." << std::endl; | ||
244 | } | ||
245 | |||
246 | break; | ||
247 | } | ||
248 | |||
249 | case 'N': { | ||
250 | if (agidForms.size() == 1) { | ||
251 | inflections.push_back(agidForms[0]); | ||
252 | } else { | ||
253 | // As of AGID 2014.08.11, this is non-existent. | ||
254 | std::cout << " Ignoring noun \"" << infinitive | ||
255 | << "\" due to non-standard number of forms." << std::endl; | ||
256 | } | ||
257 | |||
258 | break; | ||
259 | } | ||
260 | } | ||
261 | |||
262 | // Compile the forms we have mapped. | ||
263 | for (const std::list<std::string>& infl_list : inflections) { | ||
264 | for (const std::string& infl : infl_list) { | ||
265 | size_t form_id = LookupOrCreateForm(infl); | ||
266 | AddFormToWord(form_id, word_id); | ||
267 | } | ||
268 | } | ||
269 | } | ||
270 | } | ||
271 | |||
272 | word_frequencies.clear(); // Not needed anymore. | ||
273 | |||
274 | { | ||
275 | std::list<std::string> lines(readFile(cmudictPath_)); | ||
276 | |||
277 | hatkirby::progress ppgs("Reading pronunciations from CMUDICT...", | ||
278 | lines.size()); | ||
279 | |||
280 | for (std::string line : lines) { | ||
281 | ppgs.update(); | ||
282 | |||
283 | std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))? ([A-Z 0-9]+)"); | ||
284 | std::smatch phoneme_data; | ||
285 | if (std::regex_search(line, phoneme_data, phoneme)) { | ||
286 | std::string canonical = hatkirby::lowercase(phoneme_data[1]); | ||
287 | |||
288 | if (!form_by_text_.count(canonical)) { | ||
289 | continue; | ||
290 | } | ||
291 | |||
292 | std::string phonemes = phoneme_data[2]; | ||
293 | size_t pronunciation_id = LookupOrCreatePronunciation(phonemes); | ||
294 | AddPronunciationToForm(pronunciation_id, form_by_text_[canonical]); | ||
295 | } | ||
296 | } | ||
297 | } | ||
298 | |||
299 | std::cout << "Words: " << words_.size() << std::endl; | ||
300 | std::cout << "Forms: " << forms_.size() << std::endl; | ||
301 | std::cout << "Pronunciations: " << pronunciations_.size() << std::endl; | ||
302 | |||
303 | // White Top | ||
304 | { | ||
305 | hatkirby::progress ppgs("Generating white top puzzles...", forms_.size()); | ||
306 | |||
307 | for (Form& form : forms_) { | ||
308 | ppgs.update(); | ||
309 | |||
310 | for (size_t p_id : form.pronunciation_ids) { | ||
311 | const Pronunciation& pronunciation = pronunciations_.at(p_id); | ||
312 | for (size_t other_form_id : pronunciation.form_ids) { | ||
313 | if (other_form_id != form.id) { | ||
314 | form.puzzles[kWhiteTop].insert(other_form_id); | ||
315 | } | ||
316 | } | ||
317 | } | ||
318 | } | ||
319 | } | ||
320 | |||
321 | // White Bottom | ||
322 | { | ||
323 | hatkirby::progress ppgs("Generating white bottom puzzles...", | ||
324 | words_.size()); | ||
325 | |||
326 | for (const Word& word : words_) { | ||
327 | ppgs.update(); | ||
328 | |||
329 | Form& form = forms_.at(word.base_form_id); | ||
330 | for (size_t synset_id : word.synsets) { | ||
331 | for (size_t other_word_id : synsets_.at(synset_id)) { | ||
332 | if (other_word_id != word.id) { | ||
333 | const Word& other_word = words_.at(other_word_id); | ||
334 | form.puzzles[kWhiteBottom].insert(other_word.base_form_id); | ||
335 | } | ||
336 | } | ||
337 | } | ||
338 | } | ||
339 | } | ||
340 | |||
341 | // Yellow Top | ||
342 | { | ||
343 | hatkirby::progress ppgs("Generating yellow top puzzles...", | ||
344 | anaphone_sets_.size()); | ||
345 | |||
346 | for (const std::vector<size_t>& anaphone_set : anaphone_sets_) { | ||
347 | ppgs.update(); | ||
348 | |||
349 | std::set<size_t> all_forms; | ||
350 | for (size_t p_id : anaphone_set) { | ||
351 | const Pronunciation& pronunciation = pronunciations_.at(p_id); | ||
352 | for (size_t form_id : pronunciation.form_ids) { | ||
353 | all_forms.insert(form_id); | ||
354 | } | ||
355 | } | ||
356 | |||
357 | for (size_t f_id1 : all_forms) { | ||
358 | for (size_t f_id2 : all_forms) { | ||
359 | if (f_id1 != f_id2) { | ||
360 | Form& form = forms_.at(f_id1); | ||
361 | form.puzzles[kYellowTop].insert(f_id2); | ||
362 | } | ||
363 | } | ||
364 | } | ||
365 | } | ||
366 | } | ||
367 | |||
368 | // Yellow Middle | ||
369 | { | ||
370 | hatkirby::progress ppgs("Generating yellow middle puzzles...", | ||
371 | anagram_sets_.size()); | ||
372 | |||
373 | for (const std::vector<size_t>& anagram_set : anagram_sets_) { | ||
374 | ppgs.update(); | ||
375 | |||
376 | for (size_t f_id1 : anagram_set) { | ||
377 | for (size_t f_id2 : anagram_set) { | ||
378 | if (f_id1 != f_id2) { | ||
379 | Form& form = forms_.at(f_id1); | ||
380 | form.puzzles[kYellowMiddle].insert(f_id2); | ||
381 | } | ||
382 | } | ||
383 | } | ||
384 | } | ||
385 | } | ||
386 | |||
387 | // Black Top | ||
388 | { | ||
389 | hatkirby::progress ppgs("Generating black top puzzles...", | ||
390 | pronunciations_.size()); | ||
391 | |||
392 | for (const Pronunciation& pronunciation : pronunciations_) { | ||
393 | ppgs.update(); | ||
394 | |||
395 | auto reversed_list = hatkirby::split<std::vector<std::string>>( | ||
396 | pronunciation.stressless_phonemes, " "); | ||
397 | std::reverse(reversed_list.begin(), reversed_list.end()); | ||
398 | std::string reversed_phonemes = | ||
399 | hatkirby::implode(reversed_list.begin(), reversed_list.end(), " "); | ||
400 | if (pronunciations_by_blank_phonemes_.count(reversed_phonemes)) { | ||
401 | std::set<size_t> all_forms; | ||
402 | |||
403 | for (size_t p_id : | ||
404 | pronunciations_by_blank_phonemes_.at(reversed_phonemes)) { | ||
405 | const Pronunciation& other_pronunciation = pronunciations_.at(p_id); | ||
406 | for (size_t form_id : other_pronunciation.form_ids) { | ||
407 | all_forms.insert(form_id); | ||
408 | } | ||
409 | } | ||
410 | |||
411 | for (size_t f_id1 : pronunciation.form_ids) { | ||
412 | for (size_t f_id2 : all_forms) { | ||
413 | Form& form = forms_.at(f_id1); | ||
414 | form.puzzles[kBlackTop].insert(f_id2); | ||
415 | } | ||
416 | } | ||
417 | } | ||
418 | } | ||
419 | } | ||
420 | |||
421 | // Black Middle | ||
422 | { | ||
423 | hatkirby::progress ppgs("Generating black middle puzzles...", | ||
424 | forms_.size()); | ||
425 | |||
426 | for (Form& form : forms_) { | ||
427 | ppgs.update(); | ||
428 | |||
429 | std::string reversed_text = form.text; | ||
430 | std::reverse(reversed_text.begin(), reversed_text.end()); | ||
431 | |||
432 | if (form_by_text_.count(reversed_text)) { | ||
433 | form.puzzles[kBlackMiddle].insert(form_by_text_.at(reversed_text)); | ||
434 | } | ||
435 | } | ||
436 | } | ||
437 | |||
438 | // Count up all of the generated puzzles. | ||
439 | int total_puzzles = 0; | ||
440 | int reusable_words = 0; | ||
441 | std::unordered_map<PuzzleType, int> per_puzzle_type; | ||
442 | for (const Form& form : forms_) { | ||
443 | for (const auto& [puzzle_type, puzzles] : form.puzzles) { | ||
444 | total_puzzles += puzzles.size(); | ||
445 | per_puzzle_type[puzzle_type]++; | ||
446 | } | ||
447 | if (form.puzzles.size() > 1) { | ||
448 | reusable_words++; | ||
449 | } | ||
450 | } | ||
451 | std::cout << "Puzzles: " << total_puzzles << std::endl; | ||
452 | std::cout << "Reusable words: " << reusable_words << std::endl; | ||
453 | std::cout << "White tops: " << per_puzzle_type[kWhiteTop] << std::endl; | ||
454 | std::cout << "White bottom: " << per_puzzle_type[kWhiteBottom] << std::endl; | ||
455 | std::cout << "Yellow tops: " << per_puzzle_type[kYellowTop] << std::endl; | ||
456 | std::cout << "Yellow middles: " << per_puzzle_type[kYellowMiddle] | ||
457 | << std::endl; | ||
458 | std::cout << "Black tops: " << per_puzzle_type[kBlackTop] << std::endl; | ||
459 | std::cout << "Black middles: " << per_puzzle_type[kBlackMiddle] << std::endl; | ||
460 | } | ||
461 | |||
462 | size_t generator::LookupOrCreatePronunciation(const std::string& phonemes) { | ||
463 | if (pronunciation_by_phonemes_.count(phonemes)) { | ||
464 | return pronunciation_by_phonemes_[phonemes]; | ||
465 | } else { | ||
466 | size_t pronunciation_id = pronunciations_.size(); | ||
467 | |||
468 | auto phonemeList = hatkirby::split<std::list<std::string>>(phonemes, " "); | ||
469 | |||
470 | std::list<std::string>::iterator rhymeStart = | ||
471 | std::find_if(std::begin(phonemeList), std::end(phonemeList), | ||
472 | [](std::string phoneme) { | ||
473 | return phoneme.find("1") != std::string::npos; | ||
474 | }); | ||
475 | |||
476 | // Rhyme detection | ||
477 | std::string prerhyme = ""; | ||
478 | std::string rhyme = ""; | ||
479 | if (rhymeStart != std::end(phonemeList)) { | ||
480 | std::list<std::string> rhymePhonemes; | ||
481 | |||
482 | std::transform( | ||
483 | rhymeStart, std::end(phonemeList), std::back_inserter(rhymePhonemes), | ||
484 | [](std::string phoneme) { | ||
485 | std::string naked; | ||
486 | |||
487 | std::remove_copy_if(std::begin(phoneme), std::end(phoneme), | ||
488 | std::back_inserter(naked), | ||
489 | [](char ch) { return std::isdigit(ch); }); | ||
490 | |||
491 | return naked; | ||
492 | }); | ||
493 | |||
494 | rhyme = hatkirby::implode(std::begin(rhymePhonemes), | ||
495 | std::end(rhymePhonemes), " "); | ||
496 | |||
497 | if (rhymeStart != std::begin(phonemeList)) { | ||
498 | prerhyme = *std::prev(rhymeStart); | ||
499 | } | ||
500 | |||
501 | pronunciations_by_rhyme_[rhyme].push_back(pronunciation_id); | ||
502 | } | ||
503 | |||
504 | std::string stressless; | ||
505 | for (int i = 0; i < phonemes.size(); i++) { | ||
506 | if (!std::isdigit(phonemes[i])) { | ||
507 | stressless.push_back(phonemes[i]); | ||
508 | } | ||
509 | } | ||
510 | auto stresslessList = | ||
511 | hatkirby::split<std::vector<std::string>>(stressless, " "); | ||
512 | std::string stresslessPhonemes = | ||
513 | hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
514 | std::sort(stresslessList.begin(), stresslessList.end()); | ||
515 | std::string sortedPhonemes = | ||
516 | hatkirby::implode(stresslessList.begin(), stresslessList.end(), " "); | ||
517 | |||
518 | pronunciations_.push_back({.id = pronunciation_id, | ||
519 | .phonemes = phonemes, | ||
520 | .prerhyme = prerhyme, | ||
521 | .rhyme = rhyme, | ||
522 | .stressless_phonemes = stresslessPhonemes}); | ||
523 | |||
524 | AddPronunciationToAnaphoneSet(pronunciation_id, sortedPhonemes); | ||
525 | |||
526 | pronunciation_by_phonemes_[phonemes] = pronunciation_id; | ||
527 | pronunciations_by_blank_phonemes_[stresslessPhonemes].push_back( | ||
528 | pronunciation_id); | ||
529 | |||
530 | return pronunciation_id; | ||
531 | } | ||
532 | } | ||
533 | |||
534 | size_t generator::LookupOrCreateForm(const std::string& word) { | ||
535 | if (form_by_text_.count(word)) { | ||
536 | return form_by_text_[word]; | ||
537 | } else { | ||
538 | size_t form_id = forms_.size(); | ||
539 | form_by_text_[word] = form_id; | ||
540 | forms_.push_back({.id = form_id, .text = word}); | ||
541 | |||
542 | std::string sortedText = word; | ||
543 | std::sort(sortedText.begin(), sortedText.end()); | ||
544 | AddFormToAnagramSet(form_id, sortedText); | ||
545 | |||
546 | return form_id; | ||
547 | } | ||
548 | } | ||
549 | |||
550 | size_t generator::LookupOrCreateWord(const std::string& word) { | ||
551 | if (word_by_base_.count(word)) { | ||
552 | return word_by_base_[word]; | ||
553 | } else { | ||
554 | size_t word_id = words_.size(); | ||
555 | word_by_base_[word] = words_.size(); | ||
556 | size_t form_id = LookupOrCreateForm(word); | ||
557 | words_.push_back({.id = word_id, .base_form_id = form_id}); | ||
558 | AddFormToWord(form_id, word_id); | ||
559 | return word_id; | ||
560 | } | ||
561 | } | ||
562 | |||
563 | void generator::AddPronunciationToForm(size_t pronunciation_id, | ||
564 | size_t form_id) { | ||
565 | pronunciations_[pronunciation_id].form_ids.push_back(form_id); | ||
566 | forms_[form_id].pronunciation_ids.push_back(pronunciation_id); | ||
567 | } | ||
568 | |||
569 | void generator::AddFormToWord(size_t form_id, size_t word_id) { | ||
570 | words_[word_id].form_ids.push_back(form_id); | ||
571 | forms_[form_id].word_ids.push_back(word_id); | ||
572 | } | ||
573 | |||
574 | void generator::AddWordToSynset(size_t word_id, int wnid) { | ||
575 | if (!synset_by_wnid_.count(wnid)) { | ||
576 | synset_by_wnid_[wnid] = synsets_.size(); | ||
577 | synsets_.push_back({word_id}); | ||
578 | words_[word_id].synsets.push_back(synsets_.size() - 1); | ||
579 | } else { | ||
580 | size_t synset_id = synset_by_wnid_[wnid]; | ||
581 | synsets_[synset_id].push_back(word_id); | ||
582 | words_[word_id].synsets.push_back(synset_id); | ||
583 | } | ||
584 | } | ||
585 | |||
586 | void generator::AddFormToAnagramSet(size_t form_id, | ||
587 | const std::string& sorted_letters) { | ||
588 | if (!anagram_set_by_sorted_letters_.count(sorted_letters)) { | ||
589 | anagram_set_by_sorted_letters_[sorted_letters] = anagram_sets_.size(); | ||
590 | anagram_sets_.push_back({form_id}); | ||
591 | forms_[form_id].anagram_set_id = anagram_sets_.size() - 1; | ||
592 | } else { | ||
593 | size_t anagram_set_id = anagram_set_by_sorted_letters_[sorted_letters]; | ||
594 | anagram_sets_[anagram_set_id].push_back(form_id); | ||
595 | forms_[form_id].anagram_set_id = anagram_set_id; | ||
596 | } | ||
597 | } | ||
598 | |||
599 | void generator::AddPronunciationToAnaphoneSet( | ||
600 | size_t pronunciation_id, const std::string& sorted_phonemes) { | ||
601 | if (!anaphone_set_by_sorted_phonemes_.count(sorted_phonemes)) { | ||
602 | anaphone_set_by_sorted_phonemes_[sorted_phonemes] = anaphone_sets_.size(); | ||
603 | anaphone_sets_.push_back({pronunciation_id}); | ||
604 | pronunciations_[pronunciation_id].anaphone_set_id = | ||
605 | anaphone_sets_.size() - 1; | ||
606 | } else { | ||
607 | size_t anaphone_set_id = anaphone_set_by_sorted_phonemes_[sorted_phonemes]; | ||
608 | anaphone_sets_[anaphone_set_id].push_back(pronunciation_id); | ||
609 | pronunciations_[pronunciation_id].anaphone_set_id = anaphone_set_id; | ||
610 | } | ||
611 | } | ||
diff --git a/generator/generator.h b/generator/generator.h new file mode 100644 index 0000000..a97b0b0 --- /dev/null +++ b/generator/generator.h | |||
@@ -0,0 +1,114 @@ | |||
1 | #ifndef GENERATOR_H_D5C6A724 | ||
2 | #define GENERATOR_H_D5C6A724 | ||
3 | |||
4 | #include <optional> | ||
5 | #include <set> | ||
6 | #include <string> | ||
7 | #include <unordered_map> | ||
8 | #include <vector> | ||
9 | |||
// Kinds of puzzle relation the generator derives between forms. The
// enumerators are numbered consecutively from zero.
enum PuzzleType {
  kWhiteTop = 0,  // forms sharing a pronunciation
  kWhiteBottom,   // base forms of words sharing a synset
  kYellowTop,     // forms whose pronunciations use the same phonemes
  kYellowMiddle,  // forms spelled with the same letters
  kBlackTop,      // forms pronounced with the phonemes in reverse order
  kBlackMiddle,   // forms spelled in reverse
};
18 | |||
19 | class generator { | ||
20 | public: | ||
21 | // Constructor | ||
22 | |||
23 | generator(std::string agidPath, std::string wordNetPath, | ||
24 | std::string cmudictPath, std::string wordfreqPath, | ||
25 | std::string outputPath); | ||
26 | |||
27 | // Action | ||
28 | |||
29 | void run(); | ||
30 | |||
31 | private: | ||
32 | // Helpers | ||
33 | |||
34 | size_t LookupOrCreatePronunciation(const std::string& phonemes); | ||
35 | |||
36 | size_t LookupOrCreateForm(const std::string& form); | ||
37 | |||
38 | size_t LookupOrCreateWord(const std::string& word); | ||
39 | |||
40 | void AddPronunciationToForm(size_t pronunciation_id, size_t form_id); | ||
41 | |||
42 | void AddFormToWord(size_t form_id, size_t word_id); | ||
43 | |||
44 | void AddWordToSynset(size_t word_id, int wnid); | ||
45 | |||
46 | void AddFormToAnagramSet(size_t form_id, const std::string& sorted_letters); | ||
47 | |||
48 | void AddPronunciationToAnaphoneSet(size_t pronunciation_id, | ||
49 | const std::string& sorted_phonemes); | ||
50 | |||
51 | // Input | ||
52 | |||
53 | std::string agidPath_; | ||
54 | std::string wordNetPath_; | ||
55 | std::string cmudictPath_; | ||
56 | std::string wordfreqPath_; | ||
57 | |||
58 | // Output | ||
59 | |||
60 | std::string outputPath_; | ||
61 | |||
62 | // Indexes | ||
63 | |||
64 | struct Pronunciation { | ||
65 | size_t id; | ||
66 | std::string phonemes; | ||
67 | std::string prerhyme; | ||
68 | std::string rhyme; | ||
69 | std::vector<size_t> form_ids; | ||
70 | std::optional<size_t> anaphone_set_id; | ||
71 | std::string stressless_phonemes; | ||
72 | }; | ||
73 | |||
74 | struct Form { | ||
75 | size_t id; | ||
76 | std::string text; | ||
77 | std::vector<size_t> word_ids; | ||
78 | std::vector<size_t> pronunciation_ids; | ||
79 | std::optional<size_t> anagram_set_id; | ||
80 | std::optional<size_t> reverse_form_id; | ||
81 | |||
82 | std::unordered_map<PuzzleType, std::set<size_t>> puzzles; | ||
83 | }; | ||
84 | |||
85 | struct Word { | ||
86 | size_t id; | ||
87 | size_t base_form_id; | ||
88 | std::vector<size_t> form_ids; | ||
89 | std::vector<size_t> synsets; | ||
90 | }; | ||
91 | |||
92 | std::vector<Pronunciation> pronunciations_; | ||
93 | std::unordered_map<std::string, size_t> pronunciation_by_phonemes_; | ||
94 | std::unordered_map<std::string, std::vector<size_t>> pronunciations_by_rhyme_; | ||
95 | std::unordered_map<std::string, std::vector<size_t>> | ||
96 | pronunciations_by_blank_phonemes_; | ||
97 | |||
98 | std::vector<std::vector<size_t>> anaphone_sets_; | ||
99 | std::unordered_map<std::string, size_t> anaphone_set_by_sorted_phonemes_; | ||
100 | |||
101 | std::vector<Form> forms_; | ||
102 | std::unordered_map<std::string, size_t> form_by_text_; | ||
103 | |||
104 | std::vector<std::vector<size_t>> anagram_sets_; | ||
105 | std::unordered_map<std::string, size_t> anagram_set_by_sorted_letters_; | ||
106 | |||
107 | std::vector<Word> words_; | ||
108 | std::unordered_map<std::string, size_t> word_by_base_; | ||
109 | |||
110 | std::vector<std::vector<size_t>> synsets_; | ||
111 | std::unordered_map<int, size_t> synset_by_wnid_; | ||
112 | }; | ||
113 | |||
114 | #endif /* end of include guard: GENERATOR_H_D5C6A724 */ \ No newline at end of file | ||
diff --git a/generator/main.cpp b/generator/main.cpp new file mode 100644 index 0000000..c958421 --- /dev/null +++ b/generator/main.cpp | |||
@@ -0,0 +1,35 @@ | |||
1 | #include <exception> | ||
2 | #include <iostream> | ||
3 | |||
4 | #include "generator.h" | ||
5 | |||
// Prints the command line synopsis and a one-line description of each
// positional argument to standard output.
void printUsage() {
  static const char* const kUsageLines[] = {
      "usage: generator agid wordnet cmudict wordfreq output",
      "agid :: path to an AGID infl.txt file",
      "wordnet :: path to a WordNet prolog data directory",
      "cmudict :: path to a CMUDICT pronunciation file",
      "wordfreq :: path to a word frequency CSV file",
      "output :: datafile output path",
  };

  for (const char* usage_line : kUsageLines) {
    std::cout << usage_line << std::endl;
  }
}
16 | |||
17 | int main(int argc, char** argv) { | ||
18 | if (argc == 6) { | ||
19 | try { | ||
20 | generator app(argv[1], argv[2], argv[3], argv[4], argv[5]); | ||
21 | |||
22 | try { | ||
23 | app.run(); | ||
24 | } catch (const std::exception& e) { | ||
25 | std::cout << e.what() << std::endl; | ||
26 | } | ||
27 | } catch (const std::exception& e) { | ||
28 | std::cout << e.what() << std::endl; | ||
29 | printUsage(); | ||
30 | } | ||
31 | } else { | ||
32 | std::cout << "lingo randomizer generator" << std::endl; | ||
33 | printUsage(); | ||
34 | } | ||
35 | } | ||