summary refs log tree commit diff stats
path: root/generator
diff options
context:
space:
mode:
author: Star Rauchenberger <fefferburbia@gmail.com> 2023-12-02 17:10:48 -0500
committer: Star Rauchenberger <fefferburbia@gmail.com> 2023-12-02 17:10:48 -0500
commit: 17778ac3ab8598eb3d43f562a092b9aa7c0a1a42 (patch)
tree: 64604b3f8ef46e3f439229c80837e60768933c06 /generator
parent: 90fbd47ca02f1a723134302ca978a7f9ef0eac04 (diff)
download: lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.tar.gz
lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.tar.bz2
lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.zip
Filter out profane words
Diffstat (limited to 'generator')
-rw-r--r-- generator/generator.cpp 27
-rw-r--r-- generator/generator.h 4
-rw-r--r-- generator/main.cpp 7
3 files changed, 30 insertions, 8 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 7ab69b5..0309482 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -4,6 +4,7 @@
4#include <hkutil/string.h> 4#include <hkutil/string.h>
5 5
6#include <algorithm> 6#include <algorithm>
7#include <filesystem>
7#include <fstream> 8#include <fstream>
8#include <list> 9#include <list>
9#include <regex> 10#include <regex>
@@ -11,6 +12,7 @@
11#include <stdexcept> 12#include <stdexcept>
12#include <string> 13#include <string>
13#include <unordered_map> 14#include <unordered_map>
15#include <unordered_set>
14#include <vector> 16#include <vector>
15 17
16constexpr int MIN_FREQUENCY = 2000000; 18constexpr int MIN_FREQUENCY = 2000000;
@@ -49,11 +51,12 @@ std::list<std::string> readFile(std::string path, bool uniq = false) {
49 51
50generator::generator(std::string agidPath, std::string wordNetPath, 52generator::generator(std::string agidPath, std::string wordNetPath,
51 std::string cmudictPath, std::string wordfreqPath, 53 std::string cmudictPath, std::string wordfreqPath,
52 std::string outputPath) 54 std::string datadirPath, std::string outputPath)
53 : agidPath_(agidPath), 55 : agidPath_(agidPath),
54 wordNetPath_(wordNetPath), 56 wordNetPath_(wordNetPath),
55 cmudictPath_(cmudictPath), 57 cmudictPath_(cmudictPath),
56 wordfreqPath_(wordfreqPath), 58 wordfreqPath_(wordfreqPath),
59 datadirPath_(datadirPath),
57 outputPath_(outputPath) { 60 outputPath_(outputPath) {
58 // Ensure AGID infl.txt exists 61 // Ensure AGID infl.txt exists
59 if (!std::ifstream(agidPath_)) { 62 if (!std::ifstream(agidPath_)) {
@@ -102,6 +105,14 @@ void generator::run() {
102 } 105 }
103 } 106 }
104 107
108 std::unordered_set<std::string> profane;
109 {
110 std::list<std::string> lines(readFile(datadirPath_ / "profane.txt"));
111 for (const std::string& line : lines) {
112 profane.insert(line);
113 }
114 }
115
105 { 116 {
106 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); 117 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
107 hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); 118 hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size());
@@ -142,6 +153,11 @@ void generator::run() {
142 continue; 153 continue;
143 } 154 }
144 155
156 // Ignore any profane words.
157 if (profane.count(text)) {
158 continue;
159 }
160
145 // The WordNet data does contain duplicates, so we need to check that we 161 // The WordNet data does contain duplicates, so we need to check that we
146 // haven't already created this word. 162 // haven't already created this word.
147 std::pair<int, int> lookup(synset_id, wnum); 163 std::pair<int, int> lookup(synset_id, wnum);
@@ -175,7 +191,8 @@ void generator::run() {
175 } 191 }
176 192
177 if (!word_by_base_.count(infinitive) && 193 if (!word_by_base_.count(infinitive) &&
178 !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { 194 !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY &&
195 !profane.count(infinitive))) {
179 continue; 196 continue;
180 } 197 }
181 198
@@ -262,8 +279,10 @@ void generator::run() {
262 // Compile the forms we have mapped. 279 // Compile the forms we have mapped.
263 for (const std::list<std::string>& infl_list : inflections) { 280 for (const std::list<std::string>& infl_list : inflections) {
264 for (const std::string& infl : infl_list) { 281 for (const std::string& infl : infl_list) {
265 size_t form_id = LookupOrCreateForm(infl); 282 if (!profane.count(infl)) {
266 AddFormToWord(form_id, word_id); 283 size_t form_id = LookupOrCreateForm(infl);
284 AddFormToWord(form_id, word_id);
285 }
267 } 286 }
268 } 287 }
269 } 288 }
diff --git a/generator/generator.h b/generator/generator.h index a97b0b0..923fc17 100644 --- a/generator/generator.h +++ b/generator/generator.h
@@ -1,6 +1,7 @@
1#ifndef GENERATOR_H_D5C6A724 1#ifndef GENERATOR_H_D5C6A724
2#define GENERATOR_H_D5C6A724 2#define GENERATOR_H_D5C6A724
3 3
4#include <filesystem>
4#include <optional> 5#include <optional>
5#include <set> 6#include <set>
6#include <string> 7#include <string>
@@ -22,7 +23,7 @@ class generator {
22 23
23 generator(std::string agidPath, std::string wordNetPath, 24 generator(std::string agidPath, std::string wordNetPath,
24 std::string cmudictPath, std::string wordfreqPath, 25 std::string cmudictPath, std::string wordfreqPath,
25 std::string outputPath); 26 std::string datadirPath, std::string outputPath);
26 27
27 // Action 28 // Action
28 29
@@ -54,6 +55,7 @@ class generator {
54 std::string wordNetPath_; 55 std::string wordNetPath_;
55 std::string cmudictPath_; 56 std::string cmudictPath_;
56 std::string wordfreqPath_; 57 std::string wordfreqPath_;
58 std::filesystem::path datadirPath_;
57 59
58 // Output 60 // Output
59 61
diff --git a/generator/main.cpp b/generator/main.cpp index c958421..94bf0a1 100644 --- a/generator/main.cpp +++ b/generator/main.cpp
@@ -4,20 +4,21 @@
4#include "generator.h" 4#include "generator.h"
5 5
6void printUsage() { 6void printUsage() {
7 std::cout << "usage: generator agid wordnet cmudict wordfreq output" 7 std::cout << "usage: generator agid wordnet cmudict wordfreq datadir output"
8 << std::endl; 8 << std::endl;
9 std::cout << "agid :: path to an AGID infl.txt file" << std::endl; 9 std::cout << "agid :: path to an AGID infl.txt file" << std::endl;
10 std::cout << "wordnet :: path to a WordNet prolog data directory" 10 std::cout << "wordnet :: path to a WordNet prolog data directory"
11 << std::endl; 11 << std::endl;
12 std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; 12 std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl;
13 std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; 13 std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl;
14 std::cout << "datadir :: path to the Lingo Randomizer datadir" << std::endl;
14 std::cout << "output :: datafile output path" << std::endl; 15 std::cout << "output :: datafile output path" << std::endl;
15} 16}
16 17
17int main(int argc, char** argv) { 18int main(int argc, char** argv) {
18 if (argc == 6) { 19 if (argc == 7) {
19 try { 20 try {
20 generator app(argv[1], argv[2], argv[3], argv[4], argv[5]); 21 generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]);
21 22
22 try { 23 try {
23 app.run(); 24 app.run();