diff options
author | Star Rauchenberger <fefferburbia@gmail.com> | 2023-12-02 17:10:48 -0500 |
---|---|---|
committer | Star Rauchenberger <fefferburbia@gmail.com> | 2023-12-02 17:10:48 -0500 |
commit | 17778ac3ab8598eb3d43f562a092b9aa7c0a1a42 (patch) | |
tree | 64604b3f8ef46e3f439229c80837e60768933c06 /generator | |
parent | 90fbd47ca02f1a723134302ca978a7f9ef0eac04 (diff) | |
download | lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.tar.gz lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.tar.bz2 lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.zip |
Filter out profane words
Diffstat (limited to 'generator')
-rw-r--r-- | generator/generator.cpp | 27 | ||||
-rw-r--r-- | generator/generator.h | 4 | ||||
-rw-r--r-- | generator/main.cpp | 7 |
3 files changed, 30 insertions, 8 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 7ab69b5..0309482 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <hkutil/string.h> | 4 | #include <hkutil/string.h> |
5 | 5 | ||
6 | #include <algorithm> | 6 | #include <algorithm> |
7 | #include <filesystem> | ||
7 | #include <fstream> | 8 | #include <fstream> |
8 | #include <list> | 9 | #include <list> |
9 | #include <regex> | 10 | #include <regex> |
@@ -11,6 +12,7 @@ | |||
11 | #include <stdexcept> | 12 | #include <stdexcept> |
12 | #include <string> | 13 | #include <string> |
13 | #include <unordered_map> | 14 | #include <unordered_map> |
15 | #include <unordered_set> | ||
14 | #include <vector> | 16 | #include <vector> |
15 | 17 | ||
16 | constexpr int MIN_FREQUENCY = 2000000; | 18 | constexpr int MIN_FREQUENCY = 2000000; |
@@ -49,11 +51,12 @@ std::list<std::string> readFile(std::string path, bool uniq = false) { | |||
49 | 51 | ||
50 | generator::generator(std::string agidPath, std::string wordNetPath, | 52 | generator::generator(std::string agidPath, std::string wordNetPath, |
51 | std::string cmudictPath, std::string wordfreqPath, | 53 | std::string cmudictPath, std::string wordfreqPath, |
52 | std::string outputPath) | 54 | std::string datadirPath, std::string outputPath) |
53 | : agidPath_(agidPath), | 55 | : agidPath_(agidPath), |
54 | wordNetPath_(wordNetPath), | 56 | wordNetPath_(wordNetPath), |
55 | cmudictPath_(cmudictPath), | 57 | cmudictPath_(cmudictPath), |
56 | wordfreqPath_(wordfreqPath), | 58 | wordfreqPath_(wordfreqPath), |
59 | datadirPath_(datadirPath), | ||
57 | outputPath_(outputPath) { | 60 | outputPath_(outputPath) { |
58 | // Ensure AGID infl.txt exists | 61 | // Ensure AGID infl.txt exists |
59 | if (!std::ifstream(agidPath_)) { | 62 | if (!std::ifstream(agidPath_)) { |
@@ -102,6 +105,14 @@ void generator::run() { | |||
102 | } | 105 | } |
103 | } | 106 | } |
104 | 107 | ||
108 | std::unordered_set<std::string> profane; | ||
109 | { | ||
110 | std::list<std::string> lines(readFile(datadirPath_ / "profane.txt")); | ||
111 | for (const std::string& line : lines) { | ||
112 | profane.insert(line); | ||
113 | } | ||
114 | } | ||
115 | |||
105 | { | 116 | { |
106 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); | 117 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); |
107 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); | 118 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); |
@@ -142,6 +153,11 @@ void generator::run() { | |||
142 | continue; | 153 | continue; |
143 | } | 154 | } |
144 | 155 | ||
156 | // Ignore any profane words. | ||
157 | if (profane.count(text)) { | ||
158 | continue; | ||
159 | } | ||
160 | |||
145 | // The WordNet data does contain duplicates, so we need to check that we | 161 | // The WordNet data does contain duplicates, so we need to check that we |
146 | // haven't already created this word. | 162 | // haven't already created this word. |
147 | std::pair<int, int> lookup(synset_id, wnum); | 163 | std::pair<int, int> lookup(synset_id, wnum); |
@@ -175,7 +191,8 @@ void generator::run() { | |||
175 | } | 191 | } |
176 | 192 | ||
177 | if (!word_by_base_.count(infinitive) && | 193 | if (!word_by_base_.count(infinitive) && |
178 | !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { | 194 | !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY && |
195 | !profane.count(infinitive))) { | ||
179 | continue; | 196 | continue; |
180 | } | 197 | } |
181 | 198 | ||
@@ -262,8 +279,10 @@ void generator::run() { | |||
262 | // Compile the forms we have mapped. | 279 | // Compile the forms we have mapped. |
263 | for (const std::list<std::string>& infl_list : inflections) { | 280 | for (const std::list<std::string>& infl_list : inflections) { |
264 | for (const std::string& infl : infl_list) { | 281 | for (const std::string& infl : infl_list) { |
265 | size_t form_id = LookupOrCreateForm(infl); | 282 | if (!profane.count(infl)) { |
266 | AddFormToWord(form_id, word_id); | 283 | size_t form_id = LookupOrCreateForm(infl); |
284 | AddFormToWord(form_id, word_id); | ||
285 | } | ||
267 | } | 286 | } |
268 | } | 287 | } |
269 | } | 288 | } |
diff --git a/generator/generator.h b/generator/generator.h index a97b0b0..923fc17 100644 --- a/generator/generator.h +++ b/generator/generator.h | |||
@@ -1,6 +1,7 @@ | |||
1 | #ifndef GENERATOR_H_D5C6A724 | 1 | #ifndef GENERATOR_H_D5C6A724 |
2 | #define GENERATOR_H_D5C6A724 | 2 | #define GENERATOR_H_D5C6A724 |
3 | 3 | ||
4 | #include <filesystem> | ||
4 | #include <optional> | 5 | #include <optional> |
5 | #include <set> | 6 | #include <set> |
6 | #include <string> | 7 | #include <string> |
@@ -22,7 +23,7 @@ class generator { | |||
22 | 23 | ||
23 | generator(std::string agidPath, std::string wordNetPath, | 24 | generator(std::string agidPath, std::string wordNetPath, |
24 | std::string cmudictPath, std::string wordfreqPath, | 25 | std::string cmudictPath, std::string wordfreqPath, |
25 | std::string outputPath); | 26 | std::string datadirPath, std::string outputPath); |
26 | 27 | ||
27 | // Action | 28 | // Action |
28 | 29 | ||
@@ -54,6 +55,7 @@ class generator { | |||
54 | std::string wordNetPath_; | 55 | std::string wordNetPath_; |
55 | std::string cmudictPath_; | 56 | std::string cmudictPath_; |
56 | std::string wordfreqPath_; | 57 | std::string wordfreqPath_; |
58 | std::filesystem::path datadirPath_; | ||
57 | 59 | ||
58 | // Output | 60 | // Output |
59 | 61 | ||
diff --git a/generator/main.cpp b/generator/main.cpp index c958421..94bf0a1 100644 --- a/generator/main.cpp +++ b/generator/main.cpp | |||
@@ -4,20 +4,21 @@ | |||
4 | #include "generator.h" | 4 | #include "generator.h" |
5 | 5 | ||
6 | void printUsage() { | 6 | void printUsage() { |
7 | std::cout << "usage: generator agid wordnet cmudict wordfreq output" | 7 | std::cout << "usage: generator agid wordnet cmudict wordfreq datadir output" |
8 | << std::endl; | 8 | << std::endl; |
9 | std::cout << "agid :: path to an AGID infl.txt file" << std::endl; | 9 | std::cout << "agid :: path to an AGID infl.txt file" << std::endl; |
10 | std::cout << "wordnet :: path to a WordNet prolog data directory" | 10 | std::cout << "wordnet :: path to a WordNet prolog data directory" |
11 | << std::endl; | 11 | << std::endl; |
12 | std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; | 12 | std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; |
13 | std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; | 13 | std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; |
14 | std::cout << "datadir :: path to the Lingo Randomizer datadir" << std::endl; | ||
14 | std::cout << "output :: datafile output path" << std::endl; | 15 | std::cout << "output :: datafile output path" << std::endl; |
15 | } | 16 | } |
16 | 17 | ||
17 | int main(int argc, char** argv) { | 18 | int main(int argc, char** argv) { |
18 | if (argc == 6) { | 19 | if (argc == 7) { |
19 | try { | 20 | try { |
20 | generator app(argv[1], argv[2], argv[3], argv[4], argv[5]); | 21 | generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); |
21 | 22 | ||
22 | try { | 23 | try { |
23 | app.run(); | 24 | app.run(); |