From 17778ac3ab8598eb3d43f562a092b9aa7c0a1a42 Mon Sep 17 00:00:00 2001 From: Star Rauchenberger Date: Sat, 2 Dec 2023 17:10:48 -0500 Subject: Filter out profane words --- generator/generator.cpp | 27 +++++++++++++++++++++++---- generator/generator.h | 4 +++- generator/main.cpp | 7 ++++--- 3 files changed, 30 insertions(+), 8 deletions(-) (limited to 'generator') diff --git a/generator/generator.cpp b/generator/generator.cpp index 7ab69b5..0309482 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include constexpr int MIN_FREQUENCY = 2000000; @@ -49,11 +51,12 @@ std::list readFile(std::string path, bool uniq = false) { generator::generator(std::string agidPath, std::string wordNetPath, std::string cmudictPath, std::string wordfreqPath, - std::string outputPath) + std::string datadirPath, std::string outputPath) : agidPath_(agidPath), wordNetPath_(wordNetPath), cmudictPath_(cmudictPath), wordfreqPath_(wordfreqPath), + datadirPath_(datadirPath), outputPath_(outputPath) { // Ensure AGID infl.txt exists if (!std::ifstream(agidPath_)) { @@ -102,6 +105,14 @@ void generator::run() { } } + std::unordered_set profane; + { + std::list lines(readFile(datadirPath_ / "profane.txt")); + for (const std::string& line : lines) { + profane.insert(line); + } + } + { std::list lines(readFile(wordNetPath_ + "wn_s.pl")); hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); @@ -142,6 +153,11 @@ void generator::run() { continue; } + // Ignore any profane words. + if (profane.count(text)) { + continue; + } + // The WordNet data does contain duplicates, so we need to check that we // haven't already created this word. std::pair lookup(synset_id, wnum); @@ -175,7 +191,8 @@ void generator::run() { } if (!word_by_base_.count(infinitive) && - !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { + !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY && + !profane.count(infinitive))) { continue; } @@ -262,8 +279,10 @@ void generator::run() { // Compile the forms we have mapped. for (const std::list& infl_list : inflections) { for (const std::string& infl : infl_list) { - size_t form_id = LookupOrCreateForm(infl); - AddFormToWord(form_id, word_id); + if (!profane.count(infl)) { + size_t form_id = LookupOrCreateForm(infl); + AddFormToWord(form_id, word_id); + } } } } diff --git a/generator/generator.h b/generator/generator.h index a97b0b0..923fc17 100644 --- a/generator/generator.h +++ b/generator/generator.h @@ -1,6 +1,7 @@ #ifndef GENERATOR_H_D5C6A724 #define GENERATOR_H_D5C6A724 +#include #include #include #include @@ -22,7 +23,7 @@ class generator { generator(std::string agidPath, std::string wordNetPath, std::string cmudictPath, std::string wordfreqPath, - std::string outputPath); + std::string datadirPath, std::string outputPath); // Action @@ -54,6 +55,7 @@ class generator { std::string wordNetPath_; std::string cmudictPath_; std::string wordfreqPath_; + std::filesystem::path datadirPath_; // Output diff --git a/generator/main.cpp b/generator/main.cpp index c958421..94bf0a1 100644 --- a/generator/main.cpp +++ b/generator/main.cpp @@ -4,20 +4,21 @@ #include "generator.h" void printUsage() { - std::cout << "usage: generator agid wordnet cmudict wordfreq output" + std::cout << "usage: generator agid wordnet cmudict wordfreq datadir output" << std::endl; std::cout << "agid :: path to an AGID infl.txt file" << std::endl; std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; + std::cout << "datadir :: path to the Lingo Randomizer datadir" << std::endl; std::cout << "output :: datafile output path" << std::endl; } int main(int argc, char** argv) { - if (argc == 6) { + if (argc == 7) { try { - generator app(argv[1], argv[2], argv[3], argv[4], argv[5]); + generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]); try { app.run(); -- cgit 1.4.1