From 17778ac3ab8598eb3d43f562a092b9aa7c0a1a42 Mon Sep 17 00:00:00 2001 From: Star Rauchenberger Date: Sat, 2 Dec 2023 17:10:48 -0500 Subject: Filter out profane words --- generator/generator.cpp | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) (limited to 'generator/generator.cpp') diff --git a/generator/generator.cpp b/generator/generator.cpp index 7ab69b5..0309482 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -11,6 +12,7 @@ #include #include #include +#include #include constexpr int MIN_FREQUENCY = 2000000; @@ -49,11 +51,12 @@ std::list<std::string> readFile(std::string path, bool uniq = false) { generator::generator(std::string agidPath, std::string wordNetPath, std::string cmudictPath, std::string wordfreqPath, - std::string outputPath) + std::string datadirPath, std::string outputPath) : agidPath_(agidPath), wordNetPath_(wordNetPath), cmudictPath_(cmudictPath), wordfreqPath_(wordfreqPath), + datadirPath_(datadirPath), outputPath_(outputPath) { // Ensure AGID infl.txt exists if (!std::ifstream(agidPath_)) { @@ -102,6 +105,14 @@ void generator::run() { } } + std::unordered_set<std::string> profane; + { + std::list<std::string> lines(readFile(datadirPath_ / "profane.txt")); + for (const std::string& line : lines) { + profane.insert(line); + } + } + { std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); @@ -142,6 +153,11 @@ void generator::run() { continue; } + // Ignore any profane words. + if (profane.count(text)) { + continue; + } + // The WordNet data does contain duplicates, so we need to check that we // haven't already created this word. 
std::pair lookup(synset_id, wnum); @@ -175,7 +191,8 @@ void generator::run() { } if (!word_by_base_.count(infinitive) && - !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { + !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY && + !profane.count(infinitive))) { continue; } @@ -262,8 +279,10 @@ void generator::run() { // Compile the forms we have mapped. for (const std::list<std::string>& infl_list : inflections) { for (const std::string& infl : infl_list) { - size_t form_id = LookupOrCreateForm(infl); - AddFormToWord(form_id, word_id); + if (!profane.count(infl)) { + size_t form_id = LookupOrCreateForm(infl); + AddFormToWord(form_id, word_id); + } } } } -- cgit 1.4.1