Filter out profane words

author: Star Rauchenberger <fefferburbia@gmail.com> 2023-12-02 17:10:48 -0500
committer: Star Rauchenberger <fefferburbia@gmail.com> 2023-12-02 17:10:48 -0500
commit: 17778ac3ab8598eb3d43f562a092b9aa7c0a1a42 (patch)
tree: 64604b3f8ef46e3f439229c80837e60768933c06 /generator
parent: 90fbd47ca02f1a723134302ca978a7f9ef0eac04 (diff)
download: lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.tar.gz lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.tar.bz2 lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.zip
3 files changed, 30 insertions, 8 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 7ab69b5..0309482 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp

@@ -4,6 +4,7 @@
 #include <hkutil/string.h>
 #include <algorithm>
+#include <filesystem>
 #include <fstream>
 #include <list>
 #include <regex>
@@ -11,6 +12,7 @@
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 constexpr int MIN_FREQUENCY = 2000000;
@@ -49,11 +51,12 @@ std::list<std::string> readFile(std::string path, bool uniq = false) {
 generator::generator(std::string agidPath, std::string wordNetPath,
                     std::string cmudictPath, std::string wordfreqPath,
-                     std::string outputPath)
+                     std::string datadirPath, std::string outputPath)
    : agidPath_(agidPath),
      wordNetPath_(wordNetPath),
      cmudictPath_(cmudictPath),
      wordfreqPath_(wordfreqPath),
+      datadirPath_(datadirPath),
      outputPath_(outputPath) {
  // Ensure AGID infl.txt exists
  if (!std::ifstream(agidPath_)) {
@@ -102,6 +105,14 @@ void generator::run() {
    }
  }
+  std::unordered_set<std::string> profane;
+  {
+    std::list<std::string> lines(readFile(datadirPath_ / "profane.txt"));
+    for (const std::string& line : lines) {
+      profane.insert(line);
+    }
+  }
  {
    std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
    hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size());
@@ -142,6 +153,11 @@ void generator::run() {
        continue;
      }
+      // Ignore any profane words.
+      if (profane.count(text)) {
+        continue;
+      }
      // The WordNet data does contain duplicates, so we need to check that we
      // haven't already created this word.
      std::pair<int, int> lookup(synset_id, wnum);
@@ -175,7 +191,8 @@ void generator::run() {
      }
      if (!word_by_base_.count(infinitive) &&
-          !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) {
+          !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY &&
+            !profane.count(infinitive))) {
        continue;
      }
@@ -262,8 +279,10 @@ void generator::run() {
      // Compile the forms we have mapped.
      for (const std::list<std::string>& infl_list : inflections) {
        for (const std::string& infl : infl_list) {
-          size_t form_id = LookupOrCreateForm(infl);
+          if (!profane.count(infl)) {
-          AddFormToWord(form_id, word_id);
+            size_t form_id = LookupOrCreateForm(infl);
+            AddFormToWord(form_id, word_id);
+          }
        }
      }
    }
diff --git a/generator/generator.h b/generator/generator.h
index a97b0b0..923fc17 100644
--- a/generator/generator.h
+++ b/generator/generator.h

@@ -1,6 +1,7 @@
 #ifndef GENERATOR_H_D5C6A724
 #define GENERATOR_H_D5C6A724
+#include <filesystem>
 #include <optional>
 #include <set>
 #include <string>
@@ -22,7 +23,7 @@ class generator {
  generator(std::string agidPath, std::string wordNetPath,
            std::string cmudictPath, std::string wordfreqPath,
-            std::string outputPath);
+            std::string datadirPath, std::string outputPath);
  // Action
@@ -54,6 +55,7 @@ class generator {
  std::string wordNetPath_;
  std::string cmudictPath_;
  std::string wordfreqPath_;
+  std::filesystem::path datadirPath_;
  // Output
diff --git a/generator/main.cpp b/generator/main.cpp
index c958421..94bf0a1 100644
--- a/generator/main.cpp
+++ b/generator/main.cpp

@@ -4,20 +4,21 @@
 #include "generator.h"
 void printUsage() {
-  std::cout << "usage: generator agid wordnet cmudict wordfreq output"
+  std::cout << "usage: generator agid wordnet cmudict wordfreq datadir output"
            << std::endl;
  std::cout << "agid     :: path to an AGID infl.txt file" << std::endl;
  std::cout << "wordnet  :: path to a WordNet prolog data directory"
            << std::endl;
  std::cout << "cmudict  :: path to a CMUDICT pronunciation file" << std::endl;
  std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl;
+  std::cout << "datadir  :: path to the Lingo Randomizer datadir" << std::endl;
  std::cout << "output   :: datafile output path" << std::endl;
 }
 int main(int argc, char** argv) {
-  if (argc == 6) {
+  if (argc == 7) {
    try {
-      generator app(argv[1], argv[2], argv[3], argv[4], argv[5]);
+      generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]);
      try {
        app.run();
author	Star Rauchenberger <fefferburbia@gmail.com>	2023-12-02 17:10:48 -0500
committer	Star Rauchenberger <fefferburbia@gmail.com>	2023-12-02 17:10:48 -0500
commit	17778ac3ab8598eb3d43f562a092b9aa7c0a1a42 (patch)
tree	64604b3f8ef46e3f439229c80837e60768933c06 /generator
parent	90fbd47ca02f1a723134302ca978a7f9ef0eac04 (diff)
download	lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.tar.gz lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.tar.bz2 lingo-randomizer-17778ac3ab8598eb3d43f562a092b9aa7c0a1a42.zip