summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- generator/generator.cpp 27
1 files changed, 23 insertions, 4 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 7ab69b5..0309482 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -4,6 +4,7 @@
4#include <hkutil/string.h> 4#include <hkutil/string.h>
5 5
6#include <algorithm> 6#include <algorithm>
7#include <filesystem>
7#include <fstream> 8#include <fstream>
8#include <list> 9#include <list>
9#include <regex> 10#include <regex>
@@ -11,6 +12,7 @@
11#include <stdexcept> 12#include <stdexcept>
12#include <string> 13#include <string>
13#include <unordered_map> 14#include <unordered_map>
15#include <unordered_set>
14#include <vector> 16#include <vector>
15 17
16constexpr int MIN_FREQUENCY = 2000000; 18constexpr int MIN_FREQUENCY = 2000000;
@@ -49,11 +51,12 @@ std::list<std::string> readFile(std::string path, bool uniq = false) {
49 51
50generator::generator(std::string agidPath, std::string wordNetPath, 52generator::generator(std::string agidPath, std::string wordNetPath,
51 std::string cmudictPath, std::string wordfreqPath, 53 std::string cmudictPath, std::string wordfreqPath,
52 std::string outputPath) 54 std::string datadirPath, std::string outputPath)
53 : agidPath_(agidPath), 55 : agidPath_(agidPath),
54 wordNetPath_(wordNetPath), 56 wordNetPath_(wordNetPath),
55 cmudictPath_(cmudictPath), 57 cmudictPath_(cmudictPath),
56 wordfreqPath_(wordfreqPath), 58 wordfreqPath_(wordfreqPath),
59 datadirPath_(datadirPath),
57 outputPath_(outputPath) { 60 outputPath_(outputPath) {
58 // Ensure AGID infl.txt exists 61 // Ensure AGID infl.txt exists
59 if (!std::ifstream(agidPath_)) { 62 if (!std::ifstream(agidPath_)) {
@@ -102,6 +105,14 @@ void generator::run() {
102 } 105 }
103 } 106 }
104 107
108 std::unordered_set<std::string> profane;
109 {
110 std::list<std::string> lines(readFile(datadirPath_ / "profane.txt"));
111 for (const std::string& line : lines) {
112 profane.insert(line);
113 }
114 }
115
105 { 116 {
106 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); 117 std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
107 hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); 118 hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size());
@@ -142,6 +153,11 @@ void generator::run() {
142 continue; 153 continue;
143 } 154 }
144 155
156 // Ignore any profane words.
157 if (profane.count(text)) {
158 continue;
159 }
160
145 // The WordNet data does contain duplicates, so we need to check that we 161 // The WordNet data does contain duplicates, so we need to check that we
146 // haven't already created this word. 162 // haven't already created this word.
147 std::pair<int, int> lookup(synset_id, wnum); 163 std::pair<int, int> lookup(synset_id, wnum);
@@ -175,7 +191,8 @@ void generator::run() {
175 } 191 }
176 192
177 if (!word_by_base_.count(infinitive) && 193 if (!word_by_base_.count(infinitive) &&
178 !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { 194 !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY &&
195 !profane.count(infinitive))) {
179 continue; 196 continue;
180 } 197 }
181 198
@@ -262,8 +279,10 @@ void generator::run() {
262 // Compile the forms we have mapped. 279 // Compile the forms we have mapped.
263 for (const std::list<std::string>& infl_list : inflections) { 280 for (const std::list<std::string>& infl_list : inflections) {
264 for (const std::string& infl : infl_list) { 281 for (const std::string& infl : infl_list) {
265 size_t form_id = LookupOrCreateForm(infl); 282 if (!profane.count(infl)) {
266 AddFormToWord(form_id, word_id); 283 size_t form_id = LookupOrCreateForm(infl);
284 AddFormToWord(form_id, word_id);
285 }
267 } 286 }
268 } 287 }
269 } 288 }