diff options
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- | generator/generator.cpp | 27 |
1 file changed, 23 insertions, 4 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 7ab69b5..0309482 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -4,6 +4,7 @@ | |||
4 | #include <hkutil/string.h> | 4 | #include <hkutil/string.h> |
5 | 5 | ||
6 | #include <algorithm> | 6 | #include <algorithm> |
7 | #include <filesystem> | ||
7 | #include <fstream> | 8 | #include <fstream> |
8 | #include <list> | 9 | #include <list> |
9 | #include <regex> | 10 | #include <regex> |
@@ -11,6 +12,7 @@ | |||
11 | #include <stdexcept> | 12 | #include <stdexcept> |
12 | #include <string> | 13 | #include <string> |
13 | #include <unordered_map> | 14 | #include <unordered_map> |
15 | #include <unordered_set> | ||
14 | #include <vector> | 16 | #include <vector> |
15 | 17 | ||
16 | constexpr int MIN_FREQUENCY = 2000000; | 18 | constexpr int MIN_FREQUENCY = 2000000; |
@@ -49,11 +51,12 @@ std::list<std::string> readFile(std::string path, bool uniq = false) { | |||
49 | 51 | ||
50 | generator::generator(std::string agidPath, std::string wordNetPath, | 52 | generator::generator(std::string agidPath, std::string wordNetPath, |
51 | std::string cmudictPath, std::string wordfreqPath, | 53 | std::string cmudictPath, std::string wordfreqPath, |
52 | std::string outputPath) | 54 | std::string datadirPath, std::string outputPath) |
53 | : agidPath_(agidPath), | 55 | : agidPath_(agidPath), |
54 | wordNetPath_(wordNetPath), | 56 | wordNetPath_(wordNetPath), |
55 | cmudictPath_(cmudictPath), | 57 | cmudictPath_(cmudictPath), |
56 | wordfreqPath_(wordfreqPath), | 58 | wordfreqPath_(wordfreqPath), |
59 | datadirPath_(datadirPath), | ||
57 | outputPath_(outputPath) { | 60 | outputPath_(outputPath) { |
58 | // Ensure AGID infl.txt exists | 61 | // Ensure AGID infl.txt exists |
59 | if (!std::ifstream(agidPath_)) { | 62 | if (!std::ifstream(agidPath_)) { |
@@ -102,6 +105,14 @@ void generator::run() { | |||
102 | } | 105 | } |
103 | } | 106 | } |
104 | 107 | ||
108 | std::unordered_set<std::string> profane; | ||
109 | { | ||
110 | std::list<std::string> lines(readFile(datadirPath_ / "profane.txt")); | ||
111 | for (const std::string& line : lines) { | ||
112 | profane.insert(line); | ||
113 | } | ||
114 | } | ||
115 | |||
105 | { | 116 | { |
106 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); | 117 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); |
107 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); | 118 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); |
@@ -142,6 +153,11 @@ void generator::run() { | |||
142 | continue; | 153 | continue; |
143 | } | 154 | } |
144 | 155 | ||
156 | // Ignore any profane words. | ||
157 | if (profane.count(text)) { | ||
158 | continue; | ||
159 | } | ||
160 | |||
145 | // The WordNet data does contain duplicates, so we need to check that we | 161 | // The WordNet data does contain duplicates, so we need to check that we |
146 | // haven't already created this word. | 162 | // haven't already created this word. |
147 | std::pair<int, int> lookup(synset_id, wnum); | 163 | std::pair<int, int> lookup(synset_id, wnum); |
@@ -175,7 +191,8 @@ void generator::run() { | |||
175 | } | 191 | } |
176 | 192 | ||
177 | if (!word_by_base_.count(infinitive) && | 193 | if (!word_by_base_.count(infinitive) && |
178 | !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { | 194 | !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY && |
195 | !profane.count(infinitive))) { | ||
179 | continue; | 196 | continue; |
180 | } | 197 | } |
181 | 198 | ||
@@ -262,8 +279,10 @@ void generator::run() { | |||
262 | // Compile the forms we have mapped. | 279 | // Compile the forms we have mapped. |
263 | for (const std::list<std::string>& infl_list : inflections) { | 280 | for (const std::list<std::string>& infl_list : inflections) { |
264 | for (const std::string& infl : infl_list) { | 281 | for (const std::string& infl : infl_list) { |
265 | size_t form_id = LookupOrCreateForm(infl); | 282 | if (!profane.count(infl)) { |
266 | AddFormToWord(form_id, word_id); | 283 | size_t form_id = LookupOrCreateForm(infl); |
284 | AddFormToWord(form_id, word_id); | ||
285 | } | ||
267 | } | 286 | } |
268 | } | 287 | } |
269 | } | 288 | } |