diff options
Diffstat (limited to 'generator/generator.cpp')
| -rw-r--r-- | generator/generator.cpp | 27 | 
1 file changed, 23 insertions, 4 deletions
| diff --git a/generator/generator.cpp b/generator/generator.cpp index 7ab69b5..0309482 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
| @@ -4,6 +4,7 @@ | |||
| 4 | #include <hkutil/string.h> | 4 | #include <hkutil/string.h> | 
| 5 | 5 | ||
| 6 | #include <algorithm> | 6 | #include <algorithm> | 
| 7 | #include <filesystem> | ||
| 7 | #include <fstream> | 8 | #include <fstream> | 
| 8 | #include <list> | 9 | #include <list> | 
| 9 | #include <regex> | 10 | #include <regex> | 
| @@ -11,6 +12,7 @@ | |||
| 11 | #include <stdexcept> | 12 | #include <stdexcept> | 
| 12 | #include <string> | 13 | #include <string> | 
| 13 | #include <unordered_map> | 14 | #include <unordered_map> | 
| 15 | #include <unordered_set> | ||
| 14 | #include <vector> | 16 | #include <vector> | 
| 15 | 17 | ||
| 16 | constexpr int MIN_FREQUENCY = 2000000; | 18 | constexpr int MIN_FREQUENCY = 2000000; | 
| @@ -49,11 +51,12 @@ std::list<std::string> readFile(std::string path, bool uniq = false) { | |||
| 49 | 51 | ||
| 50 | generator::generator(std::string agidPath, std::string wordNetPath, | 52 | generator::generator(std::string agidPath, std::string wordNetPath, | 
| 51 | std::string cmudictPath, std::string wordfreqPath, | 53 | std::string cmudictPath, std::string wordfreqPath, | 
| 52 | std::string outputPath) | 54 | std::string datadirPath, std::string outputPath) | 
| 53 | : agidPath_(agidPath), | 55 | : agidPath_(agidPath), | 
| 54 | wordNetPath_(wordNetPath), | 56 | wordNetPath_(wordNetPath), | 
| 55 | cmudictPath_(cmudictPath), | 57 | cmudictPath_(cmudictPath), | 
| 56 | wordfreqPath_(wordfreqPath), | 58 | wordfreqPath_(wordfreqPath), | 
| 59 | datadirPath_(datadirPath), | ||
| 57 | outputPath_(outputPath) { | 60 | outputPath_(outputPath) { | 
| 58 | // Ensure AGID infl.txt exists | 61 | // Ensure AGID infl.txt exists | 
| 59 | if (!std::ifstream(agidPath_)) { | 62 | if (!std::ifstream(agidPath_)) { | 
| @@ -102,6 +105,14 @@ void generator::run() { | |||
| 102 | } | 105 | } | 
| 103 | } | 106 | } | 
| 104 | 107 | ||
| 108 | std::unordered_set<std::string> profane; | ||
| 109 | { | ||
| 110 | std::list<std::string> lines(readFile(datadirPath_ / "profane.txt")); | ||
| 111 | for (const std::string& line : lines) { | ||
| 112 | profane.insert(line); | ||
| 113 | } | ||
| 114 | } | ||
| 115 | |||
| 105 | { | 116 | { | 
| 106 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); | 117 | std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl")); | 
| 107 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); | 118 | hatkirby::progress ppgs("Reading synsets from WordNet...", lines.size()); | 
| @@ -142,6 +153,11 @@ void generator::run() { | |||
| 142 | continue; | 153 | continue; | 
| 143 | } | 154 | } | 
| 144 | 155 | ||
| 156 | // Ignore any profane words. | ||
| 157 | if (profane.count(text)) { | ||
| 158 | continue; | ||
| 159 | } | ||
| 160 | |||
| 145 | // The WordNet data does contain duplicates, so we need to check that we | 161 | // The WordNet data does contain duplicates, so we need to check that we | 
| 146 | // haven't already created this word. | 162 | // haven't already created this word. | 
| 147 | std::pair<int, int> lookup(synset_id, wnum); | 163 | std::pair<int, int> lookup(synset_id, wnum); | 
| @@ -175,7 +191,8 @@ void generator::run() { | |||
| 175 | } | 191 | } | 
| 176 | 192 | ||
| 177 | if (!word_by_base_.count(infinitive) && | 193 | if (!word_by_base_.count(infinitive) && | 
| 178 | !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY)) { | 194 | !(type == 'V' && word_frequencies[infinitive] >= MIN_FREQUENCY && | 
| 195 | !profane.count(infinitive))) { | ||
| 179 | continue; | 196 | continue; | 
| 180 | } | 197 | } | 
| 181 | 198 | ||
| @@ -262,8 +279,10 @@ void generator::run() { | |||
| 262 | // Compile the forms we have mapped. | 279 | // Compile the forms we have mapped. | 
| 263 | for (const std::list<std::string>& infl_list : inflections) { | 280 | for (const std::list<std::string>& infl_list : inflections) { | 
| 264 | for (const std::string& infl : infl_list) { | 281 | for (const std::string& infl : infl_list) { | 
| 265 | size_t form_id = LookupOrCreateForm(infl); | 282 | if (!profane.count(infl)) { | 
| 266 | AddFormToWord(form_id, word_id); | 283 | size_t form_id = LookupOrCreateForm(infl); | 
| 284 | AddFormToWord(form_id, word_id); | ||
| 285 | } | ||
| 267 | } | 286 | } | 
| 268 | } | 287 | } | 
| 269 | } | 288 | } | 
