about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-14 00:07:28 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-14 00:07:28 -0500
commit cee456c24755604e8503038bad5ce653d8065281 (patch)
tree 05330791920bbf4a541b4a13e7da00dd0a5c79ac
parent d62c340f1841c6fc46968643fab63841083aec6f (diff)
download rawr-ebooks-cee456c24755604e8503038bad5ce653d8065281.tar.gz
rawr-ebooks-cee456c24755604e8503038bad5ce653d8065281.tar.bz2
rawr-ebooks-cee456c24755604e8503038bad5ce653d8065281.zip
Fixed problem wherein "$name$'s" was considered a form of "name's"
-rw-r--r-- kgramstats.cpp 14
1 files changed, 6 insertions, 8 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index f788cb1..a5402a3 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -61,12 +61,6 @@ kgramstats::kgramstats(std::string corpus, int maxK)
61 freevar fv_emoticons {emoticons, "emoticons.txt"}; 61 freevar fv_emoticons {emoticons, "emoticons.txt"};
62 std::map<std::string, std::string> canonical_form; 62 std::map<std::string, std::string> canonical_form;
63 63
64 // Ensure the old-style freevars exist
65 canonical_form["$name$"] = "$name$";
66 words.emplace("$name$", std::string("$name$"));
67 canonical_form["$noun$"] = "$noun$";
68 words.emplace("$noun$", std::string("$noun$"));
69
70 AspellConfig* spell_config = new_aspell_config(); 64 AspellConfig* spell_config = new_aspell_config();
71 AspellCanHaveError* possible_err = new_aspell_speller(spell_config); 65 AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
72 if (aspell_error_number(possible_err) != 0) 66 if (aspell_error_number(possible_err) != 0)
@@ -166,9 +160,13 @@ kgramstats::kgramstats(std::string corpus, int maxK)
166 // Basically any other word 160 // Basically any other word
167 if (canonical_form.count(canonical) == 0) 161 if (canonical_form.count(canonical) == 0)
168 { 162 {
169 if (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) 163 if (
170 { 164 // Legacy freevars should be distinct from tokens containing similar words
165 (canonical.find_first_of("$name$") != std::string::npos) || (canonical.find_first_of("$noun$") != std::string::npos)
171 // Words with no letters will be mangled by the spell checker 166 // Words with no letters will be mangled by the spell checker
167 || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
168 )
169 {
172 canonical_form[canonical] = canonical; 170 canonical_form[canonical] = canonical;
173 words.emplace(canonical, canonical); 171 words.emplace(canonical, canonical);
174 } else { 172 } else {