diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-14 00:07:28 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-14 00:07:28 -0500 |
commit | cee456c24755604e8503038bad5ce653d8065281 (patch) | |
tree | 05330791920bbf4a541b4a13e7da00dd0a5c79ac | |
parent | d62c340f1841c6fc46968643fab63841083aec6f (diff) | |
download | rawr-ebooks-cee456c24755604e8503038bad5ce653d8065281.tar.gz rawr-ebooks-cee456c24755604e8503038bad5ce653d8065281.tar.bz2 rawr-ebooks-cee456c24755604e8503038bad5ce653d8065281.zip |
Fixed problem wherein "$name$'s" was considered a form of "name's"
-rw-r--r-- | kgramstats.cpp | 14 |
1 files changed, 6 insertions, 8 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index f788cb1..a5402a3 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -61,12 +61,6 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
61 | freevar fv_emoticons {emoticons, "emoticons.txt"}; | 61 | freevar fv_emoticons {emoticons, "emoticons.txt"}; |
62 | std::map<std::string, std::string> canonical_form; | 62 | std::map<std::string, std::string> canonical_form; |
63 | 63 | ||
64 | // Ensure the old-style freevars exist | ||
65 | canonical_form["$name$"] = "$name$"; | ||
66 | words.emplace("$name$", std::string("$name$")); | ||
67 | canonical_form["$noun$"] = "$noun$"; | ||
68 | words.emplace("$noun$", std::string("$noun$")); | ||
69 | |||
70 | AspellConfig* spell_config = new_aspell_config(); | 64 | AspellConfig* spell_config = new_aspell_config(); |
71 | AspellCanHaveError* possible_err = new_aspell_speller(spell_config); | 65 | AspellCanHaveError* possible_err = new_aspell_speller(spell_config); |
72 | if (aspell_error_number(possible_err) != 0) | 66 | if (aspell_error_number(possible_err) != 0) |
@@ -166,9 +160,13 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
166 | // Basically any other word | 160 | // Basically any other word |
167 | if (canonical_form.count(canonical) == 0) | 161 | if (canonical_form.count(canonical) == 0) |
168 | { | 162 | { |
169 | if (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) | 163 | if ( |
170 | { | 164 | // Legacy freevars should be distinct from tokens containing similar words |
165 | (canonical.find_first_of("$name$") != std::string::npos) || (canonical.find_first_of("$noun$") != std::string::npos) | ||
171 | // Words with no letters will be mangled by the spell checker | 166 | // Words with no letters will be mangled by the spell checker |
167 | || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) | ||
168 | ) | ||
169 | { | ||
172 | canonical_form[canonical] = canonical; | 170 | canonical_form[canonical] = canonical; |
173 | words.emplace(canonical, canonical); | 171 | words.emplace(canonical, canonical); |
174 | } else { | 172 | } else { |