diff options
| -rw-r--r-- | kgramstats.cpp | 14 | 
1 files changed, 6 insertions, 8 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index f788cb1..a5402a3 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -61,12 +61,6 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 61 | freevar fv_emoticons {emoticons, "emoticons.txt"}; | 61 | freevar fv_emoticons {emoticons, "emoticons.txt"}; | 
| 62 | std::map<std::string, std::string> canonical_form; | 62 | std::map<std::string, std::string> canonical_form; | 
| 63 | 63 | ||
| 64 | // Ensure the old-style freevars exist | ||
| 65 | canonical_form["$name$"] = "$name$"; | ||
| 66 | words.emplace("$name$", std::string("$name$")); | ||
| 67 | canonical_form["$noun$"] = "$noun$"; | ||
| 68 | words.emplace("$noun$", std::string("$noun$")); | ||
| 69 | |||
| 70 | AspellConfig* spell_config = new_aspell_config(); | 64 | AspellConfig* spell_config = new_aspell_config(); | 
| 71 | AspellCanHaveError* possible_err = new_aspell_speller(spell_config); | 65 | AspellCanHaveError* possible_err = new_aspell_speller(spell_config); | 
| 72 | if (aspell_error_number(possible_err) != 0) | 66 | if (aspell_error_number(possible_err) != 0) | 
| @@ -166,9 +160,13 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 166 | // Basically any other word | 160 | // Basically any other word | 
| 167 | if (canonical_form.count(canonical) == 0) | 161 | if (canonical_form.count(canonical) == 0) | 
| 168 | { | 162 | { | 
| 169 | if (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) | 163 | if ( | 
| 170 | { | 164 | // Legacy freevars should be distinct from tokens containing similar words | 
| 165 | (canonical.find_first_of("$name$") != std::string::npos) || (canonical.find_first_of("$noun$") != std::string::npos) | ||
| 171 | // Words with no letters will be mangled by the spell checker | 166 | // Words with no letters will be mangled by the spell checker | 
| 167 | || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) | ||
| 168 | ) | ||
| 169 | { | ||
| 172 | canonical_form[canonical] = canonical; | 170 | canonical_form[canonical] = canonical; | 
| 173 | words.emplace(canonical, canonical); | 171 | words.emplace(canonical, canonical); | 
| 174 | } else { | 172 | } else { | 
