From cee456c24755604e8503038bad5ce653d8065281 Mon Sep 17 00:00:00 2001
From: Kelly Rauchenberger
Date: Sun, 14 Feb 2016 00:07:28 -0500
Subject: Fixed problem wherein "$name$'s" was considered a form of "name's"

---
 kgramstats.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'kgramstats.cpp')

diff --git a/kgramstats.cpp b/kgramstats.cpp
index f788cb1..a5402a3 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -61,12 +61,6 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   freevar fv_emoticons {emoticons, "emoticons.txt"};
   std::map canonical_form;
 
-  // Ensure the old-style freevars exist
-  canonical_form["$name$"] = "$name$";
-  words.emplace("$name$", std::string("$name$"));
-  canonical_form["$noun$"] = "$noun$";
-  words.emplace("$noun$", std::string("$noun$"));
-
   AspellConfig* spell_config = new_aspell_config();
   AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
   if (aspell_error_number(possible_err) != 0)
@@ -166,9 +160,13 @@ kgramstats::kgramstats(std::string corpus, int maxK)
         // Basically any other word
         if (canonical_form.count(canonical) == 0)
         {
-          if (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
-          {
+          if (
+            // Legacy freevars should be distinct from tokens containing similar words
+            (canonical.find_first_of("$name$") != std::string::npos) || (canonical.find_first_of("$noun$") != std::string::npos)
             // Words with no letters will be mangled by the spell checker
+            || (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
+          )
+          {
             canonical_form[canonical] = canonical;
             words.emplace(canonical, canonical);
           } else {
-- 
cgit 1.4.1