diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-03 11:51:15 -0500 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-03 11:51:15 -0500 |
| commit | fd3c94ac13bbd4876766bfc83559d3404c31b963 (patch) | |
| tree | 25ad9b074a04963097d0181a1036727e21040748 /kgramstats.cpp | |
| parent | 4c7c1ca17371d8ff1d709e1b263d2034afa624a1 (diff) | |
| download | rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.tar.gz rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.tar.bz2 rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.zip | |
Token generator now uses aspell to link different spellings of a word
This is the grand scheme for the multi-formed word design.
Diffstat (limited to 'kgramstats.cpp')
| -rw-r--r-- | kgramstats.cpp | 59 |
1 file changed, 56 insertions, 3 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 7d1d2da..7d28c86 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -42,6 +42,7 @@ | |||
| 42 | #include "freevars.h" | 42 | #include "freevars.h" |
| 43 | #include <fstream> | 43 | #include <fstream> |
| 44 | #include "prefix_search.h" | 44 | #include "prefix_search.h" |
| 45 | #include <aspell.h> | ||
| 45 | 46 | ||
| 46 | query wildcardQuery {querytype::sentence}; | 47 | query wildcardQuery {querytype::sentence}; |
| 47 | word blank_word {""}; | 48 | word blank_word {""}; |
| @@ -57,6 +58,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 57 | int end = 0; | 58 | int end = 0; |
| 58 | std::set<std::string> thashtags; | 59 | std::set<std::string> thashtags; |
| 59 | freevar fv_emoticons {emoticons, "emoticons.txt"}; | 60 | freevar fv_emoticons {emoticons, "emoticons.txt"}; |
| 61 | std::map<std::string, std::string> canonical_form; | ||
| 62 | |||
| 63 | AspellConfig* spell_config = new_aspell_config(); | ||
| 64 | AspellCanHaveError* possible_err = new_aspell_speller(spell_config); | ||
| 65 | if (aspell_error_number(possible_err) != 0) | ||
| 66 | { | ||
| 67 | std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl; | ||
| 68 | exit(1); | ||
| 69 | } | ||
| 70 | |||
| 71 | AspellSpeller* spell_checker = to_aspell_speller(possible_err); | ||
| 60 | 72 | ||
| 61 | std::cout << "Reading emojis..." << std::endl; | 73 | std::cout << "Reading emojis..." << std::endl; |
| 62 | prefix_search emojis; | 74 | prefix_search emojis; |
| @@ -145,12 +157,47 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 145 | } | 157 | } |
| 146 | 158 | ||
| 147 | // Basically any other word | 159 | // Basically any other word |
| 148 | if (words.count(canonical) == 0) | 160 | if (canonical_form.count(canonical) == 0) |
| 149 | { | 161 | { |
| 150 | words.emplace(canonical, canonical); | 162 | if (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) |
| 163 | { | ||
| 164 | // Words with no letters will be mangled by the spell checker | ||
| 165 | canonical_form[canonical] = canonical; | ||
| 166 | words.emplace(canonical, canonical); | ||
| 167 | } else { | ||
| 168 | int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size()); | ||
| 169 | if (correct) | ||
| 170 | { | ||
| 171 | words.emplace(canonical, canonical); | ||
| 172 | canonical_form[canonical] = canonical; | ||
| 173 | } else { | ||
| 174 | const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size()); | ||
| 175 | AspellStringEnumeration* elements = aspell_word_list_elements(suggestions); | ||
| 176 | const char* replacement = aspell_string_enumeration_next(elements); | ||
| 177 | if (replacement != NULL) | ||
| 178 | { | ||
| 179 | aspell_speller_store_replacement(spell_checker, canonical.c_str(), canonical.size(), replacement, strlen(replacement)); | ||
| 180 | |||
| 181 | std::string sugrep(replacement); | ||
| 182 | canonical_form[canonical] = sugrep; | ||
| 183 | |||
| 184 | if (words.count(sugrep) == 0) | ||
| 185 | { | ||
| 186 | words.emplace(sugrep, sugrep); | ||
| 187 | } | ||
| 188 | } else { | ||
| 189 | aspell_speller_add_to_session(spell_checker, canonical.c_str(), canonical.size()); | ||
| 190 | |||
| 191 | words.emplace(canonical, canonical); | ||
| 192 | canonical_form[canonical] = canonical; | ||
| 193 | } | ||
| 194 | |||
| 195 | delete_aspell_string_enumeration(elements); | ||
| 196 | } | ||
| 197 | } | ||
| 151 | } | 198 | } |
| 152 | 199 | ||
| 153 | word& tw = words.at(canonical); | 200 | word& tw = words.at(canonical_form.at(canonical)); |
| 154 | tw.forms.add(canonical); | 201 | tw.forms.add(canonical); |
| 155 | 202 | ||
| 156 | return tw; | 203 | return tw; |
| @@ -237,6 +284,12 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 237 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 284 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
| 238 | } | 285 | } |
| 239 | 286 | ||
| 287 | delete_aspell_speller(spell_checker); | ||
| 288 | delete_aspell_config(spell_config); | ||
| 289 | |||
| 290 | std::cout << canonical_form.size() << " distinct forms" << std::endl; | ||
| 291 | std::cout << words.size() << " distinct words" << std::endl; | ||
| 292 | |||
| 240 | // Time to condense the distribution stuff for the words | 293 | // Time to condense the distribution stuff for the words |
| 241 | std::cout << "Compiling token histograms..." << std::endl; | 294 | std::cout << "Compiling token histograms..." << std::endl; |
| 242 | for (auto& it : words) | 295 | for (auto& it : words) |
