diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-03 11:51:15 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-03 11:51:15 -0500 |
commit | fd3c94ac13bbd4876766bfc83559d3404c31b963 (patch) | |
tree | 25ad9b074a04963097d0181a1036727e21040748 | |
parent | 4c7c1ca17371d8ff1d709e1b263d2034afa624a1 (diff) | |
download | rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.tar.gz rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.tar.bz2 rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.zip |
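Token generator now uses aspell to link different spellings of a word.
This is the grand scheme for the multi-formed word design: each token that fails the spell check is mapped to aspell's first suggestion as its canonical word and is recorded as one of that word's forms.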
-rw-r--r-- | CMakeLists.txt | 3 | ||||
-rw-r--r-- | kgramstats.cpp | 59 |
2 files changed, 58 insertions, 4 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 74b6d51..c0a0d69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt | |||
@@ -11,7 +11,7 @@ if (YamlCpp_FOUND AND Curl_FOUND) | |||
11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) | 11 | add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) |
12 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) | 12 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) |
13 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) | 13 | set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) |
14 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${Curl_LIBRARIES}) | 14 | target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${Curl_LIBRARIES} aspell) |
15 | else (YamlCpp_FOUND AND Curl_FOUND) | 15 | else (YamlCpp_FOUND AND Curl_FOUND) |
16 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") | 16 | message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") |
17 | endif (YamlCpp_FOUND AND Curl_FOUND) | 17 | endif (YamlCpp_FOUND AND Curl_FOUND) |
@@ -19,3 +19,4 @@ endif (YamlCpp_FOUND AND Curl_FOUND) | |||
19 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) | 19 | add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) |
20 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) | 20 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) |
21 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) | 21 | set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) |
22 | target_link_libraries(rawr-gen aspell) | ||
diff --git a/kgramstats.cpp b/kgramstats.cpp index 7d1d2da..7d28c86 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -42,6 +42,7 @@ | |||
42 | #include "freevars.h" | 42 | #include "freevars.h" |
43 | #include <fstream> | 43 | #include <fstream> |
44 | #include "prefix_search.h" | 44 | #include "prefix_search.h" |
45 | #include <aspell.h> | ||
45 | 46 | ||
46 | query wildcardQuery {querytype::sentence}; | 47 | query wildcardQuery {querytype::sentence}; |
47 | word blank_word {""}; | 48 | word blank_word {""}; |
@@ -57,6 +58,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
57 | int end = 0; | 58 | int end = 0; |
58 | std::set<std::string> thashtags; | 59 | std::set<std::string> thashtags; |
59 | freevar fv_emoticons {emoticons, "emoticons.txt"}; | 60 | freevar fv_emoticons {emoticons, "emoticons.txt"}; |
61 | std::map<std::string, std::string> canonical_form; | ||
62 | |||
63 | AspellConfig* spell_config = new_aspell_config(); | ||
64 | AspellCanHaveError* possible_err = new_aspell_speller(spell_config); | ||
65 | if (aspell_error_number(possible_err) != 0) | ||
66 | { | ||
67 | std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl; | ||
68 | exit(1); | ||
69 | } | ||
70 | |||
71 | AspellSpeller* spell_checker = to_aspell_speller(possible_err); | ||
60 | 72 | ||
61 | std::cout << "Reading emojis..." << std::endl; | 73 | std::cout << "Reading emojis..." << std::endl; |
62 | prefix_search emojis; | 74 | prefix_search emojis; |
@@ -145,12 +157,47 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
145 | } | 157 | } |
146 | 158 | ||
147 | // Basically any other word | 159 | // Basically any other word |
148 | if (words.count(canonical) == 0) | 160 | if (canonical_form.count(canonical) == 0) |
149 | { | 161 | { |
150 | words.emplace(canonical, canonical); | 162 | if (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos) |
163 | { | ||
164 | // Words with no letters will be mangled by the spell checker | ||
165 | canonical_form[canonical] = canonical; | ||
166 | words.emplace(canonical, canonical); | ||
167 | } else { | ||
168 | int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size()); | ||
169 | if (correct) | ||
170 | { | ||
171 | words.emplace(canonical, canonical); | ||
172 | canonical_form[canonical] = canonical; | ||
173 | } else { | ||
174 | const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size()); | ||
175 | AspellStringEnumeration* elements = aspell_word_list_elements(suggestions); | ||
176 | const char* replacement = aspell_string_enumeration_next(elements); | ||
177 | if (replacement != NULL) | ||
178 | { | ||
179 | aspell_speller_store_replacement(spell_checker, canonical.c_str(), canonical.size(), replacement, strlen(replacement)); | ||
180 | |||
181 | std::string sugrep(replacement); | ||
182 | canonical_form[canonical] = sugrep; | ||
183 | |||
184 | if (words.count(sugrep) == 0) | ||
185 | { | ||
186 | words.emplace(sugrep, sugrep); | ||
187 | } | ||
188 | } else { | ||
189 | aspell_speller_add_to_session(spell_checker, canonical.c_str(), canonical.size()); | ||
190 | |||
191 | words.emplace(canonical, canonical); | ||
192 | canonical_form[canonical] = canonical; | ||
193 | } | ||
194 | |||
195 | delete_aspell_string_enumeration(elements); | ||
196 | } | ||
197 | } | ||
151 | } | 198 | } |
152 | 199 | ||
153 | word& tw = words.at(canonical); | 200 | word& tw = words.at(canonical_form.at(canonical)); |
154 | tw.forms.add(canonical); | 201 | tw.forms.add(canonical); |
155 | 202 | ||
156 | return tw; | 203 | return tw; |
@@ -237,6 +284,12 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
237 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 284 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
238 | } | 285 | } |
239 | 286 | ||
287 | delete_aspell_speller(spell_checker); | ||
288 | delete_aspell_config(spell_config); | ||
289 | |||
290 | std::cout << canonical_form.size() << " distinct forms" << std::endl; | ||
291 | std::cout << words.size() << " distinct words" << std::endl; | ||
292 | |||
240 | // Time to condense the distribution stuff for the words | 293 | // Time to condense the distribution stuff for the words |
241 | std::cout << "Compiling token histograms..." << std::endl; | 294 | std::cout << "Compiling token histograms..." << std::endl; |
242 | for (auto& it : words) | 295 | for (auto& it : words) |