about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-03 11:51:15 -0500
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-03 11:51:15 -0500
commit fd3c94ac13bbd4876766bfc83559d3404c31b963 (patch)
tree 25ad9b074a04963097d0181a1036727e21040748
parent 4c7c1ca17371d8ff1d709e1b263d2034afa624a1 (diff)
download rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.tar.gz
rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.tar.bz2
rawr-ebooks-fd3c94ac13bbd4876766bfc83559d3404c31b963.zip
Token generator now uses aspell to link different spellings of a word
This is the grand scheme for the multi-formed word design.
-rw-r--r-- CMakeLists.txt 3
-rw-r--r-- kgramstats.cpp 59
2 files changed, 58 insertions, 4 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74b6d51..c0a0d69 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ if (YamlCpp_FOUND AND Curl_FOUND)
11 add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) 11 add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp)
12 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) 12 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11)
13 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) 13 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON)
14 target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${Curl_LIBRARIES}) 14 target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${Curl_LIBRARIES} aspell)
15else (YamlCpp_FOUND AND Curl_FOUND) 15else (YamlCpp_FOUND AND Curl_FOUND)
16 message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") 16 message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen")
17endif (YamlCpp_FOUND AND Curl_FOUND) 17endif (YamlCpp_FOUND AND Curl_FOUND)
@@ -19,3 +19,4 @@ endif (YamlCpp_FOUND AND Curl_FOUND)
19add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp) 19add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp)
20set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) 20set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11)
21set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) 21set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON)
22target_link_libraries(rawr-gen aspell)
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 7d1d2da..7d28c86 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -42,6 +42,7 @@
42#include "freevars.h" 42#include "freevars.h"
43#include <fstream> 43#include <fstream>
44#include "prefix_search.h" 44#include "prefix_search.h"
45#include <aspell.h>
45 46
46query wildcardQuery {querytype::sentence}; 47query wildcardQuery {querytype::sentence};
47word blank_word {""}; 48word blank_word {""};
@@ -57,6 +58,17 @@ kgramstats::kgramstats(std::string corpus, int maxK)
57 int end = 0; 58 int end = 0;
58 std::set<std::string> thashtags; 59 std::set<std::string> thashtags;
59 freevar fv_emoticons {emoticons, "emoticons.txt"}; 60 freevar fv_emoticons {emoticons, "emoticons.txt"};
61 std::map<std::string, std::string> canonical_form;
62
63 AspellConfig* spell_config = new_aspell_config();
64 AspellCanHaveError* possible_err = new_aspell_speller(spell_config);
65 if (aspell_error_number(possible_err) != 0)
66 {
67 std::cout << "aspell error: " << aspell_error_message(possible_err) << std::endl;
68 exit(1);
69 }
70
71 AspellSpeller* spell_checker = to_aspell_speller(possible_err);
60 72
61 std::cout << "Reading emojis..." << std::endl; 73 std::cout << "Reading emojis..." << std::endl;
62 prefix_search emojis; 74 prefix_search emojis;
@@ -145,12 +157,47 @@ kgramstats::kgramstats(std::string corpus, int maxK)
145 } 157 }
146 158
147 // Basically any other word 159 // Basically any other word
148 if (words.count(canonical) == 0) 160 if (canonical_form.count(canonical) == 0)
149 { 161 {
150 words.emplace(canonical, canonical); 162 if (canonical.find_first_of("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz") == std::string::npos)
163 {
164 // Words with no letters will be mangled by the spell checker
165 canonical_form[canonical] = canonical;
166 words.emplace(canonical, canonical);
167 } else {
168 int correct = aspell_speller_check(spell_checker, canonical.c_str(), canonical.size());
169 if (correct)
170 {
171 words.emplace(canonical, canonical);
172 canonical_form[canonical] = canonical;
173 } else {
174 const AspellWordList* suggestions = aspell_speller_suggest(spell_checker, canonical.c_str(), canonical.size());
175 AspellStringEnumeration* elements = aspell_word_list_elements(suggestions);
176 const char* replacement = aspell_string_enumeration_next(elements);
177 if (replacement != NULL)
178 {
179 aspell_speller_store_replacement(spell_checker, canonical.c_str(), canonical.size(), replacement, strlen(replacement));
180
181 std::string sugrep(replacement);
182 canonical_form[canonical] = sugrep;
183
184 if (words.count(sugrep) == 0)
185 {
186 words.emplace(sugrep, sugrep);
187 }
188 } else {
189 aspell_speller_add_to_session(spell_checker, canonical.c_str(), canonical.size());
190
191 words.emplace(canonical, canonical);
192 canonical_form[canonical] = canonical;
193 }
194
195 delete_aspell_string_enumeration(elements);
196 }
197 }
151 } 198 }
152 199
153 word& tw = words.at(canonical); 200 word& tw = words.at(canonical_form.at(canonical));
154 tw.forms.add(canonical); 201 tw.forms.add(canonical);
155 202
156 return tw; 203 return tw;
@@ -237,6 +284,12 @@ kgramstats::kgramstats(std::string corpus, int maxK)
237 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); 284 start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
238 } 285 }
239 286
287 delete_aspell_speller(spell_checker);
288 delete_aspell_config(spell_config);
289
290 std::cout << canonical_form.size() << " distinct forms" << std::endl;
291 std::cout << words.size() << " distinct words" << std::endl;
292
240 // Time to condense the distribution stuff for the words 293 // Time to condense the distribution stuff for the words
241 std::cout << "Compiling token histograms..." << std::endl; 294 std::cout << "Compiling token histograms..." << std::endl;
242 for (auto& it : words) 295 for (auto& it : words)