diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-03 11:09:52 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-03 11:09:52 -0500 |
commit | 4c7c1ca17371d8ff1d709e1b263d2034afa624a1 (patch) | |
tree | 1824cafbba028d33d27d72cc8cf62daff6ec6dc5 | |
parent | 76472b71ecb287b7a3bb5759770d71cdd1623a20 (diff) | |
download | rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.tar.gz rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.tar.bz2 rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.zip |
Terminator characters in the middle of tokens are no longer stripped
Emoticon checking is also now case-sensitive, and a few more emoticons were added to the list.
-rw-r--r-- | emoticons.txt | 11 | ||||
-rw-r--r-- | kgramstats.cpp | 27 |
2 files changed, 25 insertions, 13 deletions
diff --git a/emoticons.txt b/emoticons.txt index 21b8990..15382b4 100644 --- a/emoticons.txt +++ b/emoticons.txt | |||
@@ -1,6 +1,13 @@ | |||
1 | :) | 1 | :) |
2 | :P | ||
2 | :p | 3 | :p |
4 | O_O | ||
3 | o_o | 5 | o_o |
4 | :d | 6 | :D |
5 | ;) | 7 | ;) |
6 | :o \ No newline at end of file | 8 | :o |
9 | :O | ||
10 | X_X | ||
11 | x_x | ||
12 | ^_^ | ||
13 | >_< \ No newline at end of file | ||
diff --git a/kgramstats.cpp b/kgramstats.cpp index 891f4f8..7d1d2da 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -105,11 +105,16 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
105 | } | 105 | } |
106 | } | 106 | } |
107 | 107 | ||
108 | std::string tc(t), canonical; | 108 | std::string tc(t); |
109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
110 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | 110 | |
111 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | 111 | int pst = tc.find_first_not_of("\"([*"); |
112 | }); | 112 | int dst = tc.find_last_not_of("\")]*.,?!\n"); |
113 | std::string canonical(""); | ||
114 | if ((pst != std::string::npos) && (dst != std::string::npos)) | ||
115 | { | ||
116 | canonical = std::string(tc, pst, dst - pst + 1); | ||
117 | } | ||
113 | 118 | ||
114 | word& w = ([&] () -> word& { | 119 | word& w = ([&] () -> word& { |
115 | // Hashtag freevar | 120 | // Hashtag freevar |
@@ -128,15 +133,15 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
128 | return emoticons; | 133 | return emoticons; |
129 | } | 134 | } |
130 | 135 | ||
131 | std::string emoticon_canon; | 136 | if ((pst != std::string::npos) && (dst != std::string::npos)) |
132 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) { | ||
133 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
134 | }); | ||
135 | if (fv_emoticons.check(emoticon_canon)) | ||
136 | { | 137 | { |
137 | emoticons.forms.add(emoticon_canon); | 138 | std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1); |
139 | if (fv_emoticons.check(emoticon_canon)) | ||
140 | { | ||
141 | emoticons.forms.add(emoticon_canon); | ||
138 | 142 | ||
139 | return emoticons; | 143 | return emoticons; |
144 | } | ||
140 | } | 145 | } |
141 | 146 | ||
142 | // Basically any other word | 147 | // Basically any other word |