about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-03 11:09:52 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-02-03 11:09:52 -0500
commit: 4c7c1ca17371d8ff1d709e1b263d2034afa624a1 (patch)
tree: 1824cafbba028d33d27d72cc8cf62daff6ec6dc5
parent: 76472b71ecb287b7a3bb5759770d71cdd1623a20 (diff)
download: rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.tar.gz
rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.tar.bz2
rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.zip
Terminator characters in the middle of tokens are no longer stripped
Emoticon checking is also now case sensitive, and a few more emoticons were added to the list.
-rw-r--r-- emoticons.txt 11
-rw-r--r-- kgramstats.cpp 27
2 files changed, 25 insertions, 13 deletions
diff --git a/emoticons.txt b/emoticons.txt
index 21b8990..15382b4 100644
--- a/emoticons.txt
+++ b/emoticons.txt
@@ -1,6 +1,13 @@
1:) 1:)
2:P
2:p 3:p
4O_O
3o_o 5o_o
4:d 6:D
5;) 7;)
6:o \ No newline at end of file 8:o
9:O
10X_X
11x_x
12^_^
13>_< \ No newline at end of file
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 891f4f8..7d1d2da 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -105,11 +105,16 @@ kgramstats::kgramstats(std::string corpus, int maxK)
105 } 105 }
106 } 106 }
107 107
108 std::string tc(t), canonical; 108 std::string tc(t);
109 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); 109 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
110 std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { 110
111 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); 111 int pst = tc.find_first_not_of("\"([*");
112 }); 112 int dst = tc.find_last_not_of("\")]*.,?!\n");
113 std::string canonical("");
114 if ((pst != std::string::npos) && (dst != std::string::npos))
115 {
116 canonical = std::string(tc, pst, dst - pst + 1);
117 }
113 118
114 word& w = ([&] () -> word& { 119 word& w = ([&] () -> word& {
115 // Hashtag freevar 120 // Hashtag freevar
@@ -128,15 +133,15 @@ kgramstats::kgramstats(std::string corpus, int maxK)
128 return emoticons; 133 return emoticons;
129 } 134 }
130 135
131 std::string emoticon_canon; 136 if ((pst != std::string::npos) && (dst != std::string::npos))
132 std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) {
133 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
134 });
135 if (fv_emoticons.check(emoticon_canon))
136 { 137 {
137 emoticons.forms.add(emoticon_canon); 138 std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1);
139 if (fv_emoticons.check(emoticon_canon))
140 {
141 emoticons.forms.add(emoticon_canon);
138 142
139 return emoticons; 143 return emoticons;
144 }
140 } 145 }
141 146
142 // Basically any other word 147 // Basically any other word