diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-03 11:09:52 -0500 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-03 11:09:52 -0500 |
| commit | 4c7c1ca17371d8ff1d709e1b263d2034afa624a1 (patch) | |
| tree | 1824cafbba028d33d27d72cc8cf62daff6ec6dc5 /kgramstats.cpp | |
| parent | 76472b71ecb287b7a3bb5759770d71cdd1623a20 (diff) | |
| download | rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.tar.gz rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.tar.bz2 rawr-ebooks-4c7c1ca17371d8ff1d709e1b263d2034afa624a1.zip | |
Terminator characters in the middle of tokens are no longer stripped
Emoticon checking is also now case-sensitive, and a few more emoticons were added to the list.
Diffstat (limited to 'kgramstats.cpp')
| -rw-r--r-- | kgramstats.cpp | 27 |
1 file changed, 16 insertions, 11 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 891f4f8..7d1d2da 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -105,11 +105,16 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 105 | } | 105 | } |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | std::string tc(t), canonical; | 108 | std::string tc(t); |
| 109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
| 110 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | 110 | |
| 111 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | 111 | int pst = tc.find_first_not_of("\"([*"); |
| 112 | }); | 112 | int dst = tc.find_last_not_of("\")]*.,?!\n"); |
| 113 | std::string canonical(""); | ||
| 114 | if ((pst != std::string::npos) && (dst != std::string::npos)) | ||
| 115 | { | ||
| 116 | canonical = std::string(tc, pst, dst - pst + 1); | ||
| 117 | } | ||
| 113 | 118 | ||
| 114 | word& w = ([&] () -> word& { | 119 | word& w = ([&] () -> word& { |
| 115 | // Hashtag freevar | 120 | // Hashtag freevar |
| @@ -128,15 +133,15 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 128 | return emoticons; | 133 | return emoticons; |
| 129 | } | 134 | } |
| 130 | 135 | ||
| 131 | std::string emoticon_canon; | 136 | if ((pst != std::string::npos) && (dst != std::string::npos)) |
| 132 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) { | ||
| 133 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
| 134 | }); | ||
| 135 | if (fv_emoticons.check(emoticon_canon)) | ||
| 136 | { | 137 | { |
| 137 | emoticons.forms.add(emoticon_canon); | 138 | std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1); |
| 139 | if (fv_emoticons.check(emoticon_canon)) | ||
| 140 | { | ||
| 141 | emoticons.forms.add(emoticon_canon); | ||
| 138 | 142 | ||
| 139 | return emoticons; | 143 | return emoticons; |
| 144 | } | ||
| 140 | } | 145 | } |
| 141 | 146 | ||
| 142 | // Basically any other word | 147 | // Basically any other word |
