diff options
| -rw-r--r-- | emoticons.txt | 11 | ||||
| -rw-r--r-- | kgramstats.cpp | 27 |
2 files changed, 25 insertions, 13 deletions
| diff --git a/emoticons.txt b/emoticons.txt index 21b8990..15382b4 100644 --- a/emoticons.txt +++ b/emoticons.txt | |||
| @@ -1,6 +1,13 @@ | |||
| 1 | :) | 1 | :) |
| 2 | :P | ||
| 2 | :p | 3 | :p |
| 4 | O_O | ||
| 3 | o_o | 5 | o_o |
| 4 | :d | 6 | :D |
| 5 | ;) | 7 | ;) |
| 6 | :o \ No newline at end of file | 8 | :o |
| 9 | :O | ||
| 10 | X_X | ||
| 11 | x_x | ||
| 12 | ^_^ | ||
| 13 | >_< \ No newline at end of file | ||
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 891f4f8..7d1d2da 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -105,11 +105,16 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 105 | } | 105 | } |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | std::string tc(t), canonical; | 108 | std::string tc(t); |
| 109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
| 110 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | 110 | |
| 111 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '(') && (c != ')') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | 111 | int pst = tc.find_first_not_of("\"([*"); |
| 112 | }); | 112 | int dst = tc.find_last_not_of("\")]*.,?!\n"); |
| 113 | std::string canonical(""); | ||
| 114 | if ((pst != std::string::npos) && (dst != std::string::npos)) | ||
| 115 | { | ||
| 116 | canonical = std::string(tc, pst, dst - pst + 1); | ||
| 117 | } | ||
| 113 | 118 | ||
| 114 | word& w = ([&] () -> word& { | 119 | word& w = ([&] () -> word& { |
| 115 | // Hashtag freevar | 120 | // Hashtag freevar |
| @@ -128,15 +133,15 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 128 | return emoticons; | 133 | return emoticons; |
| 129 | } | 134 | } |
| 130 | 135 | ||
| 131 | std::string emoticon_canon; | 136 | if ((pst != std::string::npos) && (dst != std::string::npos)) |
| 132 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) { | ||
| 133 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
| 134 | }); | ||
| 135 | if (fv_emoticons.check(emoticon_canon)) | ||
| 136 | { | 137 | { |
| 137 | emoticons.forms.add(emoticon_canon); | 138 | std::string emoticon_canon(t, pst, t.find_last_not_of("\"]*\n.,?!") - pst + 1); |
| 139 | if (fv_emoticons.check(emoticon_canon)) | ||
| 140 | { | ||
| 141 | emoticons.forms.add(emoticon_canon); | ||
| 138 | 142 | ||
| 139 | return emoticons; | 143 | return emoticons; |
| 144 | } | ||
| 140 | } | 145 | } |
| 141 | 146 | ||
| 142 | // Basically any other word | 147 | // Basically any other word |
