diff options
| author | Feffernoose <fefferburbia@gmail.com> | 2013-10-06 19:52:29 -0400 |
|---|---|---|
| committer | Feffernoose <fefferburbia@gmail.com> | 2013-10-06 19:52:29 -0400 |
| commit | 60561c9b95de1043979fbe59c93342175f9febd8 (patch) | |
| tree | 2af811602d041ef0ac481c3a6637fad37ba21d4d | |
| parent | 8d28a8e13dbe602783a505adb1df375b0d65efe0 (diff) | |
| parent | 3d995835426312809aa7a5a4403ef3984339be0f (diff) | |
| download | rawr-ebooks-60561c9b95de1043979fbe59c93342175f9febd8.tar.gz rawr-ebooks-60561c9b95de1043979fbe59c93342175f9febd8.tar.bz2 rawr-ebooks-60561c9b95de1043979fbe59c93342175f9febd8.zip | |
Merge branch 'master' of http://github.com/hatkirby/rawr-ebooks
| -rw-r--r-- | kgramstats.cpp | 10 |
1 file changed, 8 insertions, 2 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 6c0e4ce..327752b 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -4,6 +4,8 @@ | |||
| 4 | #include <cstdlib> | 4 | #include <cstdlib> |
| 5 | #include <algorithm> | 5 | #include <algorithm> |
| 6 | 6 | ||
| 7 | // runs in O(t^2) time where t is the number of tokens in the input corpus | ||
| 8 | // We consider maxK to be fairly constant | ||
| 7 | kgramstats::kgramstats(string corpus, int maxK) | 9 | kgramstats::kgramstats(string corpus, int maxK) |
| 8 | { | 10 | { |
| 9 | this->maxK = maxK; | 11 | this->maxK = maxK; |
| @@ -16,7 +18,11 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 16 | { | 18 | { |
| 17 | end = corpus.find(" ", start); | 19 | end = corpus.find(" ", start); |
| 18 | 20 | ||
| 19 | tokens.push_back(corpus.substr(start, (end == string::npos) ? string::npos : end - start)); | 21 | string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start); |
| 22 | if (token.compare("")) | ||
| 23 | { | ||
| 24 | tokens.push_back(token); | ||
| 25 | } | ||
| 20 | 26 | ||
| 21 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); | 27 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); |
| 22 | } | 28 | } |
| @@ -85,9 +91,9 @@ void printKgram(kgram k) | |||
| 85 | { | 91 | { |
| 86 | cout << *it << " "; | 92 | cout << *it << " "; |
| 87 | } | 93 | } |
| 88 | cout << endl; | ||
| 89 | } | 94 | } |
| 90 | 95 | ||
| 96 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | ||
| 91 | vector<string> kgramstats::randomSentence(int n) | 97 | vector<string> kgramstats::randomSentence(int n) |
| 92 | { | 98 | { |
| 93 | vector<string> result; | 99 | vector<string> result; |
