about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author Feffernoose <fefferburbia@gmail.com> 2013-10-06 19:52:29 -0400
committer Feffernoose <fefferburbia@gmail.com> 2013-10-06 19:52:29 -0400
commit 60561c9b95de1043979fbe59c93342175f9febd8 (patch)
tree 2af811602d041ef0ac481c3a6637fad37ba21d4d /kgramstats.cpp
parent 8d28a8e13dbe602783a505adb1df375b0d65efe0 (diff)
parent 3d995835426312809aa7a5a4403ef3984339be0f (diff)
download rawr-ebooks-60561c9b95de1043979fbe59c93342175f9febd8.tar.gz
rawr-ebooks-60561c9b95de1043979fbe59c93342175f9febd8.tar.bz2
rawr-ebooks-60561c9b95de1043979fbe59c93342175f9febd8.zip
Merge branch 'master' of http://github.com/hatkirby/rawr-ebooks
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- kgramstats.cpp 10
1 files changed, 8 insertions, 2 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 6c0e4ce..327752b 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -4,6 +4,8 @@
4#include <cstdlib> 4#include <cstdlib>
5#include <algorithm> 5#include <algorithm>
6 6
7// runs in O(t^2) time where t is the number of tokens in the input corpus
8// We consider maxK to be fairly constant
7kgramstats::kgramstats(string corpus, int maxK) 9kgramstats::kgramstats(string corpus, int maxK)
8{ 10{
9 this->maxK = maxK; 11 this->maxK = maxK;
@@ -16,7 +18,11 @@ kgramstats::kgramstats(string corpus, int maxK)
16 { 18 {
17 end = corpus.find(" ", start); 19 end = corpus.find(" ", start);
18 20
19 tokens.push_back(corpus.substr(start, (end == string::npos) ? string::npos : end - start)); 21 string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start);
22 if (token.compare(""))
23 {
24 tokens.push_back(token);
25 }
20 26
21 start = ((end > (string::npos - 1) ) ? string::npos : end + 1); 27 start = ((end > (string::npos - 1) ) ? string::npos : end + 1);
22 } 28 }
@@ -85,9 +91,9 @@ void printKgram(kgram k)
85 { 91 {
86 cout << *it << " "; 92 cout << *it << " ";
87 } 93 }
88 cout << endl;
89} 94}
90 95
96// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
91vector<string> kgramstats::randomSentence(int n) 97vector<string> kgramstats::randomSentence(int n)
92{ 98{
93 vector<string> result; 99 vector<string> result;