From 8e2693ebb6670f8b8c66f5818be87140467ed5f3 Mon Sep 17 00:00:00 2001 From: Feffernoose Date: Sun, 6 Oct 2013 15:03:29 -0400 Subject: Stripped empty tokens from corpus --- kgramstats.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kgramstats.cpp') diff --git a/kgramstats.cpp b/kgramstats.cpp index 6c0e4ce..327752b 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -4,6 +4,8 @@ #include #include +// runs in O(t^2) time where t is the number of tokens in the input corpus +// We consider maxK to be fairly constant kgramstats::kgramstats(string corpus, int maxK) { this->maxK = maxK; @@ -16,7 +18,11 @@ kgramstats::kgramstats(string corpus, int maxK) { end = corpus.find(" ", start); - tokens.push_back(corpus.substr(start, (end == string::npos) ? string::npos : end - start)); + string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start); + if (token.compare("")) + { + tokens.push_back(token); + } start = ((end > (string::npos - 1) ) ? string::npos : end + 1); } @@ -85,9 +91,9 @@ void printKgram(kgram k) { cout << *it << " "; } - cout << endl; } +// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus vector kgramstats::randomSentence(int n) { vector result; -- cgit 1.4.1