From 0a5c6bd740aff9be53e7ef117e9e926fde3c289e Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Wed, 30 Dec 2015 22:01:37 -0500 Subject: guess what! the algorithm this time it's a literal algorithm again not canonizing away punctuation newlines are actually considered new sentences now we look for the end of a sentence and then start after that --- kgramstats.cpp | 87 +++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 31 deletions(-) (limited to 'kgramstats.cpp') diff --git a/kgramstats.cpp b/kgramstats.cpp index 41517ca..b0ec68a 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -22,7 +22,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) end = corpus.find(" ", start); std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); - if (token.compare("")) + if (token[token.length()-1] == '\n') + { + if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?')) + { + token.insert(token.length()-1, "."); + } + + token.resize(token.length()-1); + } + + if (token.compare("") && token.compare(".")) { mstats.addWord(token); tokens.push_back(token); @@ -34,14 +44,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) std::map* > tstats; bool newSentence = true; bool newClause = false; - for (int k=0; k<=maxK; k++) + for (int k=0; ktoken = new std::string(canonical); td->all++; - if (newSentence) + /*if (newSentence) { kgram newKgram(1, "."); if (tstats[newKgram] == NULL) @@ -70,7 +83,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) newSentence = false; } - if (newClause || newSentence) + if (newClause) { kgram commaKgram(1, ","); if (tstats[commaKgram] == NULL) @@ -156,7 +169,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) } } } - } + }*/ if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) { @@ -165,6 +178,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) { td->titlecase++; } + + /*if (k != 0) + { + if (newSentence) + { + i += k; + } + + newSentence = false; + newClause = false; + }*/ } } @@ -201,16 +225,21 @@ std::vector kgramstats::randomSentence(int n) std::vector result; kgram newKgram(1, "."); kgram commaKgram(1, ","); - std::list cur = newKgram; + std::list cur; int cuts = 0; for (int i=0; i 0) && (cur != newKgram)) + if (cur.size() == maxK) + { + cur.pop_front(); + } + + if ((cur.size() > 0) && (cur != newKgram)) { if (rand() % (maxK - cur.size() + 1) == 0) { - while (cur.size() > 1) + while (cur.size() > 2) { if ((rand() % (n)) < cuts) { @@ -223,7 +252,7 @@ std::vector kgramstats::randomSentence(int n) } cuts++; - }*/ + } std::map distribution = *(*stats)[cur]; int max = distribution.rbegin()->first; @@ -232,12 +261,19 @@ std::vector kgramstats::randomSentence(int n) std::string nextToken(*(next->token)); int casing = rand() % next->all; - int period = rand() % next->all; + /*int period = rand() % next->all; int startparen = rand() % next->all; int endparen = rand() % next->all; int startquote = rand() % next->all; int endquote = rand() % next->all; - int comma = rand() % next->all; + int comma = rand() % next->all;*/ + + bool mess = (rand() % 100) == 0; + if (mess) + { + nextToken = mstats.alternate(nextToken); + } + if (casing < next->uppercase) { std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); @@ -247,14 +283,8 @@ std::vector kgramstats::randomSentence(int n) { nextToken[0] = toupper(nextToken[0]); } - - bool mess = (rand() % 100) == 0; - if (mess) - { - nextToken = mstats.alternate(nextToken); - } - if (startquote < next->startquote) + /*if (startquote < next->startquote) { nextToken = "\"" + nextToken; } else if (startparen < next->startparen) @@ -294,12 +324,7 @@ std::vector kgramstats::randomSentence(int n) } nextToken += ","; - } - - if (cur.size() == maxK) - { - cur.pop_front(); - } + }*/ /* DEBUG */ for (kgram::iterator it = cur.begin(); it != cur.end(); it++) @@ -316,18 +341,18 @@ std::vector kgramstats::randomSentence(int n) std::cout << std::endl; - if ((cur == newKgram) || (cur == commaKgram)) + /*if ((cur == newKgram) || (cur == commaKgram)) { cur.pop_front(); } - if ((period < next->period) && ((rand() % 3) == 0)) + if (period < next->period)// && ((rand() % 3) != 0)) { cur = newKgram; } else if ((comma < next->comma) && ((rand() % 3) == 0)) { cur = commaKgram; - } else { + } else {*/ //if (mess && (rand() % 2 == 0)) if (false) { @@ -337,7 +362,7 @@ std::vector kgramstats::randomSentence(int n) } else { cur.push_back(*(next->token)); } - } + //} result.push_back(nextToken); } @@ -347,7 +372,7 @@ std::vector kgramstats::randomSentence(int n) bool removeIf(char c) { - return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',')); + return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n')); } std::string canonize(std::string f) @@ -358,5 +383,5 @@ std::string canonize(std::string f) std::string result; std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); - return result; + return canonical; } -- cgit 1.4.1