From 0a5c6bd740aff9be53e7ef117e9e926fde3c289e Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Wed, 30 Dec 2015 22:01:37 -0500 Subject: guess what! the algorithm this time it's a literal algorithm again not canonizing away punctuation newlines are actually considered new sentences now we look for the end of a sentence and then start after that --- ebooks.cpp | 26 +++++++++++------- gen.cpp | 16 ++++++++--- kgramstats.cpp | 87 +++++++++++++++++++++++++++++++++++++--------------------- 3 files changed, 84 insertions(+), 45 deletions(-) diff --git a/ebooks.cpp b/ebooks.cpp index 6bbe25e..27591f4 100644 --- a/ebooks.cpp +++ b/ebooks.cpp @@ -24,11 +24,11 @@ int main(int argc, char** args) std::string line; while (getline(infile, line)) { - corpus += " " + line; + corpus += line + "\n "; } std::cout << "Preprocessing corpus..." << std::endl; - kgramstats* stats = new kgramstats(corpus, 3); + kgramstats* stats = new kgramstats(corpus, 4); std::cout << "Preprocessing freevars..." << std::endl; freevars* vars = new freevars(); @@ -38,20 +38,26 @@ int main(int argc, char** args) std::cout << "Generating..." << std::endl; for (;;) { - std::vector doc = stats->randomSentence(rand() % 25 + 5); + std::vector doc = stats->randomSentence(rand() % 45 + 5); std::string hi; for (std::vector::iterator it = doc.begin(); it != doc.end(); ++it) { hi += vars->parse(*it) + " "; } - size_t lastperiod = hi.find_last_of("."); - if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) - { - hi = hi.substr(0, lastperiod+1); - } - - hi = hi.substr(0,140); + size_t firstperiod = hi.find_first_of(".!?"); + if (firstperiod != std::string::npos) + { + hi = hi.substr(firstperiod+2); + } + + hi.resize(140); + + size_t lastperiod = hi.find_last_of(".!?"); + if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) + { + hi = hi.substr(0, lastperiod+1); + } twitCurl twitterObj; std::string tmpStr, tmpStr2; diff --git a/gen.cpp b/gen.cpp index 3284ffa..7e47d45 100644 --- a/gen.cpp +++ b/gen.cpp @@ -38,11 +38,11 @@ int main(int argc, char** args) std::string line; while (getline(infile, line)) { - corpus += " " + line; + corpus += line + "\n "; } std::cout << "Preprocessing corpus..." << std::endl; - kgramstats* stats = new kgramstats(corpus, 3); + kgramstats* stats = new kgramstats(corpus, 4); std::cout << "Preprocessing freevars..." << std::endl; freevars* vars = new freevars(); @@ -52,14 +52,22 @@ int main(int argc, char** args) std::cout << "Generating..." << std::endl; for (;;) { - std::vector doc = stats->randomSentence(rand() % 35 + 15); + std::vector doc = stats->randomSentence(rand() % 35 + 45); std::string hi; for (std::vector::iterator it = doc.begin(); it != doc.end(); ++it) { hi += vars->parse(*it) + " "; } + + size_t firstperiod = hi.find_first_of(".!?"); + if (firstperiod != std::string::npos) + { + hi = hi.substr(firstperiod+2); + } + + hi.resize(140); - size_t lastperiod = hi.find_last_of("."); + size_t lastperiod = hi.find_last_of(".!?"); if ((lastperiod != std::string::npos) && (rand() % 3 > 0)) { hi = hi.substr(0, lastperiod+1); diff --git a/kgramstats.cpp b/kgramstats.cpp index 41517ca..b0ec68a 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp @@ -22,7 +22,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) end = corpus.find(" ", start); std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); - if (token.compare("")) + if (token[token.length()-1] == '\n') + { + if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?')) + { + token.insert(token.length()-1, "."); + } + + token.resize(token.length()-1); + } + + if (token.compare("") && token.compare(".")) { mstats.addWord(token); tokens.push_back(token); @@ -34,14 +44,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) std::map* > tstats; bool newSentence = true; bool newClause = false; - for (int k=0; k<=maxK; k++) + for (int k=0; ktoken = new std::string(canonical); td->all++; - if (newSentence) + /*if (newSentence) { kgram newKgram(1, "."); if (tstats[newKgram] == NULL) @@ -70,7 +83,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) newSentence = false; } - if (newClause || newSentence) + if (newClause) { kgram commaKgram(1, ","); if (tstats[commaKgram] == NULL) @@ -156,7 +169,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) } } } - } + }*/ if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) { @@ -165,6 +178,17 @@ kgramstats::kgramstats(std::string corpus, int maxK) { td->titlecase++; } + + /*if (k != 0) + { + if (newSentence) + { + i += k; + } + + newSentence = false; + newClause = false; + }*/ } } @@ -201,16 +225,21 @@ std::vector kgramstats::randomSentence(int n) std::vector result; kgram newKgram(1, "."); kgram commaKgram(1, ","); - std::list cur = newKgram; + std::list cur; int cuts = 0; for (int i=0; i 0) && (cur != newKgram)) + if (cur.size() == maxK) + { + cur.pop_front(); + } + + if ((cur.size() > 0) && (cur != newKgram)) { if (rand() % (maxK - cur.size() + 1) == 0) { - while (cur.size() > 1) + while (cur.size() > 2) { if ((rand() % (n)) < cuts) { @@ -223,7 +252,7 @@ std::vector kgramstats::randomSentence(int n) } cuts++; - }*/ + } std::map distribution = *(*stats)[cur]; int max = distribution.rbegin()->first; @@ -232,12 +261,19 @@ std::vector kgramstats::randomSentence(int n) std::string nextToken(*(next->token)); int casing = rand() % next->all; - int period = rand() % next->all; + /*int period = rand() % next->all; int startparen = rand() % next->all; int endparen = rand() % next->all; int startquote = rand() % next->all; int endquote = rand() % next->all; - int comma = rand() % next->all; + int comma = rand() % next->all;*/ + + bool mess = (rand() % 100) == 0; + if (mess) + { + nextToken = mstats.alternate(nextToken); + } + if (casing < next->uppercase) { std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); @@ -247,14 +283,8 @@ std::vector kgramstats::randomSentence(int n) { nextToken[0] = toupper(nextToken[0]); } - - bool mess = (rand() % 100) == 0; - if (mess) - { - nextToken = mstats.alternate(nextToken); - } - if (startquote < next->startquote) + /*if (startquote < next->startquote) { nextToken = "\"" + nextToken; } else if (startparen < next->startparen) @@ -294,12 +324,7 @@ std::vector kgramstats::randomSentence(int n) } nextToken += ","; - } - - if (cur.size() == maxK) - { - cur.pop_front(); - } + }*/ /* DEBUG */ for (kgram::iterator it = cur.begin(); it != cur.end(); it++) @@ -316,18 +341,18 @@ std::vector kgramstats::randomSentence(int n) std::cout << std::endl; - if ((cur == newKgram) || (cur == commaKgram)) + /*if ((cur == newKgram) || (cur == commaKgram)) { cur.pop_front(); } - if ((period < next->period) && ((rand() % 3) == 0)) + if (period < next->period)// && ((rand() % 3) != 0)) { cur = newKgram; } else if ((comma < next->comma) && ((rand() % 3) == 0)) { cur = commaKgram; - } else { + } else {*/ //if (mess && (rand() % 2 == 0)) if (false) { @@ -337,7 +362,7 @@ std::vector kgramstats::randomSentence(int n) } else { cur.push_back(*(next->token)); } - } + //} result.push_back(nextToken); } @@ -347,7 +372,7 @@ std::vector kgramstats::randomSentence(int n) bool removeIf(char c) { - return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',')); + return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n')); } std::string canonize(std::string f) @@ -358,5 +383,5 @@ std::string canonize(std::string f) std::string result; std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); - return result; + return canonical; } -- cgit 1.4.1