diff options
| author | Feffernoose <fefferburbia@gmail.com> | 2013-10-01 18:15:22 -0400 |
|---|---|---|
| committer | Feffernoose <fefferburbia@gmail.com> | 2013-10-01 18:15:22 -0400 |
| commit | 8de3134bf2cd26ff81359df703e5fbc6280448d7 (patch) | |
| tree | 325e99abac6b7e3316334af7961645e9381e6517 /kgramstats.cpp | |
| parent | 2b1f8c3363ef667bc20f33bbb5a856a35f2591ba (diff) | |
| download | rawr-ebooks-8de3134bf2cd26ff81359df703e5fbc6280448d7.tar.gz rawr-ebooks-8de3134bf2cd26ff81359df703e5fbc6280448d7.tar.bz2 rawr-ebooks-8de3134bf2cd26ff81359df703e5fbc6280448d7.zip | |
Wrote program
Diffstat (limited to 'kgramstats.cpp')
| -rw-r--r-- | kgramstats.cpp | 110 |
1 files changed, 110 insertions, 0 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp new file mode 100644 index 0000000..142b5aa --- /dev/null +++ b/kgramstats.cpp | |||
| @@ -0,0 +1,110 @@ | |||
| 1 | #include "kgramstats.h" | ||
| 2 | #include <vector> | ||
| 3 | #include <iostream> | ||
| 4 | #include <cstdlib> | ||
| 5 | |||
| 6 | kgramstats::kgramstats(string corpus, int maxK) | ||
| 7 | { | ||
| 8 | this->maxK = maxK; | ||
| 9 | |||
| 10 | vector<string> tokens; | ||
| 11 | int start = 0; | ||
| 12 | int end = 0; | ||
| 13 | |||
| 14 | while (end != string::npos) | ||
| 15 | { | ||
| 16 | end = corpus.find(" ", start); | ||
| 17 | |||
| 18 | tokens.push_back(corpus.substr(start, (end == string::npos) ? string::npos : end - start)); | ||
| 19 | |||
| 20 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); | ||
| 21 | } | ||
| 22 | |||
| 23 | stats = new map<kgram, map<string, int>* >(); | ||
| 24 | for (int k=0; k<=maxK; k++) | ||
| 25 | { | ||
| 26 | for (int i=0; i<(tokens.size() - k); i++) | ||
| 27 | { | ||
| 28 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | ||
| 29 | string f = tokens[i+k]; | ||
| 30 | |||
| 31 | if ((*stats)[seq] == NULL) | ||
| 32 | { | ||
| 33 | (*stats)[seq] = new map<string, int>(); | ||
| 34 | } | ||
| 35 | |||
| 36 | (*((*stats)[seq]))[f]++; | ||
| 37 | } | ||
| 38 | } | ||
| 39 | } | ||
| 40 | |||
| 41 | map<string, int>* kgramstats::lookupExts(kgram tk) | ||
| 42 | { | ||
| 43 | return (*stats)[tk]; | ||
| 44 | } | ||
| 45 | |||
| 46 | int kgramstats::getMaxK() | ||
| 47 | { | ||
| 48 | return maxK; | ||
| 49 | } | ||
| 50 | |||
| 51 | void printKgram(kgram k) | ||
| 52 | { | ||
| 53 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | ||
| 54 | { | ||
| 55 | cout << *it << " "; | ||
| 56 | } | ||
| 57 | cout << endl; | ||
| 58 | } | ||
| 59 | |||
| 60 | vector<string> kgramstats::randomSentence(int n) | ||
| 61 | { | ||
| 62 | vector<string> result; | ||
| 63 | list<string> cur; | ||
| 64 | |||
| 65 | for (int i=0; i<n; i++) | ||
| 66 | { | ||
| 67 | if ((rand() % 4) != 0) | ||
| 68 | { | ||
| 69 | for (int i=0; i<cur.size(); i++) | ||
| 70 | { | ||
| 71 | if ((rand() % 3) != 0) | ||
| 72 | { | ||
| 73 | cur.pop_front(); | ||
| 74 | } else { | ||
| 75 | break; | ||
| 76 | } | ||
| 77 | } | ||
| 78 | } | ||
| 79 | |||
| 80 | map<string, int>* probtable = lookupExts(cur); | ||
| 81 | int max = 0; | ||
| 82 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | ||
| 83 | { | ||
| 84 | max += it->second; | ||
| 85 | } | ||
| 86 | |||
| 87 | int r = rand() % (max+1); | ||
| 88 | string next = probtable->begin()->first; | ||
| 89 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | ||
| 90 | { | ||
| 91 | if (it->second > r) | ||
| 92 | { | ||
| 93 | break; | ||
| 94 | } else { | ||
| 95 | next = it->first; | ||
| 96 | r -= it->second; | ||
| 97 | } | ||
| 98 | } | ||
| 99 | |||
| 100 | if (cur.size() == maxK) | ||
| 101 | { | ||
| 102 | cur.pop_front(); | ||
| 103 | } | ||
| 104 | |||
| 105 | cur.push_back(next); | ||
| 106 | result.push_back(next); | ||
| 107 | } | ||
| 108 | |||
| 109 | return result; | ||
| 110 | } \ No newline at end of file | ||
