diff options
| -rw-r--r-- | kgramstats.cpp | 92 | ||||
| -rw-r--r-- | kgramstats.h | 12 |
2 files changed, 76 insertions, 28 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 142b5aa..708013f 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | #include <vector> | 2 | #include <vector> |
| 3 | #include <iostream> | 3 | #include <iostream> |
| 4 | #include <cstdlib> | 4 | #include <cstdlib> |
| 5 | #include <algorithm> | ||
| 5 | 6 | ||
| 6 | kgramstats::kgramstats(string corpus, int maxK) | 7 | kgramstats::kgramstats(string corpus, int maxK) |
| 7 | { | 8 | { |
| @@ -20,34 +21,45 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
| 20 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); | 21 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); |
| 21 | } | 22 | } |
| 22 | 23 | ||
| 23 | stats = new map<kgram, map<string, int>* >(); | 24 | stats = new map<kgram, map<string, token_data*>* >(); |
| 24 | for (int k=0; k<=maxK; k++) | 25 | for (int k=0; k<=maxK; k++) |
| 25 | { | 26 | { |
| 26 | for (int i=0; i<(tokens.size() - k); i++) | 27 | for (int i=0; i<(tokens.size() - k); i++) |
| 27 | { | 28 | { |
| 28 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 29 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); |
| 30 | transform(seq.begin(), seq.end(), seq.begin(), canonize); | ||
| 29 | string f = tokens[i+k]; | 31 | string f = tokens[i+k]; |
| 32 | string canonical = canonize(f); | ||
| 30 | 33 | ||
| 31 | if ((*stats)[seq] == NULL) | 34 | if ((*stats)[seq] == NULL) |
| 32 | { | 35 | { |
| 33 | (*stats)[seq] = new map<string, int>(); | 36 | (*stats)[seq] = new map<string, token_data*>(); |
| 34 | } | 37 | } |
| 35 | 38 | ||
| 36 | (*((*stats)[seq]))[f]++; | 39 | if ((*(*stats)[seq])[canonical] == NULL) |
| 40 | { | ||
| 41 | (*(*stats)[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data)); | ||
| 42 | } | ||
| 43 | |||
| 44 | token_data* td = stats->at(seq)->at(canonical); | ||
| 45 | td->all++; | ||
| 46 | |||
| 47 | if ((f.length() > 0) && (f[f.length()-1] == '.')) | ||
| 48 | { | ||
| 49 | td->period++; | ||
| 50 | } | ||
| 51 | |||
| 52 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
| 53 | { | ||
| 54 | td->uppercase++; | ||
| 55 | } else if (isupper(f[0])) | ||
| 56 | { | ||
| 57 | td->titlecase++; | ||
| 58 | } | ||
| 37 | } | 59 | } |
| 38 | } | 60 | } |
| 39 | } | 61 | } |
| 40 | 62 | ||
| 41 | map<string, int>* kgramstats::lookupExts(kgram tk) | ||
| 42 | { | ||
| 43 | return (*stats)[tk]; | ||
| 44 | } | ||
| 45 | |||
| 46 | int kgramstats::getMaxK() | ||
| 47 | { | ||
| 48 | return maxK; | ||
| 49 | } | ||
| 50 | |||
| 51 | void printKgram(kgram k) | 63 | void printKgram(kgram k) |
| 52 | { | 64 | { |
| 53 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 65 | for (kgram::iterator it = k.begin(); it != k.end(); it++) |
| @@ -76,35 +88,65 @@ vector<string> kgramstats::randomSentence(int n) | |||
| 76 | } | 88 | } |
| 77 | } | 89 | } |
| 78 | } | 90 | } |
| 79 | 91 | ||
| 80 | map<string, int>* probtable = lookupExts(cur); | 92 | map<string, token_data*>* probtable = (*stats)[cur]; |
| 81 | int max = 0; | 93 | int max = 0; |
| 82 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | 94 | for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it) |
| 83 | { | 95 | { |
| 84 | max += it->second; | 96 | max += it->second->all; |
| 85 | } | 97 | } |
| 86 | 98 | ||
| 87 | int r = rand() % (max+1); | 99 | int r = rand() % (max+1); |
| 88 | string next = probtable->begin()->first; | 100 | map<string, token_data*>::iterator next = probtable->begin(); |
| 89 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | 101 | for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it) |
| 90 | { | 102 | { |
| 91 | if (it->second > r) | 103 | if (it->second->all > r) |
| 92 | { | 104 | { |
| 93 | break; | 105 | break; |
| 94 | } else { | 106 | } else { |
| 95 | next = it->first; | 107 | next = it; |
| 96 | r -= it->second; | 108 | r -= it->second->all; |
| 97 | } | 109 | } |
| 98 | } | 110 | } |
| 99 | 111 | ||
| 112 | string nextToken(next->first); | ||
| 113 | int casing = rand() % next->second->all; | ||
| 114 | int period = rand() % next->second->all; | ||
| 115 | if (casing < next->second->uppercase) | ||
| 116 | { | ||
| 117 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
| 118 | } else if ((casing - next->second->uppercase) < next->second->titlecase) | ||
| 119 | { | ||
| 120 | nextToken[0] = toupper(nextToken[0]); | ||
| 121 | } | ||
| 122 | |||
| 123 | if (period < next->second->period) | ||
| 124 | { | ||
| 125 | nextToken += "."; | ||
| 126 | } | ||
| 127 | |||
| 128 | cout << next->first << " | " << nextToken << endl; | ||
| 129 | |||
| 100 | if (cur.size() == maxK) | 130 | if (cur.size() == maxK) |
| 101 | { | 131 | { |
| 102 | cur.pop_front(); | 132 | cur.pop_front(); |
| 103 | } | 133 | } |
| 104 | 134 | ||
| 105 | cur.push_back(next); | 135 | cur.push_back(next->first); |
| 106 | result.push_back(next); | 136 | result.push_back(nextToken); |
| 107 | } | 137 | } |
| 108 | 138 | ||
| 109 | return result; | 139 | return result; |
| 140 | } | ||
| 141 | |||
| 142 | std::string canonize(std::string f) | ||
| 143 | { | ||
| 144 | string canonical(f); | ||
| 145 | transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | ||
| 146 | if (canonical[canonical.length()-1] == '.') | ||
| 147 | { | ||
| 148 | canonical.resize(canonical.find('.')); | ||
| 149 | } | ||
| 150 | |||
| 151 | return canonical; | ||
| 110 | } \ No newline at end of file | 152 | } \ No newline at end of file |
| diff --git a/kgramstats.h b/kgramstats.h index 069bb90..248b193 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -14,15 +14,21 @@ class kgramstats | |||
| 14 | { | 14 | { |
| 15 | public: | 15 | public: |
| 16 | kgramstats(string corpus, int maxK); | 16 | kgramstats(string corpus, int maxK); |
| 17 | map<string, int>* lookupExts(kgram tk); | ||
| 18 | int getMaxK(); | ||
| 19 | vector<string> randomSentence(int n); | 17 | vector<string> randomSentence(int n); |
| 20 | 18 | ||
| 21 | private: | 19 | private: |
| 20 | typedef struct | ||
| 21 | { | ||
| 22 | int all; | ||
| 23 | int titlecase; | ||
| 24 | int uppercase; | ||
| 25 | int period; | ||
| 26 | } token_data; | ||
| 22 | int maxK; | 27 | int maxK; |
| 23 | map<kgram, map<string, int>* >* stats; | 28 | map<kgram, map<string, token_data*>* >* stats; |
| 24 | }; | 29 | }; |
| 25 | 30 | ||
| 26 | void printKgram(kgram k); | 31 | void printKgram(kgram k); |
| 32 | std::string canonize(std::string f); | ||
| 27 | 33 | ||
| 28 | #endif \ No newline at end of file | 34 | #endif \ No newline at end of file |
