diff options
-rw-r--r-- | kgramstats.cpp | 92 | ||||
-rw-r--r-- | kgramstats.h | 12 |
2 files changed, 76 insertions, 28 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 142b5aa..708013f 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <vector> | 2 | #include <vector> |
3 | #include <iostream> | 3 | #include <iostream> |
4 | #include <cstdlib> | 4 | #include <cstdlib> |
5 | #include <algorithm> | ||
5 | 6 | ||
6 | kgramstats::kgramstats(string corpus, int maxK) | 7 | kgramstats::kgramstats(string corpus, int maxK) |
7 | { | 8 | { |
@@ -20,34 +21,45 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
20 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); | 21 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); |
21 | } | 22 | } |
22 | 23 | ||
23 | stats = new map<kgram, map<string, int>* >(); | 24 | stats = new map<kgram, map<string, token_data*>* >(); |
24 | for (int k=0; k<=maxK; k++) | 25 | for (int k=0; k<=maxK; k++) |
25 | { | 26 | { |
26 | for (int i=0; i<(tokens.size() - k); i++) | 27 | for (int i=0; i<(tokens.size() - k); i++) |
27 | { | 28 | { |
28 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 29 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); |
30 | transform(seq.begin(), seq.end(), seq.begin(), canonize); | ||
29 | string f = tokens[i+k]; | 31 | string f = tokens[i+k]; |
32 | string canonical = canonize(f); | ||
30 | 33 | ||
31 | if ((*stats)[seq] == NULL) | 34 | if ((*stats)[seq] == NULL) |
32 | { | 35 | { |
33 | (*stats)[seq] = new map<string, int>(); | 36 | (*stats)[seq] = new map<string, token_data*>(); |
34 | } | 37 | } |
35 | 38 | ||
36 | (*((*stats)[seq]))[f]++; | 39 | if ((*(*stats)[seq])[canonical] == NULL) |
40 | { | ||
41 | (*(*stats)[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data)); | ||
42 | } | ||
43 | |||
44 | token_data* td = stats->at(seq)->at(canonical); | ||
45 | td->all++; | ||
46 | |||
47 | if ((f.length() > 0) && (f[f.length()-1] == '.')) | ||
48 | { | ||
49 | td->period++; | ||
50 | } | ||
51 | |||
52 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
53 | { | ||
54 | td->uppercase++; | ||
55 | } else if (isupper(f[0])) | ||
56 | { | ||
57 | td->titlecase++; | ||
58 | } | ||
37 | } | 59 | } |
38 | } | 60 | } |
39 | } | 61 | } |
40 | 62 | ||
41 | map<string, int>* kgramstats::lookupExts(kgram tk) | ||
42 | { | ||
43 | return (*stats)[tk]; | ||
44 | } | ||
45 | |||
46 | int kgramstats::getMaxK() | ||
47 | { | ||
48 | return maxK; | ||
49 | } | ||
50 | |||
51 | void printKgram(kgram k) | 63 | void printKgram(kgram k) |
52 | { | 64 | { |
53 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 65 | for (kgram::iterator it = k.begin(); it != k.end(); it++) |
@@ -76,35 +88,65 @@ vector<string> kgramstats::randomSentence(int n) | |||
76 | } | 88 | } |
77 | } | 89 | } |
78 | } | 90 | } |
79 | 91 | ||
80 | map<string, int>* probtable = lookupExts(cur); | 92 | map<string, token_data*>* probtable = (*stats)[cur]; |
81 | int max = 0; | 93 | int max = 0; |
82 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | 94 | for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it) |
83 | { | 95 | { |
84 | max += it->second; | 96 | max += it->second->all; |
85 | } | 97 | } |
86 | 98 | ||
87 | int r = rand() % (max+1); | 99 | int r = rand() % (max+1); |
88 | string next = probtable->begin()->first; | 100 | map<string, token_data*>::iterator next = probtable->begin(); |
89 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | 101 | for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it) |
90 | { | 102 | { |
91 | if (it->second > r) | 103 | if (it->second->all > r) |
92 | { | 104 | { |
93 | break; | 105 | break; |
94 | } else { | 106 | } else { |
95 | next = it->first; | 107 | next = it; |
96 | r -= it->second; | 108 | r -= it->second->all; |
97 | } | 109 | } |
98 | } | 110 | } |
99 | 111 | ||
112 | string nextToken(next->first); | ||
113 | int casing = rand() % next->second->all; | ||
114 | int period = rand() % next->second->all; | ||
115 | if (casing < next->second->uppercase) | ||
116 | { | ||
117 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
118 | } else if ((casing - next->second->uppercase) < next->second->titlecase) | ||
119 | { | ||
120 | nextToken[0] = toupper(nextToken[0]); | ||
121 | } | ||
122 | |||
123 | if (period < next->second->period) | ||
124 | { | ||
125 | nextToken += "."; | ||
126 | } | ||
127 | |||
128 | cout << next->first << " | " << nextToken << endl; | ||
129 | |||
100 | if (cur.size() == maxK) | 130 | if (cur.size() == maxK) |
101 | { | 131 | { |
102 | cur.pop_front(); | 132 | cur.pop_front(); |
103 | } | 133 | } |
104 | 134 | ||
105 | cur.push_back(next); | 135 | cur.push_back(next->first); |
106 | result.push_back(next); | 136 | result.push_back(nextToken); |
107 | } | 137 | } |
108 | 138 | ||
109 | return result; | 139 | return result; |
140 | } | ||
141 | |||
// Reduces a raw token to the canonical form used as a key in the k-gram
// statistics: every character is lowercased and a single trailing period, if
// present, is removed.
//
// Only the final period is stripped. Periods inside the token are part of the
// word itself ("e.g." -> "e.g"), so distinct abbreviations do not collapse
// onto the same key and re-appending "." reproduces the original ending.
//
// @param f  the token as it appeared in the corpus (may be empty)
// @return   the lowercased token without its trailing period
std::string canonize(std::string f)
{
  // f is already a by-value copy, so it can be normalized in place.
  for (std::string::size_type i = 0; i < f.size(); ++i)
  {
    // tolower is only defined for values representable as unsigned char (or
    // EOF); go through unsigned char so non-ASCII bytes are handled safely.
    f[i] = static_cast<char>(::tolower(static_cast<unsigned char>(f[i])));
  }

  // Guard against the empty token before inspecting the last character.
  if (!f.empty() && f[f.size() - 1] == '.')
  {
    f.resize(f.size() - 1);
  }

  return f;
}
diff --git a/kgramstats.h b/kgramstats.h index 069bb90..248b193 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -14,15 +14,21 @@ class kgramstats | |||
14 | { | 14 | { |
15 | public: | 15 | public: |
16 | kgramstats(string corpus, int maxK); | 16 | kgramstats(string corpus, int maxK); |
17 | map<string, int>* lookupExts(kgram tk); | ||
18 | int getMaxK(); | ||
19 | vector<string> randomSentence(int n); | 17 | vector<string> randomSentence(int n); |
20 | 18 | ||
21 | private: | 19 | private: |
20 | typedef struct | ||
21 | { | ||
22 | int all; | ||
23 | int titlecase; | ||
24 | int uppercase; | ||
25 | int period; | ||
26 | } token_data; | ||
22 | int maxK; | 27 | int maxK; |
23 | map<kgram, map<string, int>* >* stats; | 28 | map<kgram, map<string, token_data*>* >* stats; |
24 | }; | 29 | }; |
25 | 30 | ||
26 | void printKgram(kgram k); | 31 | void printKgram(kgram k); |
32 | std::string canonize(std::string f); | ||
27 | 33 | ||
28 | #endif \ No newline at end of file | 34 | #endif \ No newline at end of file |