about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--kgramstats.cpp92
-rw-r--r--kgramstats.h12
2 files changed, 76 insertions, 28 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 142b5aa..708013f 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -2,6 +2,7 @@
2#include <vector> 2#include <vector>
3#include <iostream> 3#include <iostream>
4#include <cstdlib> 4#include <cstdlib>
5#include <algorithm>
5 6
6kgramstats::kgramstats(string corpus, int maxK) 7kgramstats::kgramstats(string corpus, int maxK)
7{ 8{
@@ -20,34 +21,45 @@ kgramstats::kgramstats(string corpus, int maxK)
20 start = ((end > (string::npos - 1) ) ? string::npos : end + 1); 21 start = ((end > (string::npos - 1) ) ? string::npos : end + 1);
21 } 22 }
22 23
23 stats = new map<kgram, map<string, int>* >(); 24 stats = new map<kgram, map<string, token_data*>* >();
24 for (int k=0; k<=maxK; k++) 25 for (int k=0; k<=maxK; k++)
25 { 26 {
26 for (int i=0; i<(tokens.size() - k); i++) 27 for (int i=0; i<(tokens.size() - k); i++)
27 { 28 {
28 kgram seq(tokens.begin()+i, tokens.begin()+i+k); 29 kgram seq(tokens.begin()+i, tokens.begin()+i+k);
30 transform(seq.begin(), seq.end(), seq.begin(), canonize);
29 string f = tokens[i+k]; 31 string f = tokens[i+k];
32 string canonical = canonize(f);
30 33
31 if ((*stats)[seq] == NULL) 34 if ((*stats)[seq] == NULL)
32 { 35 {
33 (*stats)[seq] = new map<string, int>(); 36 (*stats)[seq] = new map<string, token_data*>();
34 } 37 }
35 38
36 (*((*stats)[seq]))[f]++; 39 if ((*(*stats)[seq])[canonical] == NULL)
40 {
41 (*(*stats)[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data));
42 }
43
44 token_data* td = stats->at(seq)->at(canonical);
45 td->all++;
46
47 if ((f.length() > 0) && (f[f.length()-1] == '.'))
48 {
49 td->period++;
50 }
51
52 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
53 {
54 td->uppercase++;
55 } else if (isupper(f[0]))
56 {
57 td->titlecase++;
58 }
37 } 59 }
38 } 60 }
39} 61}
40 62
41map<string, int>* kgramstats::lookupExts(kgram tk)
42{
43 return (*stats)[tk];
44}
45
46int kgramstats::getMaxK()
47{
48 return maxK;
49}
50
51void printKgram(kgram k) 63void printKgram(kgram k)
52{ 64{
53 for (kgram::iterator it = k.begin(); it != k.end(); it++) 65 for (kgram::iterator it = k.begin(); it != k.end(); it++)
@@ -76,35 +88,65 @@ vector<string> kgramstats::randomSentence(int n)
76 } 88 }
77 } 89 }
78 } 90 }
79 91
80 map<string, int>* probtable = lookupExts(cur); 92 map<string, token_data*>* probtable = (*stats)[cur];
81 int max = 0; 93 int max = 0;
82 for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) 94 for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it)
83 { 95 {
84 max += it->second; 96 max += it->second->all;
85 } 97 }
86 98
87 int r = rand() % (max+1); 99 int r = rand() % (max+1);
88 string next = probtable->begin()->first; 100 map<string, token_data*>::iterator next = probtable->begin();
89 for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) 101 for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it)
90 { 102 {
91 if (it->second > r) 103 if (it->second->all > r)
92 { 104 {
93 break; 105 break;
94 } else { 106 } else {
95 next = it->first; 107 next = it;
96 r -= it->second; 108 r -= it->second->all;
97 } 109 }
98 } 110 }
99 111
112 string nextToken(next->first);
113 int casing = rand() % next->second->all;
114 int period = rand() % next->second->all;
115 if (casing < next->second->uppercase)
116 {
117 transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
118 } else if ((casing - next->second->uppercase) < next->second->titlecase)
119 {
120 nextToken[0] = toupper(nextToken[0]);
121 }
122
123 if (period < next->second->period)
124 {
125 nextToken += ".";
126 }
127
128 cout << next->first << " | " << nextToken << endl;
129
100 if (cur.size() == maxK) 130 if (cur.size() == maxK)
101 { 131 {
102 cur.pop_front(); 132 cur.pop_front();
103 } 133 }
104 134
105 cur.push_back(next); 135 cur.push_back(next->first);
106 result.push_back(next); 136 result.push_back(nextToken);
107 } 137 }
108 138
109 return result; 139 return result;
140}
141
142std::string canonize(std::string f)
143{
144 string canonical(f);
145 transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
146 if (canonical[canonical.length()-1] == '.')
147 {
148 canonical.resize(canonical.find('.'));
149 }
150
151 return canonical;
110} \ No newline at end of file 152} \ No newline at end of file
diff --git a/kgramstats.h b/kgramstats.h index 069bb90..248b193 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -14,15 +14,21 @@ class kgramstats
14{ 14{
15public: 15public:
16 kgramstats(string corpus, int maxK); 16 kgramstats(string corpus, int maxK);
17 map<string, int>* lookupExts(kgram tk);
18 int getMaxK();
19 vector<string> randomSentence(int n); 17 vector<string> randomSentence(int n);
20 18
21private: 19private:
20 typedef struct
21 {
22 int all;
23 int titlecase;
24 int uppercase;
25 int period;
26 } token_data;
22 int maxK; 27 int maxK;
23 map<kgram, map<string, int>* >* stats; 28 map<kgram, map<string, token_data*>* >* stats;
24}; 29};
25 30
26void printKgram(kgram k); 31void printKgram(kgram k);
32std::string canonize(std::string f);
27 33
28#endif \ No newline at end of file 34#endif \ No newline at end of file