about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--kgramstats.cpp70
-rw-r--r--kgramstats.h3
2 files changed, 39 insertions, 34 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index d196e8f..6c0e4ce 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -21,7 +21,7 @@ kgramstats::kgramstats(string corpus, int maxK)
21 start = ((end > (string::npos - 1) ) ? string::npos : end + 1); 21 start = ((end > (string::npos - 1) ) ? string::npos : end + 1);
22 } 22 }
23 23
24 stats = new map<kgram, map<string, token_data*>* >(); 24 map<kgram, map<string, token_data*>* > tstats;
25 for (int k=0; k<=maxK; k++) 25 for (int k=0; k<=maxK; k++)
26 { 26 {
27 for (int i=0; i<(tokens.size() - k); i++) 27 for (int i=0; i<(tokens.size() - k); i++)
@@ -31,17 +31,18 @@ kgramstats::kgramstats(string corpus, int maxK)
31 string f = tokens[i+k]; 31 string f = tokens[i+k];
32 string canonical = canonize(f); 32 string canonical = canonize(f);
33 33
34 if ((*stats)[seq] == NULL) 34 if (tstats[seq] == NULL)
35 { 35 {
36 (*stats)[seq] = new map<string, token_data*>(); 36 tstats[seq] = new map<string, token_data*>();
37 } 37 }
38 38
39 if ((*(*stats)[seq])[canonical] == NULL) 39 if ((*tstats[seq])[canonical] == NULL)
40 { 40 {
41 (*(*stats)[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data)); 41 (*tstats[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data));
42 } 42 }
43 43
44 token_data* td = stats->at(seq)->at(canonical); 44 token_data* td = tstats[seq]->at(canonical);
45 td->token = new string(canonical);
45 td->all++; 46 td->all++;
46 47
47 if ((f.length() > 0) && (f[f.length()-1] == '.')) 48 if ((f.length() > 0) && (f[f.length()-1] == '.'))
@@ -58,6 +59,24 @@ kgramstats::kgramstats(string corpus, int maxK)
58 } 59 }
59 } 60 }
60 } 61 }
62
63 stats = new map<kgram, map<int, token_data*>* >();
64 for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
65 {
66 kgram klist = it->first;
67 map<string, token_data*>* probtable = it->second;
68 map<int, token_data*>* distribution = new map<int, token_data*>();
69 int max = 0;
70
71 for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
72 {
73 max += kt->second->all;
74
75 (*distribution)[max] = kt->second;
76 }
77
78 (*stats)[klist] = distribution;
79 }
61} 80}
62 81
63void printKgram(kgram k) 82void printKgram(kgram k)
@@ -89,38 +108,23 @@ vector<string> kgramstats::randomSentence(int n)
89 } 108 }
90 } 109 }
91 110
92 map<string, token_data*>* probtable = (*stats)[cur]; 111 map<int, token_data*> distribution = *(*stats)[cur];
93 int max = 0; 112 int max = distribution.rbegin()->first;
94 for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it) 113 int r = rand() % max;
95 { 114 token_data* next = distribution.upper_bound(r)->second;
96 max += it->second->all;
97 }
98
99 int r = rand() % (max+1);
100 map<string, token_data*>::iterator next = probtable->begin();
101 for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it)
102 {
103 if (it->second->all > r)
104 {
105 break;
106 } else {
107 next = it;
108 r -= it->second->all;
109 }
110 }
111 115
112 string nextToken(next->first); 116 string nextToken(*(next->token));
113 int casing = rand() % next->second->all; 117 int casing = rand() % next->all;
114 int period = rand() % next->second->all; 118 int period = rand() % next->all;
115 if (casing < next->second->uppercase) 119 if (casing < next->uppercase)
116 { 120 {
117 transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 121 transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
118 } else if ((casing - next->second->uppercase) < next->second->titlecase) 122 } else if ((casing - next->uppercase) < next->titlecase)
119 { 123 {
120 nextToken[0] = toupper(nextToken[0]); 124 nextToken[0] = toupper(nextToken[0]);
121 } 125 }
122 126
123 if (period < next->second->period) 127 if (period < next->period)
124 { 128 {
125 nextToken += "."; 129 nextToken += ".";
126 } 130 }
@@ -136,9 +140,9 @@ vector<string> kgramstats::randomSentence(int n)
136 cout << *it << " "; 140 cout << *it << " ";
137 } 141 }
138 142
139 cout << "-> \"" << nextToken << "\" (" << next->second->all << "/" << max << ")" << endl; 143 cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;
140 144
141 cur.push_back(next->first); 145 cur.push_back(*(next->token));
142 result.push_back(nextToken); 146 result.push_back(nextToken);
143 } 147 }
144 148
diff --git a/kgramstats.h b/kgramstats.h index 248b193..b40e1ab 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -23,9 +23,10 @@ private:
23 int titlecase; 23 int titlecase;
24 int uppercase; 24 int uppercase;
25 int period; 25 int period;
26 string* token;
26 } token_data; 27 } token_data;
27 int maxK; 28 int maxK;
28 map<kgram, map<string, token_data*>* >* stats; 29 map<kgram, map<int, token_data*>* >* stats;
29}; 30};
30 31
31void printKgram(kgram k); 32void printKgram(kgram k);