about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Feffernoose <fefferburbia@gmail.com> 2013-10-01 21:29:15 -0400
committer Feffernoose <fefferburbia@gmail.com> 2013-10-01 21:29:15 -0400
commit 420a7a1e004410f1377a6d919d72d18f8ae34bdf (patch)
tree 33c0fc579e8f4e3d93757d886354309786941a13
parent 8de3134bf2cd26ff81359df703e5fbc6280448d7 (diff)
download rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.gz
rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.bz2
rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.zip
Weighed token casing and presence of periods
Tokens that differ only by casing or by the presence of a trailing period are
now considered the same token. When tokens are generated, they are cased
based on the prevalence of upper/title/lower casing of the token in the
input corpus; similarly, a period is appended to the word based on how
often the same token ended with a period in the input corpus.
-rw-r--r-- kgramstats.cpp 92
-rw-r--r-- kgramstats.h 12
2 files changed, 76 insertions, 28 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 142b5aa..708013f 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -2,6 +2,7 @@
2#include <vector> 2#include <vector>
3#include <iostream> 3#include <iostream>
4#include <cstdlib> 4#include <cstdlib>
5#include <algorithm>
5 6
6kgramstats::kgramstats(string corpus, int maxK) 7kgramstats::kgramstats(string corpus, int maxK)
7{ 8{
@@ -20,34 +21,45 @@ kgramstats::kgramstats(string corpus, int maxK)
20 start = ((end > (string::npos - 1) ) ? string::npos : end + 1); 21 start = ((end > (string::npos - 1) ) ? string::npos : end + 1);
21 } 22 }
22 23
23 stats = new map<kgram, map<string, int>* >(); 24 stats = new map<kgram, map<string, token_data*>* >();
24 for (int k=0; k<=maxK; k++) 25 for (int k=0; k<=maxK; k++)
25 { 26 {
26 for (int i=0; i<(tokens.size() - k); i++) 27 for (int i=0; i<(tokens.size() - k); i++)
27 { 28 {
28 kgram seq(tokens.begin()+i, tokens.begin()+i+k); 29 kgram seq(tokens.begin()+i, tokens.begin()+i+k);
30 transform(seq.begin(), seq.end(), seq.begin(), canonize);
29 string f = tokens[i+k]; 31 string f = tokens[i+k];
32 string canonical = canonize(f);
30 33
31 if ((*stats)[seq] == NULL) 34 if ((*stats)[seq] == NULL)
32 { 35 {
33 (*stats)[seq] = new map<string, int>(); 36 (*stats)[seq] = new map<string, token_data*>();
34 } 37 }
35 38
36 (*((*stats)[seq]))[f]++; 39 if ((*(*stats)[seq])[canonical] == NULL)
40 {
41 (*(*stats)[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data));
42 }
43
44 token_data* td = stats->at(seq)->at(canonical);
45 td->all++;
46
47 if ((f.length() > 0) && (f[f.length()-1] == '.'))
48 {
49 td->period++;
50 }
51
52 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
53 {
54 td->uppercase++;
55 } else if (isupper(f[0]))
56 {
57 td->titlecase++;
58 }
37 } 59 }
38 } 60 }
39} 61}
40 62
41map<string, int>* kgramstats::lookupExts(kgram tk)
42{
43 return (*stats)[tk];
44}
45
46int kgramstats::getMaxK()
47{
48 return maxK;
49}
50
51void printKgram(kgram k) 63void printKgram(kgram k)
52{ 64{
53 for (kgram::iterator it = k.begin(); it != k.end(); it++) 65 for (kgram::iterator it = k.begin(); it != k.end(); it++)
@@ -76,35 +88,65 @@ vector<string> kgramstats::randomSentence(int n)
76 } 88 }
77 } 89 }
78 } 90 }
79 91
80 map<string, int>* probtable = lookupExts(cur); 92 map<string, token_data*>* probtable = (*stats)[cur];
81 int max = 0; 93 int max = 0;
82 for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) 94 for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it)
83 { 95 {
84 max += it->second; 96 max += it->second->all;
85 } 97 }
86 98
87 int r = rand() % (max+1); 99 int r = rand() % (max+1);
88 string next = probtable->begin()->first; 100 map<string, token_data*>::iterator next = probtable->begin();
89 for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) 101 for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it)
90 { 102 {
91 if (it->second > r) 103 if (it->second->all > r)
92 { 104 {
93 break; 105 break;
94 } else { 106 } else {
95 next = it->first; 107 next = it;
96 r -= it->second; 108 r -= it->second->all;
97 } 109 }
98 } 110 }
99 111
112 string nextToken(next->first);
113 int casing = rand() % next->second->all;
114 int period = rand() % next->second->all;
115 if (casing < next->second->uppercase)
116 {
117 transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
118 } else if ((casing - next->second->uppercase) < next->second->titlecase)
119 {
120 nextToken[0] = toupper(nextToken[0]);
121 }
122
123 if (period < next->second->period)
124 {
125 nextToken += ".";
126 }
127
128 cout << next->first << " | " << nextToken << endl;
129
100 if (cur.size() == maxK) 130 if (cur.size() == maxK)
101 { 131 {
102 cur.pop_front(); 132 cur.pop_front();
103 } 133 }
104 134
105 cur.push_back(next); 135 cur.push_back(next->first);
106 result.push_back(next); 136 result.push_back(nextToken);
107 } 137 }
108 138
109 return result; 139 return result;
140}
141
142std::string canonize(std::string f)
143{
144 string canonical(f);
145 transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
146 if (canonical[canonical.length()-1] == '.')
147 {
148 canonical.resize(canonical.find('.'));
149 }
150
151 return canonical;
110} \ No newline at end of file 152} \ No newline at end of file
diff --git a/kgramstats.h b/kgramstats.h
index 069bb90..248b193 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -14,15 +14,21 @@ class kgramstats
14{ 14{
15public: 15public:
16 kgramstats(string corpus, int maxK); 16 kgramstats(string corpus, int maxK);
17 map<string, int>* lookupExts(kgram tk);
18 int getMaxK();
19 vector<string> randomSentence(int n); 17 vector<string> randomSentence(int n);
20 18
21private: 19private:
20 typedef struct
21 {
22 int all;
23 int titlecase;
24 int uppercase;
25 int period;
26 } token_data;
22 int maxK; 27 int maxK;
23 map<kgram, map<string, int>* >* stats; 28 map<kgram, map<string, token_data*>* >* stats;
24}; 29};
25 30
26void printKgram(kgram k); 31void printKgram(kgram k);
32std::string canonize(std::string f);
27 33
28#endif \ No newline at end of file 34#endif \ No newline at end of file