diff options
author | Feffernoose <fefferburbia@gmail.com> | 2013-10-01 21:29:15 -0400 |
---|---|---|
committer | Feffernoose <fefferburbia@gmail.com> | 2013-10-01 21:29:15 -0400 |
commit | 420a7a1e004410f1377a6d919d72d18f8ae34bdf (patch) | |
tree | 33c0fc579e8f4e3d93757d886354309786941a13 | |
parent | 8de3134bf2cd26ff81359df703e5fbc6280448d7 (diff) | |
download | rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.gz rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.tar.bz2 rawr-ebooks-420a7a1e004410f1377a6d919d72d18f8ae34bdf.zip |
Weighted token casing and presence of periods
Tokens that differ only in casing or in the presence of a trailing period are now considered the same token. When tokens are generated, each one is cased according to how often it appeared in upper, title, or lower case in the input corpus; similarly, a period is appended to the token according to how often that token ended with a period in the input corpus.
-rw-r--r-- | kgramstats.cpp | 92 | ||||
-rw-r--r-- | kgramstats.h | 12 |
2 files changed, 76 insertions, 28 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index 142b5aa..708013f 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <vector> | 2 | #include <vector> |
3 | #include <iostream> | 3 | #include <iostream> |
4 | #include <cstdlib> | 4 | #include <cstdlib> |
5 | #include <algorithm> | ||
5 | 6 | ||
6 | kgramstats::kgramstats(string corpus, int maxK) | 7 | kgramstats::kgramstats(string corpus, int maxK) |
7 | { | 8 | { |
@@ -20,34 +21,45 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
20 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); | 21 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); |
21 | } | 22 | } |
22 | 23 | ||
23 | stats = new map<kgram, map<string, int>* >(); | 24 | stats = new map<kgram, map<string, token_data*>* >(); |
24 | for (int k=0; k<=maxK; k++) | 25 | for (int k=0; k<=maxK; k++) |
25 | { | 26 | { |
26 | for (int i=0; i<(tokens.size() - k); i++) | 27 | for (int i=0; i<(tokens.size() - k); i++) |
27 | { | 28 | { |
28 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 29 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); |
30 | transform(seq.begin(), seq.end(), seq.begin(), canonize); | ||
29 | string f = tokens[i+k]; | 31 | string f = tokens[i+k]; |
32 | string canonical = canonize(f); | ||
30 | 33 | ||
31 | if ((*stats)[seq] == NULL) | 34 | if ((*stats)[seq] == NULL) |
32 | { | 35 | { |
33 | (*stats)[seq] = new map<string, int>(); | 36 | (*stats)[seq] = new map<string, token_data*>(); |
34 | } | 37 | } |
35 | 38 | ||
36 | (*((*stats)[seq]))[f]++; | 39 | if ((*(*stats)[seq])[canonical] == NULL) |
40 | { | ||
41 | (*(*stats)[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data)); | ||
42 | } | ||
43 | |||
44 | token_data* td = stats->at(seq)->at(canonical); | ||
45 | td->all++; | ||
46 | |||
47 | if ((f.length() > 0) && (f[f.length()-1] == '.')) | ||
48 | { | ||
49 | td->period++; | ||
50 | } | ||
51 | |||
52 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
53 | { | ||
54 | td->uppercase++; | ||
55 | } else if (isupper(f[0])) | ||
56 | { | ||
57 | td->titlecase++; | ||
58 | } | ||
37 | } | 59 | } |
38 | } | 60 | } |
39 | } | 61 | } |
40 | 62 | ||
41 | map<string, int>* kgramstats::lookupExts(kgram tk) | ||
42 | { | ||
43 | return (*stats)[tk]; | ||
44 | } | ||
45 | |||
46 | int kgramstats::getMaxK() | ||
47 | { | ||
48 | return maxK; | ||
49 | } | ||
50 | |||
51 | void printKgram(kgram k) | 63 | void printKgram(kgram k) |
52 | { | 64 | { |
53 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 65 | for (kgram::iterator it = k.begin(); it != k.end(); it++) |
@@ -76,35 +88,65 @@ vector<string> kgramstats::randomSentence(int n) | |||
76 | } | 88 | } |
77 | } | 89 | } |
78 | } | 90 | } |
79 | 91 | ||
80 | map<string, int>* probtable = lookupExts(cur); | 92 | map<string, token_data*>* probtable = (*stats)[cur]; |
81 | int max = 0; | 93 | int max = 0; |
82 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | 94 | for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it) |
83 | { | 95 | { |
84 | max += it->second; | 96 | max += it->second->all; |
85 | } | 97 | } |
86 | 98 | ||
87 | int r = rand() % (max+1); | 99 | int r = rand() % (max+1); |
88 | string next = probtable->begin()->first; | 100 | map<string, token_data*>::iterator next = probtable->begin(); |
89 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | 101 | for (map<string, token_data*>::iterator it = probtable->begin(); it != probtable->end(); ++it) |
90 | { | 102 | { |
91 | if (it->second > r) | 103 | if (it->second->all > r) |
92 | { | 104 | { |
93 | break; | 105 | break; |
94 | } else { | 106 | } else { |
95 | next = it->first; | 107 | next = it; |
96 | r -= it->second; | 108 | r -= it->second->all; |
97 | } | 109 | } |
98 | } | 110 | } |
99 | 111 | ||
112 | string nextToken(next->first); | ||
113 | int casing = rand() % next->second->all; | ||
114 | int period = rand() % next->second->all; | ||
115 | if (casing < next->second->uppercase) | ||
116 | { | ||
117 | transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
118 | } else if ((casing - next->second->uppercase) < next->second->titlecase) | ||
119 | { | ||
120 | nextToken[0] = toupper(nextToken[0]); | ||
121 | } | ||
122 | |||
123 | if (period < next->second->period) | ||
124 | { | ||
125 | nextToken += "."; | ||
126 | } | ||
127 | |||
128 | cout << next->first << " | " << nextToken << endl; | ||
129 | |||
100 | if (cur.size() == maxK) | 130 | if (cur.size() == maxK) |
101 | { | 131 | { |
102 | cur.pop_front(); | 132 | cur.pop_front(); |
103 | } | 133 | } |
104 | 134 | ||
105 | cur.push_back(next); | 135 | cur.push_back(next->first); |
106 | result.push_back(next); | 136 | result.push_back(nextToken); |
107 | } | 137 | } |
108 | 138 | ||
109 | return result; | 139 | return result; |
140 | } | ||
141 | |||
142 | std::string canonize(std::string f) | ||
143 | { | ||
144 | string canonical(f); | ||
145 | transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | ||
146 | if (canonical[canonical.length()-1] == '.') | ||
147 | { | ||
148 | canonical.resize(canonical.find('.')); | ||
149 | } | ||
150 | |||
151 | return canonical; | ||
110 | } \ No newline at end of file | 152 | } \ No newline at end of file |
diff --git a/kgramstats.h b/kgramstats.h index 069bb90..248b193 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -14,15 +14,21 @@ class kgramstats | |||
14 | { | 14 | { |
15 | public: | 15 | public: |
16 | kgramstats(string corpus, int maxK); | 16 | kgramstats(string corpus, int maxK); |
17 | map<string, int>* lookupExts(kgram tk); | ||
18 | int getMaxK(); | ||
19 | vector<string> randomSentence(int n); | 17 | vector<string> randomSentence(int n); |
20 | 18 | ||
21 | private: | 19 | private: |
20 | typedef struct | ||
21 | { | ||
22 | int all; | ||
23 | int titlecase; | ||
24 | int uppercase; | ||
25 | int period; | ||
26 | } token_data; | ||
22 | int maxK; | 27 | int maxK; |
23 | map<kgram, map<string, int>* >* stats; | 28 | map<kgram, map<string, token_data*>* >* stats; |
24 | }; | 29 | }; |
25 | 30 | ||
26 | void printKgram(kgram k); | 31 | void printKgram(kgram k); |
32 | std::string canonize(std::string f); | ||
27 | 33 | ||
28 | #endif \ No newline at end of file | 34 | #endif \ No newline at end of file |