about summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorKelly Rauchenberger <fefferburbia@gmail.com>2015-07-19 22:19:49 -0400
committerKelly Rauchenberger <fefferburbia@gmail.com>2015-07-19 22:19:49 -0400
commite080a02518103538897be3f09a342712bd42c546 (patch)
tree97fc06df67d71a4cb3cb8822a76eaf257a25f9db
parentb71552ea3f8237012fe5677385df1c3534405121 (diff)
downloadrawr-ebooks-e080a02518103538897be3f09a342712bd42c546.tar.gz
rawr-ebooks-e080a02518103538897be3f09a342712bd42c546.tar.bz2
rawr-ebooks-e080a02518103538897be3f09a342712bd42c546.zip
Kerjiggered the algorithms
-rw-r--r--Makefile.am3
-rw-r--r--kgramstats.cpp187
-rw-r--r--kgramstats.h5
3 files changed, 173 insertions, 22 deletions
diff --git a/Makefile.am b/Makefile.am index 299dc10..150ede2 100644 --- a/Makefile.am +++ b/Makefile.am
@@ -4,5 +4,6 @@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
4bin_PROGRAMS = rawr-ebooks rawr-gen 4bin_PROGRAMS = rawr-ebooks rawr-gen
5rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp 5rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp
6rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp 6rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp
7rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) 7rawr_gen_CPPFLAGS = -std=c++11
8rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) -std=c++11
8rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file 9rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file
diff --git a/kgramstats.cpp b/kgramstats.cpp index 2321b11..1f3dd3c 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -28,6 +28,8 @@ kgramstats::kgramstats(string corpus, int maxK)
28 } 28 }
29 29
30 map<kgram, map<string, token_data*>* > tstats; 30 map<kgram, map<string, token_data*>* > tstats;
31 bool newSentence = true;
32 bool newClause = false;
31 for (int k=0; k<=maxK; k++) 33 for (int k=0; k<=maxK; k++)
32 { 34 {
33 for (int i=0; i<(tokens.size() - k); i++) 35 for (int i=0; i<(tokens.size() - k); i++)
@@ -50,11 +52,83 @@ kgramstats::kgramstats(string corpus, int maxK)
50 token_data* td = tstats[seq]->at(canonical); 52 token_data* td = tstats[seq]->at(canonical);
51 td->token = new string(canonical); 53 td->token = new string(canonical);
52 td->all++; 54 td->all++;
55
56 if (newSentence)
57 {
58 kgram newKgram(1, ".");
59 if (tstats[newKgram] == NULL)
60 {
61 tstats[newKgram] = new map<string, token_data*>();
62 }
63
64 (*tstats[newKgram])[canonical] = td;
65
66 newSentence = false;
67 }
68
69 if (newClause)
70 {
71 kgram commaKgram(1, ",");
72 if (tstats[commaKgram] == NULL)
73 {
74 tstats[commaKgram] = new map<string, token_data*>();
75 }
76
77 (*tstats[commaKgram])[canonical] = td;
78
79 newClause = false;
80 }
53 81
54 if ((f.length() > 0) && (f[f.length()-1] == '.')) 82 if ((f.length() > 0) && (f[f.length()-1] == '.'))
55 { 83 {
56 td->period++; 84 td->period++;
85 newSentence = true;
57 } 86 }
87
88 if (f.length() > 0)
89 {
90 if (f[0] == '"')
91 {
92 td->startquote++;
93 }
94
95 if (f[f.length()-1] == '"')
96 {
97 td->endquote++;
98
99 if ((f.length() > 1) && (f[f.length()-2] == ','))
100 {
101 td->comma++;
102 newClause = true;
103 }
104 }
105
106 if (f[f.length()-1] == ',')
107 {
108 td->comma++;
109 newClause = true;
110
111 if ((f.length() > 1) && (f[f.length()-2] == '"'))
112 {
113 td->endquote++;
114 }
115
116 if ((f.length() > 1) && (f[f.length()-2] == ')'))
117 {
118 td->endparen++;
119 }
120 }
121
122 if (f[0] == '(')
123 {
124 td->startparen++;
125 }
126
127 if (f[f.length()-1] == ')')
128 {
129 td->endparen++;
130 }
131 }
58 132
59 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 133 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
60 { 134 {
@@ -97,22 +171,31 @@ void printKgram(kgram k)
97vector<string> kgramstats::randomSentence(int n) 171vector<string> kgramstats::randomSentence(int n)
98{ 172{
99 vector<string> result; 173 vector<string> result;
100 list<string> cur; 174 kgram newKgram(1, ".");
175 kgram commaKgram(1, ",");
176 list<string> cur = newKgram;
177 int cuts = 0;
101 178
102 for (int i=0; i<n; i++) 179 for (int i=0; i<n; i++)
103 { 180 {
104 if ((rand() % (maxK - cur.size() + 1)) == 0) 181 if ((cur.size() > 0) && (cur != newKgram))
105 { 182 {
106 for (int i=0; i<cur.size(); i++) 183 if (rand() % (maxK - cur.size() + 1) == 0)
107 { 184 {
108 if ((rand() % 3) == 0) 185 while (cur.size() > 0)
109 { 186 {
110 cur.pop_front(); 187 if ((rand() % (n)) < cuts)
111 } else { 188 {
112 break; 189 cur.pop_front();
113 } 190 cuts--;
114 } 191 } else {
115 } 192 break;
193 }
194 }
195 }
196
197 cuts++;
198 }
116 199
117 map<int, token_data*> distribution = *(*stats)[cur]; 200 map<int, token_data*> distribution = *(*stats)[cur];
118 int max = distribution.rbegin()->first; 201 int max = distribution.rbegin()->first;
@@ -122,6 +205,11 @@ vector<string> kgramstats::randomSentence(int n)
122 string nextToken(*(next->token)); 205 string nextToken(*(next->token));
123 int casing = rand() % next->all; 206 int casing = rand() % next->all;
124 int period = rand() % next->all; 207 int period = rand() % next->all;
208 int startparen = rand() % next->all;
209 int endparen = rand() % next->all;
210 int startquote = rand() % next->all;
211 int endquote = rand() % next->all;
212 int comma = rand() % next->all;
125 if (casing < next->uppercase) 213 if (casing < next->uppercase)
126 { 214 {
127 transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); 215 transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
@@ -129,11 +217,53 @@ vector<string> kgramstats::randomSentence(int n)
129 { 217 {
130 nextToken[0] = toupper(nextToken[0]); 218 nextToken[0] = toupper(nextToken[0]);
131 } 219 }
220
221 if ((cur == newKgram) && (rand() % 3 < 2))
222 {
223 nextToken[0] = toupper(nextToken[0]);
224 }
225
226 if (startquote < next->startquote)
227 {
228 nextToken = "\"" + nextToken;
229 } else if (startparen < next->startparen)
230 {
231 nextToken = "(" + nextToken;
232 }
132 233
133 if (period < next->period) 234 if (period < next->period)
134 { 235 {
135 nextToken += "."; 236 if (endquote < next->endquote)
136 } 237 {
238 nextToken += "\"";
239 } else if (endparen < next->endparen)
240 {
241 nextToken += ")";
242 }
243
244 int type = rand() % 6;
245
246 if (type < 3)
247 {
248 nextToken += ".";
249 } else if (type < 5)
250 {
251 nextToken += "!";
252 } else {
253 nextToken += "?";
254 }
255 } else if (comma < next->comma)
256 {
257 if (endquote < next->endquote)
258 {
259 nextToken += "\"";
260 } else if (endparen < next->endparen)
261 {
262 nextToken += ")";
263 }
264
265 nextToken += ",";
266 }
137 267
138 if (cur.size() == maxK) 268 if (cur.size() == maxK)
139 { 269 {
@@ -147,8 +277,22 @@ vector<string> kgramstats::randomSentence(int n)
147 } 277 }
148 278
149 cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl; 279 cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;
280
281 if ((cur == newKgram) || (cur == commaKgram))
282 {
283 cur.pop_front();
284 }
285
286 if ((period < next->period) && ((rand() % 2) == 0))
287 {
288 cur = newKgram;
289 } else if ((comma < next->comma) && ((rand() % 3) == 0))
290 {
291 cur = commaKgram;
292 } else {
293 cur.push_back(*(next->token));
294 }
150 295
151 cur.push_back(*(next->token));
152 result.push_back(nextToken); 296 result.push_back(nextToken);
153 } 297 }
154 298
@@ -159,10 +303,11 @@ std::string canonize(std::string f)
159{ 303{
160 string canonical(f); 304 string canonical(f);
161 transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); 305 transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
162 if (canonical[canonical.length()-1] == '.') 306
163 { 307 string result;
164 canonical.resize(canonical.find('.')); 308 remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), [] (char c) {
165 } 309 return !((c != '.') && (c != '"') && (c != '(') && (c != ')') && (c != ','));
310 });
166 311
167 return canonical; 312 return result;
168} \ No newline at end of file 313} \ No newline at end of file
diff --git a/kgramstats.h b/kgramstats.h index b40e1ab..059eb05 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -23,6 +23,11 @@ private:
23 int titlecase; 23 int titlecase;
24 int uppercase; 24 int uppercase;
25 int period; 25 int period;
26 int startquote;
27 int endquote;
28 int startparen;
29 int endparen;
30 int comma;
26 string* token; 31 string* token;
27 } token_data; 32 } token_data;
28 int maxK; 33 int maxK;