1 files changed, 93 insertions, 68 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index b4e68eb..17598de 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -3,31 +3,35 @@
 #include <iostream>
 #include <cstdlib>
 #include <algorithm>
+#include "malaprop.h"
+std::string canonize(std::string f);
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
-kgramstats::kgramstats(string corpus, int maxK)
+kgramstats::kgramstats(std::string corpus, int maxK)
 {
        this->maxK = maxK;
-        
+  
-        vector<string> tokens;
+  std::vector<std::string> tokens;
-    int start = 0;
+    size_t start = 0;
        int end = 0;
-        while (end != string::npos)
+        while (end != std::string::npos)
        {
           end = corpus.find(" ", start);
-       string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start);
+       std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
       if (token.compare(""))
       {
+         mstats.addWord(token);
           tokens.push_back(token);
       }
-           start = ((end > (string::npos - 1) ) ? string::npos : end + 1);
+           start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
        }
        
-        map<kgram, map<string, token_data*>* > tstats;
+        std::map<kgram, std::map<std::string, token_data*>* > tstats;
  bool newSentence = true;
  bool newClause = false;
        for (int k=0; k<=maxK; k++)
@@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK)
                for (int i=0; i<(tokens.size() - k); i++)
                {
                        kgram seq(tokens.begin()+i, tokens.begin()+i+k);
-                        transform(seq.begin(), seq.end(), seq.begin(), canonize);
+                        std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
-                        string f = tokens[i+k];
+                        std::string f = tokens[i+k];
-                        string canonical = canonize(f);
+                        std::string canonical = canonize(f);
                        
                        if (tstats[seq] == NULL)
                        {
-                                tstats[seq] = new map<string, token_data*>();
+                                tstats[seq] = new std::map<std::string, token_data*>();
                        }
                        
                        if ((*tstats[seq])[canonical] == NULL)
@@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK)
                        }
                        token_data* td = tstats[seq]->at(canonical);
-                        td->token = new string(canonical);
+                        td->token = new std::string(canonical);
                        td->all++;
      
      if (newSentence)
@@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK)
        kgram newKgram(1, ".");
        if (tstats[newKgram] == NULL)
        {
-          tstats[newKgram] = new map<string, token_data*>();
+          tstats[newKgram] = new std::map<std::string, token_data*>();
        }
        
        (*tstats[newKgram])[canonical] = td;
@@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK)
        kgram commaKgram(1, ",");
        if (tstats[commaKgram] == NULL)
        {
-          tstats[commaKgram] = new map<string, token_data*>();
+          tstats[commaKgram] = new std::map<std::string, token_data*>();
        }
        
        (*tstats[commaKgram])[canonical] = td;
@@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK)
                }
        }
        
-        stats = new map<kgram, map<int, token_data*>* >();
+        stats = new std::map<kgram, std::map<int, token_data*>* >();
-        for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
+        for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
        {
                kgram klist = it->first;
-                map<string, token_data*>* probtable = it->second;
+                std::map<std::string, token_data*>* probtable = it->second;
-                map<int, token_data*>* distribution = new map<int, token_data*>();
+                std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
        int max = 0;
                
-                for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
+                for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
                {
                        max += kt->second->all;
                        
@@ -187,17 +191,17 @@ void printKgram(kgram k)
 {
        for (kgram::iterator it = k.begin(); it != k.end(); it++)
        {
-                cout << *it << " ";
+                std::cout << *it << " ";
        }
 }
 // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
-vector<string> kgramstats::randomSentence(int n)
+std::vector<std::string> kgramstats::randomSentence(int n)
 {
-        vector<string> result;
+        std::vector<std::string> result;
  kgram newKgram(1, ".");
  kgram commaKgram(1, ",");
-        list<string> cur = newKgram;
+        std::list<std::string> cur = newKgram;
  int cuts = 0;
        
        for (int i=0; i<n; i++)
@@ -221,12 +225,12 @@ vector<string> kgramstats::randomSentence(int n)
      cuts++;
    }
-                map<int, token_data*> distribution = *(*stats)[cur];
+                std::map<int, token_data*> distribution = *(*stats)[cur];
                int max = distribution.rbegin()->first;
                int r = rand() % max;
                token_data* next = distribution.upper_bound(r)->second;
-                string nextToken(*(next->token));
+                std::string nextToken(*(next->token));
                int casing = rand() % next->all;
                int period = rand() % next->all;
    int startparen = rand() % next->all;
@@ -236,7 +240,7 @@ vector<string> kgramstats::randomSentence(int n)
    int comma = rand() % next->all;
                if (casing < next->uppercase)
                {
-                        transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+                        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
                } else if ((casing - next->uppercase) < next->titlecase)
                {
                        nextToken[0] = toupper(nextToken[0]);
@@ -246,49 +250,55 @@ vector<string> kgramstats::randomSentence(int n)
    {
      nextToken[0] = toupper(nextToken[0]);
    }
-    /*
+    
-    if (startquote < next->startquote)
+    bool mess = (rand() % 100) == 0;
-    {
+    if (mess)
-      nextToken = "\"" + nextToken;
-    } else if (startparen < next->startparen)
    {
-      nextToken = "(" + nextToken;
+      nextToken = mstats.alternate(nextToken);
-    }
-                
+      if (startquote < next->startquote)
-                if (period < next->period)
-                {
-      if (endquote < next->endquote)
      {
-        nextToken += "\"";
+        nextToken = "\"" + nextToken;
-      } else if (endparen < next->endparen)
+      } else if (startparen < next->startparen)
      {
-        nextToken += ")";
+        nextToken = "(" + nextToken;
      }
+                
+                if (period < next->period)
+                {
+        if (endquote < next->endquote)
+        {
+          nextToken += "\"";
+        } else if (endparen < next->endparen)
+        {
+          nextToken += ")";
+        }
      
-      int type = rand() % 6;
+        int type = rand() % 6;
      
-      if (type < 3)
+        if (type < 3)
-      {
+        {
-        nextToken += ".";
+          nextToken += ".";
-      } else if (type < 5)
+        } else if (type < 5)
-      {
+        {
-        nextToken += "!";
+          nextToken += "!";
-      } else {
+        } else {
-        nextToken += "?";
+          nextToken += "?";
-      }
+        }
-                } else if (comma < next->comma)
+                } else if (comma < next->comma)
-    {
-      if (endquote < next->endquote)
-      {
-        nextToken += "\"";
-      } else if (endparen < next->endparen)
      {
-        nextToken += ")";
+        if (endquote < next->endquote)
-      }
+        {
+          nextToken += "\"";
+        } else if (endparen < next->endparen)
+        {
+          nextToken += ")";
+        }
      
-      nextToken += ",";
+        nextToken += ",";
+      }
    }
-*/
+    
                if (cur.size() == maxK)
                {
                        cur.pop_front();
@@ -297,10 +307,17 @@ vector<string> kgramstats::randomSentence(int n)
                /* DEBUG */
                for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
                {
-                        cout << *it << " ";
+                        std::cout << *it << " ";
                }
                
-                cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;
+                std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
+    
+    if (mess)
+    {
+      std::cout << " mala " << *(next->token);
+    }
+    
+    std::cout << std::endl;
    
    if ((cur == newKgram) || (cur == commaKgram))
    {
@@ -314,7 +331,15 @@ vector<string> kgramstats::randomSentence(int n)
    {
      cur = commaKgram;
    } else {
-      cur.push_back(*(next->token));
+      //if (mess && (rand() % 2 == 0))
+      if (false)
+      {
+        // This doesn't work because sometimes the alternate token isn't actually present in the original corpus
+        cur.clear();
+        cur.push_back(nextToken);
+      } else {
+        cur.push_back(*(next->token));
+      }
    }
                
                result.push_back(nextToken);
@@ -330,11 +355,11 @@ bool removeIf(char c)
 // Produces the canonical (lookup-key) form of a token: a copy of `f` in which
 // every character has been lower-cased. Nothing else about the token is
 // changed, so any punctuation attached to the word stays in the result.
 //
 // NOTE(review): the previous body also built a second string by filtering the
 // lower-cased text through removeIf(), but then returned the unfiltered copy,
 // so that work never reached the caller. The dead computation is dropped here
 // and the return value is unchanged; if the filtered string was the intended
 // result, switching to it is a behaviour change to make deliberately.
 std::string canonize(std::string f)
 {
   // tolower() is only defined for EOF and values representable as unsigned
   // char, so each char is converted to unsigned char before the call. This
   // keeps bytes >= 0x80 (e.g. UTF-8 continuation bytes) from being passed as
   // negative ints on platforms where plain char is signed.
   std::transform(f.begin(), f.end(), f.begin(),
                  [](char c) { return static_cast<char>(::tolower(static_cast<unsigned char>(c))); });

   // `f` was taken by value, so it is already a private copy to return.
   return f;
 }

diff --git a/kgramstats.cpp b/kgramstats.cpp index b4e68eb..17598de 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -3,31 +3,35 @@
3	#include <iostream>	3	#include <iostream>
4	#include <cstdlib>	4	#include <cstdlib>
5	#include <algorithm>	5	#include <algorithm>
		6	#include "malaprop.h"
		7
		8	std::string canonize(std::string f);
6		9
7	// runs in O(t^2) time where t is the number of tokens in the input corpus	10	// runs in O(t^2) time where t is the number of tokens in the input corpus
8	// We consider maxK to be fairly constant	11	// We consider maxK to be fairly constant
9	kgramstats::kgramstats(string corpus, int maxK)	12	kgramstats::kgramstats(std::string corpus, int maxK)
10	{	13	{
11	this->maxK = maxK;	14	this->maxK = maxK;
12		15
13	vector<string> tokens;	16	std::vector<std::string> tokens;
14	int start = 0;	17	size_t start = 0;
15	int end = 0;	18	int end = 0;
16		19
17	while (end != string::npos)	20	while (end != std::string::npos)
18	{	21	{
19	end = corpus.find(" ", start);	22	end = corpus.find(" ", start);
20		23
21	string token = corpus.substr(start, (end == string::npos) ? string::npos : end - start);	24	std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
22	if (token.compare(""))	25	if (token.compare(""))
23	{	26	{
		27	mstats.addWord(token);
24	tokens.push_back(token);	28	tokens.push_back(token);
25	}	29	}
26		30
27	start = ((end > (string::npos - 1) ) ? string::npos : end + 1);	31	start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
28	}	32	}
29		33
30	map<kgram, map<string, token_data*>* > tstats;	34	std::map<kgram, std::map<std::string, token_data*>* > tstats;
31	bool newSentence = true;	35	bool newSentence = true;
32	bool newClause = false;	36	bool newClause = false;
33	for (int k=0; k<=maxK; k++)	37	for (int k=0; k<=maxK; k++)
@@ -35,13 +39,13 @@ kgramstats::kgramstats(string corpus, int maxK)
35	for (int i=0; i<(tokens.size() - k); i++)	39	for (int i=0; i<(tokens.size() - k); i++)
36	{	40	{
37	kgram seq(tokens.begin()+i, tokens.begin()+i+k);	41	kgram seq(tokens.begin()+i, tokens.begin()+i+k);
38	transform(seq.begin(), seq.end(), seq.begin(), canonize);	42	std::transform(seq.begin(), seq.end(), seq.begin(), canonize);
39	string f = tokens[i+k];	43	std::string f = tokens[i+k];
40	string canonical = canonize(f);	44	std::string canonical = canonize(f);
41		45
42	if (tstats[seq] == NULL)	46	if (tstats[seq] == NULL)
43	{	47	{
44	tstats[seq] = new map<string, token_data*>();	48	tstats[seq] = new std::map<std::string, token_data*>();
45	}	49	}
46		50
47	if ((*tstats[seq])[canonical] == NULL)	51	if ((*tstats[seq])[canonical] == NULL)
@@ -50,7 +54,7 @@ kgramstats::kgramstats(string corpus, int maxK)
50	}	54	}
51		55
52	token_data* td = tstats[seq]->at(canonical);	56	token_data* td = tstats[seq]->at(canonical);
53	td->token = new string(canonical);	57	td->token = new std::string(canonical);
54	td->all++;	58	td->all++;
55		59
56	if (newSentence)	60	if (newSentence)
@@ -58,7 +62,7 @@ kgramstats::kgramstats(string corpus, int maxK)
58	kgram newKgram(1, ".");	62	kgram newKgram(1, ".");
59	if (tstats[newKgram] == NULL)	63	if (tstats[newKgram] == NULL)
60	{	64	{
61	tstats[newKgram] = new map<string, token_data*>();	65	tstats[newKgram] = new std::map<std::string, token_data*>();
62	}	66	}
63		67
64	(*tstats[newKgram])[canonical] = td;	68	(*tstats[newKgram])[canonical] = td;
@@ -71,7 +75,7 @@ kgramstats::kgramstats(string corpus, int maxK)
71	kgram commaKgram(1, ",");	75	kgram commaKgram(1, ",");
72	if (tstats[commaKgram] == NULL)	76	if (tstats[commaKgram] == NULL)
73	{	77	{
74	tstats[commaKgram] = new map<string, token_data*>();	78	tstats[commaKgram] = new std::map<std::string, token_data*>();
75	}	79	}
76		80
77	(*tstats[commaKgram])[canonical] = td;	81	(*tstats[commaKgram])[canonical] = td;
@@ -164,15 +168,15 @@ kgramstats::kgramstats(string corpus, int maxK)
164	}	168	}
165	}	169	}
166		170
167	stats = new map<kgram, map<int, token_data*>* >();	171	stats = new std::map<kgram, std::map<int, token_data*>* >();
168	for (map<kgram, map<string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)	172	for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++)
169	{	173	{
170	kgram klist = it->first;	174	kgram klist = it->first;
171	map<string, token_data*>* probtable = it->second;	175	std::map<std::string, token_data*>* probtable = it->second;
172	map<int, token_data*>* distribution = new map<int, token_data*>();	176	std::map<int, token_data*>* distribution = new std::map<int, token_data*>();
173	int max = 0;	177	int max = 0;
174		178
175	for (map<string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)	179	for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++)
176	{	180	{
177	max += kt->second->all;	181	max += kt->second->all;
178		182
@@ -187,17 +191,17 @@ void printKgram(kgram k)
187	{	191	{
188	for (kgram::iterator it = k.begin(); it != k.end(); it++)	192	for (kgram::iterator it = k.begin(); it != k.end(); it++)
189	{	193	{
190	cout << *it << " ";	194	std::cout << *it << " ";
191	}	195	}
192	}	196	}
193		197
194	// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus	198	// runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus
195	vector<string> kgramstats::randomSentence(int n)	199	std::vector<std::string> kgramstats::randomSentence(int n)
196	{	200	{
197	vector<string> result;	201	std::vector<std::string> result;
198	kgram newKgram(1, ".");	202	kgram newKgram(1, ".");
199	kgram commaKgram(1, ",");	203	kgram commaKgram(1, ",");
200	list<string> cur = newKgram;	204	std::list<std::string> cur = newKgram;
201	int cuts = 0;	205	int cuts = 0;
202		206
203	for (int i=0; i<n; i++)	207	for (int i=0; i<n; i++)
@@ -221,12 +225,12 @@ vector<string> kgramstats::randomSentence(int n)
221	cuts++;	225	cuts++;
222	}	226	}
223		227
224	map<int, token_data*> distribution = *(*stats)[cur];	228	std::map<int, token_data*> distribution = *(*stats)[cur];
225	int max = distribution.rbegin()->first;	229	int max = distribution.rbegin()->first;
226	int r = rand() % max;	230	int r = rand() % max;
227	token_data* next = distribution.upper_bound(r)->second;	231	token_data* next = distribution.upper_bound(r)->second;
228		232
229	string nextToken(*(next->token));	233	std::string nextToken(*(next->token));
230	int casing = rand() % next->all;	234	int casing = rand() % next->all;
231	int period = rand() % next->all;	235	int period = rand() % next->all;
232	int startparen = rand() % next->all;	236	int startparen = rand() % next->all;
@@ -236,7 +240,7 @@ vector<string> kgramstats::randomSentence(int n)
236	int comma = rand() % next->all;	240	int comma = rand() % next->all;
237	if (casing < next->uppercase)	241	if (casing < next->uppercase)
238	{	242	{
239	transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);	243	std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
240	} else if ((casing - next->uppercase) < next->titlecase)	244	} else if ((casing - next->uppercase) < next->titlecase)
241	{	245	{
242	nextToken[0] = toupper(nextToken[0]);	246	nextToken[0] = toupper(nextToken[0]);
@@ -246,49 +250,55 @@ vector<string> kgramstats::randomSentence(int n)
246	{	250	{
247	nextToken[0] = toupper(nextToken[0]);	251	nextToken[0] = toupper(nextToken[0]);
248	}	252	}
249	/*	253
250	if (startquote < next->startquote)	254	bool mess = (rand() % 100) == 0;
251	{	255	if (mess)
252	nextToken = "\"" + nextToken;
253	} else if (startparen < next->startparen)
254	{	256	{
255	nextToken = "(" + nextToken;	257	nextToken = mstats.alternate(nextToken);
256	}	258
257		259	if (startquote < next->startquote)
258	if (period < next->period)
259	{
260	if (endquote < next->endquote)
261	{	260	{
262	nextToken += "\"";	261	nextToken = "\"" + nextToken;
263	} else if (endparen < next->endparen)	262	} else if (startparen < next->startparen)
264	{	263	{
265	nextToken += ")";	264	nextToken = "(" + nextToken;
266	}	265	}
		266
		267	if (period < next->period)
		268	{
		269	if (endquote < next->endquote)
		270	{
		271	nextToken += "\"";
		272	} else if (endparen < next->endparen)
		273	{
		274	nextToken += ")";
		275	}
267		276
268	int type = rand() % 6;	277	int type = rand() % 6;
269		278
270	if (type < 3)	279	if (type < 3)
271	{	280	{
272	nextToken += ".";	281	nextToken += ".";
273	} else if (type < 5)	282	} else if (type < 5)
274	{	283	{
275	nextToken += "!";	284	nextToken += "!";
276	} else {	285	} else {
277	nextToken += "?";	286	nextToken += "?";
278	}	287	}
279	} else if (comma < next->comma)	288	} else if (comma < next->comma)
280	{
281	if (endquote < next->endquote)
282	{
283	nextToken += "\"";
284	} else if (endparen < next->endparen)
285	{	289	{
286	nextToken += ")";	290	if (endquote < next->endquote)
287	}	291	{
		292	nextToken += "\"";
		293	} else if (endparen < next->endparen)
		294	{
		295	nextToken += ")";
		296	}
288		297
289	nextToken += ",";	298	nextToken += ",";
		299	}
290	}	300	}
291	*/	301
292	if (cur.size() == maxK)	302	if (cur.size() == maxK)
293	{	303	{
294	cur.pop_front();	304	cur.pop_front();
@@ -297,10 +307,17 @@ vector<string> kgramstats::randomSentence(int n)
297	/* DEBUG */	307	/* DEBUG */
298	for (kgram::iterator it = cur.begin(); it != cur.end(); it++)	308	for (kgram::iterator it = cur.begin(); it != cur.end(); it++)
299	{	309	{
300	cout << *it << " ";	310	std::cout << *it << " ";
301	}	311	}
302		312
303	cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;	313	std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")";
		314
		315	if (mess)
		316	{
		317	std::cout << " mala " << *(next->token);
		318	}
		319
		320	std::cout << std::endl;
304		321
305	if ((cur == newKgram) || (cur == commaKgram))	322	if ((cur == newKgram) || (cur == commaKgram))
306	{	323	{
@@ -314,7 +331,15 @@ vector<string> kgramstats::randomSentence(int n)
314	{	331	{
315	cur = commaKgram;	332	cur = commaKgram;
316	} else {	333	} else {
317	cur.push_back(*(next->token));	334	//if (mess && (rand() % 2 == 0))
		335	if (false)
		336	{
		337	// This doesn't work because sometimes the alternate token isn't actually present in the original corpus
		338	cur.clear();
		339	cur.push_back(nextToken);
		340	} else {
		341	cur.push_back(*(next->token));
		342	}
318	}	343	}
319		344
320	result.push_back(nextToken);	345	result.push_back(nextToken);
@@ -330,11 +355,11 @@ bool removeIf(char c)
330		355
331	std::string canonize(std::string f)	356	std::string canonize(std::string f)
332	{	357	{
333	string canonical(f);	358	std::string canonical(f);
334	transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);	359	std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
335		360
336	string result;	361	std::string result;
337	remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);	362	std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf);
338		363
339	return canonical;	364	return canonical;
340	}	365	}