Kerjiggered the algorithms

author: Kelly Rauchenberger <fefferburbia@gmail.com> 2015-07-19 22:19:49 -0400
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2015-07-19 22:19:49 -0400
commit: e080a02518103538897be3f09a342712bd42c546 (patch)
tree: 97fc06df67d71a4cb3cb8822a76eaf257a25f9db
parent: b71552ea3f8237012fe5677385df1c3534405121 (diff)
download: rawr-ebooks-e080a02518103538897be3f09a342712bd42c546.tar.gz
rawr-ebooks-e080a02518103538897be3f09a342712bd42c546.tar.bz2
rawr-ebooks-e080a02518103538897be3f09a342712bd42c546.zip
3 files changed, 173 insertions, 22 deletions
diff --git a/Makefile.am b/Makefile.am
index 299dc10..150ede2 100644
--- a/Makefile.am
+++ b/Makefile.am

@@ -4,5 +4,6 @@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
 bin_PROGRAMS = rawr-ebooks rawr-gen
 rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp
 rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp
-rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS)
+rawr_gen_CPPFLAGS = -std=c++11
+rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) -std=c++11
 rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS)
 \ No newline at end of file
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 2321b11..1f3dd3c 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp

@@ -28,6 +28,8 @@ kgramstats::kgramstats(string corpus, int maxK)
        }
        
        map<kgram, map<string, token_data*>* > tstats;
+  bool newSentence = true;
+  bool newClause = false;
        for (int k=0; k<=maxK; k++)
        {
                for (int i=0; i<(tokens.size() - k); i++)
@@ -50,11 +52,83 @@ kgramstats::kgramstats(string corpus, int maxK)
                        token_data* td = tstats[seq]->at(canonical);
                        td->token = new string(canonical);
                        td->all++;
+      
+      if (newSentence)
+      {
+        kgram newKgram(1, ".");
+        if (tstats[newKgram] == NULL)
+        {
+          tstats[newKgram] = new map<string, token_data*>();
+        }
+        
+        (*tstats[newKgram])[canonical] = td;
+        
+        newSentence = false;
+      }
+      
+      if (newClause)
+      {
+        kgram commaKgram(1, ",");
+        if (tstats[commaKgram] == NULL)
+        {
+          tstats[commaKgram] = new map<string, token_data*>();
+        }
+        
+        (*tstats[commaKgram])[canonical] = td;
+        
+        newClause = false;
+      }
                        
                        if ((f.length() > 0) && (f[f.length()-1] == '.'))
                        {
                                td->period++;
+        newSentence = true;
                        }
+      
+      if (f.length() > 0)
+      {
+        if (f[0] == '"')
+        {
+          td->startquote++;
+        }
+        
+        if (f[f.length()-1] == '"')
+        {
+          td->endquote++;
+          
+          if ((f.length() > 1) && (f[f.length()-2] == ','))
+          {
+            td->comma++;
+            newClause = true;
+          }
+        }
+        
+        if (f[f.length()-1] == ',')
+        {
+          td->comma++;
+          newClause = true;
+          
+          if ((f.length() > 1) && (f[f.length()-2] == '"'))
+          {
+            td->endquote++;
+          }
+          
+          if ((f.length() > 1) && (f[f.length()-2] == ')'))
+          {
+            td->endparen++;
+          }
+        }
+        
+        if (f[0] == '(')
+        {
+          td->startparen++;
+        }
+        
+        if (f[f.length()-1] == ')')
+        {
+          td->endparen++;
+        }
+      }
                        
                        if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
                        {
@@ -97,22 +171,31 @@ void printKgram(kgram k)
 vector<string> kgramstats::randomSentence(int n)
 {
        vector<string> result;
-        list<string> cur;
+  kgram newKgram(1, ".");
+  kgram commaKgram(1, ",");
+        list<string> cur = newKgram;
+  int cuts = 0;
        
        for (int i=0; i<n; i++)
        {
-                if ((rand() % (maxK - cur.size() + 1)) == 0)
+    if ((cur.size() > 0) && (cur != newKgram))
-                {
+    {
-                        for (int i=0; i<cur.size(); i++)
+      if (rand() % (maxK - cur.size() + 1) == 0)
-                        {
+      {
-                                if ((rand() % 3) == 0)
+        while (cur.size() > 0)
-                                {
+        {
-                                        cur.pop_front();
+          if ((rand() % (n)) < cuts)
-                                } else {
+          {
-                                        break;
+            cur.pop_front();
-                                }
+            cuts--;
-                        }
+          } else {
-                }
+            break;
+          }
+        }
+      }
+      
+      cuts++;
+    }
                map<int, token_data*> distribution = *(*stats)[cur];
                int max = distribution.rbegin()->first;
@@ -122,6 +205,11 @@ vector<string> kgramstats::randomSentence(int n)
                string nextToken(*(next->token));
                int casing = rand() % next->all;
                int period = rand() % next->all;
+    int startparen = rand() % next->all;
+    int endparen = rand() % next->all;
+    int startquote = rand() % next->all;
+    int endquote = rand() % next->all;
+    int comma = rand() % next->all;
                if (casing < next->uppercase)
                {
                        transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
@@ -129,11 +217,53 @@ vector<string> kgramstats::randomSentence(int n)
                {
                        nextToken[0] = toupper(nextToken[0]);
                }
+    
+    if ((cur == newKgram) && (rand() % 3 < 2))
+    {
+      nextToken[0] = toupper(nextToken[0]);
+    }
+    
+    if (startquote < next->startquote)
+    {
+      nextToken = "\"" + nextToken;
+    } else if (startparen < next->startparen)
+    {
+      nextToken = "(" + nextToken;
+    }
                
                if (period < next->period)
                {
-                        nextToken += ".";
+      if (endquote < next->endquote)
-                }
+      {
+        nextToken += "\"";
+      } else if (endparen < next->endparen)
+      {
+        nextToken += ")";
+      }
+      
+      int type = rand() % 6;
+      
+      if (type < 3)
+      {
+        nextToken += ".";
+      } else if (type < 5)
+      {
+        nextToken += "!";
+      } else {
+        nextToken += "?";
+      }
+                } else if (comma < next->comma)
+    {
+      if (endquote < next->endquote)
+      {
+        nextToken += "\"";
+      } else if (endparen < next->endparen)
+      {
+        nextToken += ")";
+      }
+      
+      nextToken += ",";
+    }
                if (cur.size() == maxK)
                {
@@ -147,8 +277,22 @@ vector<string> kgramstats::randomSentence(int n)
                }
                
                cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;
+    
+    if ((cur == newKgram) || (cur == commaKgram))
+    {
+      cur.pop_front();
+    }
+                
+    if ((period < next->period) && ((rand() % 2) == 0))
+    {
+      cur = newKgram;
+    } else if ((comma < next->comma) && ((rand() % 3) == 0))
+    {
+      cur = commaKgram;
+    } else {
+      cur.push_back(*(next->token));
+    }
                
-                cur.push_back(*(next->token));
                result.push_back(nextToken);
        }
        
@@ -159,10 +303,11 @@ std::string canonize(std::string f)
 {
        string canonical(f);
        transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
-        if (canonical[canonical.length()-1] == '.')
+  
-        {
+  string result;
-                canonical.resize(canonical.find('.'));
+  remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), [] (char c) {
-        }
+    return !((c != '.') && (c != '"') && (c != '(') && (c != ')') && (c != ','));
+  });
        
-        return canonical;
+        return result;
 }
 \ No newline at end of file
diff --git a/kgramstats.h b/kgramstats.h
index b40e1ab..059eb05 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -23,6 +23,11 @@ private:
                int titlecase;
                int uppercase;
                int period;
+    int startquote;
+    int endquote;
+    int startparen;
+    int endparen;
+    int comma;
                string* token;
        } token_data;
        int maxK;
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2015-07-19 22:19:49 -0400
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2015-07-19 22:19:49 -0400
commit	e080a02518103538897be3f09a342712bd42c546 (patch)
tree	97fc06df67d71a4cb3cb8822a76eaf257a25f9db
parent	b71552ea3f8237012fe5677385df1c3534405121 (diff)
download	rawr-ebooks-e080a02518103538897be3f09a342712bd42c546.tar.gz rawr-ebooks-e080a02518103538897be3f09a342712bd42c546.tar.bz2 rawr-ebooks-e080a02518103538897be3f09a342712bd42c546.zip

diff --git a/Makefile.am b/Makefile.am index 299dc10..150ede2 100644 --- a/Makefile.am +++ b/Makefile.am
@@ -4,5 +4,6 @@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS}
4	bin_PROGRAMS = rawr-ebooks rawr-gen	4	bin_PROGRAMS = rawr-ebooks rawr-gen
5	rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp	5	rawr_ebooks_SOURCES = ebooks.cpp kgramstats.cpp freevars.cpp
6	rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp	6	rawr_gen_SOURCES = gen.cpp kgramstats.cpp freevars.cpp
7	rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS)	7	rawr_gen_CPPFLAGS = -std=c++11
		8	rawr_ebooks_CPPFLAGS = $(LIBTWITCURL_CFLAGS) $(YAML_CFLAGS) -std=c++11
8	rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file	9	rawr_ebooks_LDADD = $(LIBTWITCURL_LIBS) $(YAML_LIBS) \ No newline at end of file


diff --git a/kgramstats.cpp b/kgramstats.cpp index 2321b11..1f3dd3c 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -28,6 +28,8 @@ kgramstats::kgramstats(string corpus, int maxK)
28	}	28	}
29		29
30	map<kgram, map<string, token_data*>* > tstats;	30	map<kgram, map<string, token_data*>* > tstats;
		31	bool newSentence = true;
		32	bool newClause = false;
31	for (int k=0; k<=maxK; k++)	33	for (int k=0; k<=maxK; k++)
32	{	34	{
33	for (int i=0; i<(tokens.size() - k); i++)	35	for (int i=0; i<(tokens.size() - k); i++)
@@ -50,11 +52,83 @@ kgramstats::kgramstats(string corpus, int maxK)
50	token_data* td = tstats[seq]->at(canonical);	52	token_data* td = tstats[seq]->at(canonical);
51	td->token = new string(canonical);	53	td->token = new string(canonical);
52	td->all++;	54	td->all++;
		55
		56	if (newSentence)
		57	{
		58	kgram newKgram(1, ".");
		59	if (tstats[newKgram] == NULL)
		60	{
		61	tstats[newKgram] = new map<string, token_data*>();
		62	}
		63
		64	(*tstats[newKgram])[canonical] = td;
		65
		66	newSentence = false;
		67	}
		68
		69	if (newClause)
		70	{
		71	kgram commaKgram(1, ",");
		72	if (tstats[commaKgram] == NULL)
		73	{
		74	tstats[commaKgram] = new map<string, token_data*>();
		75	}
		76
		77	(*tstats[commaKgram])[canonical] = td;
		78
		79	newClause = false;
		80	}
53		81
54	if ((f.length() > 0) && (f[f.length()-1] == '.'))	82	if ((f.length() > 0) && (f[f.length()-1] == '.'))
55	{	83	{
56	td->period++;	84	td->period++;
		85	newSentence = true;
57	}	86	}
		87
		88	if (f.length() > 0)
		89	{
		90	if (f[0] == '"')
		91	{
		92	td->startquote++;
		93	}
		94
		95	if (f[f.length()-1] == '"')
		96	{
		97	td->endquote++;
		98
		99	if ((f.length() > 1) && (f[f.length()-2] == ','))
		100	{
		101	td->comma++;
		102	newClause = true;
		103	}
		104	}
		105
		106	if (f[f.length()-1] == ',')
		107	{
		108	td->comma++;
		109	newClause = true;
		110
		111	if ((f.length() > 1) && (f[f.length()-2] == '"'))
		112	{
		113	td->endquote++;
		114	}
		115
		116	if ((f.length() > 1) && (f[f.length()-2] == ')'))
		117	{
		118	td->endparen++;
		119	}
		120	}
		121
		122	if (f[0] == '(')
		123	{
		124	td->startparen++;
		125	}
		126
		127	if (f[f.length()-1] == ')')
		128	{
		129	td->endparen++;
		130	}
		131	}
58		132
59	if (std::find_if(f.begin(), f.end(), ::islower) == f.end())	133	if (std::find_if(f.begin(), f.end(), ::islower) == f.end())
60	{	134	{
@@ -97,22 +171,31 @@ void printKgram(kgram k)
97	vector<string> kgramstats::randomSentence(int n)	171	vector<string> kgramstats::randomSentence(int n)
98	{	172	{
99	vector<string> result;	173	vector<string> result;
100	list<string> cur;	174	kgram newKgram(1, ".");
		175	kgram commaKgram(1, ",");
		176	list<string> cur = newKgram;
		177	int cuts = 0;
101		178
102	for (int i=0; i<n; i++)	179	for (int i=0; i<n; i++)
103	{	180	{
104	if ((rand() % (maxK - cur.size() + 1)) == 0)	181	if ((cur.size() > 0) && (cur != newKgram))
105	{	182	{
106	for (int i=0; i<cur.size(); i++)	183	if (rand() % (maxK - cur.size() + 1) == 0)
107	{	184	{
108	if ((rand() % 3) == 0)	185	while (cur.size() > 0)
109	{	186	{
110	cur.pop_front();	187	if ((rand() % (n)) < cuts)
111	} else {	188	{
112	break;	189	cur.pop_front();
113	}	190	cuts--;
114	}	191	} else {
115	}	192	break;
		193	}
		194	}
		195	}
		196
		197	cuts++;
		198	}
116		199
117	map<int, token_data*> distribution = *(*stats)[cur];	200	map<int, token_data*> distribution = *(*stats)[cur];
118	int max = distribution.rbegin()->first;	201	int max = distribution.rbegin()->first;
@@ -122,6 +205,11 @@ vector<string> kgramstats::randomSentence(int n)
122	string nextToken(*(next->token));	205	string nextToken(*(next->token));
123	int casing = rand() % next->all;	206	int casing = rand() % next->all;
124	int period = rand() % next->all;	207	int period = rand() % next->all;
		208	int startparen = rand() % next->all;
		209	int endparen = rand() % next->all;
		210	int startquote = rand() % next->all;
		211	int endquote = rand() % next->all;
		212	int comma = rand() % next->all;
125	if (casing < next->uppercase)	213	if (casing < next->uppercase)
126	{	214	{
127	transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);	215	transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
@@ -129,11 +217,53 @@ vector<string> kgramstats::randomSentence(int n)
129	{	217	{
130	nextToken[0] = toupper(nextToken[0]);	218	nextToken[0] = toupper(nextToken[0]);
131	}	219	}
		220
		221	if ((cur == newKgram) && (rand() % 3 < 2))
		222	{
		223	nextToken[0] = toupper(nextToken[0]);
		224	}
		225
		226	if (startquote < next->startquote)
		227	{
		228	nextToken = "\"" + nextToken;
		229	} else if (startparen < next->startparen)
		230	{
		231	nextToken = "(" + nextToken;
		232	}
132		233
133	if (period < next->period)	234	if (period < next->period)
134	{	235	{
135	nextToken += ".";	236	if (endquote < next->endquote)
136	}	237	{
		238	nextToken += "\"";
		239	} else if (endparen < next->endparen)
		240	{
		241	nextToken += ")";
		242	}
		243
		244	int type = rand() % 6;
		245
		246	if (type < 3)
		247	{
		248	nextToken += ".";
		249	} else if (type < 5)
		250	{
		251	nextToken += "!";
		252	} else {
		253	nextToken += "?";
		254	}
		255	} else if (comma < next->comma)
		256	{
		257	if (endquote < next->endquote)
		258	{
		259	nextToken += "\"";
		260	} else if (endparen < next->endparen)
		261	{
		262	nextToken += ")";
		263	}
		264
		265	nextToken += ",";
		266	}
137		267
138	if (cur.size() == maxK)	268	if (cur.size() == maxK)
139	{	269	{
@@ -147,8 +277,22 @@ vector<string> kgramstats::randomSentence(int n)
147	}	277	}
148		278
149	cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;	279	cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")" << endl;
		280
		281	if ((cur == newKgram) || (cur == commaKgram))
		282	{
		283	cur.pop_front();
		284	}
		285
		286	if ((period < next->period) && ((rand() % 2) == 0))
		287	{
		288	cur = newKgram;
		289	} else if ((comma < next->comma) && ((rand() % 3) == 0))
		290	{
		291	cur = commaKgram;
		292	} else {
		293	cur.push_back(*(next->token));
		294	}
150		295
151	cur.push_back(*(next->token));
152	result.push_back(nextToken);	296	result.push_back(nextToken);
153	}	297	}
154		298
@@ -159,10 +303,11 @@ std::string canonize(std::string f)
159	{	303	{
160	string canonical(f);	304	string canonical(f);
161	transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);	305	transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower);
162	if (canonical[canonical.length()-1] == '.')	306
163	{	307	string result;
164	canonical.resize(canonical.find('.'));	308	remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), [] (char c) {
165	}	309	return !((c != '.') && (c != '"') && (c != '(') && (c != ')') && (c != ','));
		310	});
166		311
167	return canonical;	312	return result;
168	} \ No newline at end of file	313	} \ No newline at end of file


diff --git a/kgramstats.h b/kgramstats.h index b40e1ab..059eb05 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -23,6 +23,11 @@ private:
23	int titlecase;	23	int titlecase;
24	int uppercase;	24	int uppercase;
25	int period;	25	int period;
		26	int startquote;
		27	int endquote;
		28	int startparen;
		29	int endparen;
		30	int comma;
26	string* token;	31	string* token;
27	} token_data;	32	} token_data;
28	int maxK;	33	int maxK;