Rewrote quite a bit of kgramstats

The algorithm still treats most tokens literally, but it now groups together tokens that terminate a clause (that is, tokens containing any of the characters . ? ! or ,) without distinguishing between the different terminating characters. For each word that can terminate a sentence, the algorithm builds a histogram of the terminating characters seen for that word and the number of times each character occurs; tracking the number of occurrences allows things like um???? and um,,,,, to still be folded down into the single token "um." while remembering how they were written. Then, when the terminating version of that token is invoked, a random terminating string is appended to the token based on the histogram for that word (again, to allow things like the desu-ly use of multiple commas to end clauses). The algorithm now also has a slightly more advanced kgram structure: a special "sentence wildcard" kgram value, which can match any terminating token, is set aside from normal strings of tokens. This kgram value is never printed; it is only ever present in the query kgrams and cannot actually appear in the histograms, because it is of a different datatype. It is used at the beginning of sentence generation to make sure that the first couple of words generated actually form the beginning of a sentence instead of picking up somewhere in the middle of one. It is also used to reset sentence generation on the rare occasion that the end of the corpus is reached.
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-04 23:16:17 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-04 23:16:17 -0500
commit: 9e89002477d1358de9be9cabdc1edba26bd32836 (patch)
tree: 9afb52740fe4f618105d014a816df26b36ed83f6 /gen.cpp
parent: 0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff)
download: rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz
rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2
rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip
1 files changed, 46 insertions, 52 deletions
diff --git a/gen.cpp b/gen.cpp
index 7e47d45..400c0a5 100644
--- a/gen.cpp
+++ b/gen.cpp

@@ -11,72 +11,66 @@
 int main(int argc, char** args)
 {
-        srand(time(NULL));
+  srand(time(NULL));
    
-    if (argc == 1)
+  if (argc == 1)
-    {
+  {
-        std::cout << "rawr-gen, version 1.0" << std::endl;
+    std::cout << "rawr-gen, version 1.0" << std::endl;
-        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
+    std::cout << "Usage: rawr-gen corpus-file" << std::endl;
-        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
+    std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
        
-        return 0;
+    return 0;
-    }
+  }
    
-        std::ifstream infile(args[1]);
+  std::ifstream infile(args[1]);
-    if (!infile)
+  if (!infile)
-    {
+  {
-        std::cout << "rawr-gen, version 1.0" << std::endl;
+    std::cout << "rawr-gen, version 1.0" << std::endl;
-        std::cout << "Usage: rawr-gen corpus-file" << std::endl;
+    std::cout << "Usage: rawr-gen corpus-file" << std::endl;
-        std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
+    std::cout << "  where 'corpus-file' is the path to your input" << std::endl;
-        std::cout << std::endl;
+    std::cout << std::endl;
-        std::cout << "The file you specified does not exist." << std::endl;
+    std::cout << "The file you specified does not exist." << std::endl;
        
-        return 0;
+    return 0;
-    }
+  }
    
-        std::string corpus;
+  std::string corpus;
-        std::string line;
+  std::string line;
-        while (getline(infile, line))
+  while (getline(infile, line))
-        {
+  {
-                corpus += line + "\n ";
+    corpus += line + "\n ";
-        }
+  }
        
-    std::cout << "Preprocessing corpus..." << std::endl;
+  std::cout << "Preprocessing corpus..." << std::endl;
-        kgramstats* stats = new kgramstats(corpus, 4);
+  kgramstats* stats = new kgramstats(corpus, 4);
    
-    std::cout << "Preprocessing freevars..." << std::endl;
+  std::cout << "Preprocessing freevars..." << std::endl;
-    freevars* vars = new freevars();
+  freevars* vars = new freevars();
-    vars->addVar("name", "names.txt");
+  vars->addVar("name", "names.txt");
-    vars->addVar("noun", "nouns.txt");
+  vars->addVar("noun", "nouns.txt");
    
-    std::cout << "Generating..." << std::endl;
+  std::cout << "Generating..." << std::endl;
-        for (;;)
+  for (;;)
-        {
+  {
-                std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45);
+    std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
-                std::string hi;
+    std::string hi;
-                for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
+    for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
-                {
-                        hi += vars->parse(*it) + " ";
-                }
-    
-    size_t firstperiod = hi.find_first_of(".!?");
-    if (firstperiod != std::string::npos)
    {
-      hi = hi.substr(firstperiod+2);
+      hi += vars->parse(*it) + " ";
    }
    
    hi.resize(140);
-                size_t lastperiod = hi.find_last_of(".!?");
+    size_t lastperiod = hi.find_last_of(".!?,");
-                if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
+    if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
-                {
+    {
-                        hi = hi.substr(0, lastperiod+1);
+      hi = hi.substr(0, lastperiod+1);
-                }
+    }
-                std::cout << hi << std::endl;
+    std::cout << hi << std::endl;
                
-        getc(stdin);
+    getc(stdin);
-        }
+  }
        
-        return 0;
+  return 0;
 }
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-04 23:16:17 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-04 23:16:17 -0500
commit	9e89002477d1358de9be9cabdc1edba26bd32836 (patch)
tree	9afb52740fe4f618105d014a816df26b36ed83f6 /gen.cpp
parent	0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff)
download	rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2 rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip

diff --git a/gen.cpp b/gen.cpp index 7e47d45..400c0a5 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -11,72 +11,66 @@
11		11
12	int main(int argc, char** args)	12	int main(int argc, char** args)
13	{	13	{
14	srand(time(NULL));	14	srand(time(NULL));
15		15
16	if (argc == 1)	16	if (argc == 1)
17	{	17	{
18	std::cout << "rawr-gen, version 1.0" << std::endl;	18	std::cout << "rawr-gen, version 1.0" << std::endl;
19	std::cout << "Usage: rawr-gen corpus-file" << std::endl;	19	std::cout << "Usage: rawr-gen corpus-file" << std::endl;
20	std::cout << " where 'corpus-file' is the path to your input" << std::endl;	20	std::cout << " where 'corpus-file' is the path to your input" << std::endl;
21		21
22	return 0;	22	return 0;
23	}	23	}
24		24
25	std::ifstream infile(args[1]);	25	std::ifstream infile(args[1]);
26	if (!infile)	26	if (!infile)
27	{	27	{
28	std::cout << "rawr-gen, version 1.0" << std::endl;	28	std::cout << "rawr-gen, version 1.0" << std::endl;
29	std::cout << "Usage: rawr-gen corpus-file" << std::endl;	29	std::cout << "Usage: rawr-gen corpus-file" << std::endl;
30	std::cout << " where 'corpus-file' is the path to your input" << std::endl;	30	std::cout << " where 'corpus-file' is the path to your input" << std::endl;
31	std::cout << std::endl;	31	std::cout << std::endl;
32	std::cout << "The file you specified does not exist." << std::endl;	32	std::cout << "The file you specified does not exist." << std::endl;
33		33
34	return 0;	34	return 0;
35	}	35	}
36		36
37	std::string corpus;	37	std::string corpus;
38	std::string line;	38	std::string line;
39	while (getline(infile, line))	39	while (getline(infile, line))
40	{	40	{
41	corpus += line + "\n ";	41	corpus += line + "\n ";
42	}	42	}
43		43
44	std::cout << "Preprocessing corpus..." << std::endl;	44	std::cout << "Preprocessing corpus..." << std::endl;
45	kgramstats* stats = new kgramstats(corpus, 4);	45	kgramstats* stats = new kgramstats(corpus, 4);
46		46
47	std::cout << "Preprocessing freevars..." << std::endl;	47	std::cout << "Preprocessing freevars..." << std::endl;
48	freevars* vars = new freevars();	48	freevars* vars = new freevars();
49	vars->addVar("name", "names.txt");	49	vars->addVar("name", "names.txt");
50	vars->addVar("noun", "nouns.txt");	50	vars->addVar("noun", "nouns.txt");
51		51
52	std::cout << "Generating..." << std::endl;	52	std::cout << "Generating..." << std::endl;
53	for (;;)	53	for (;;)
54	{	54	{
55	std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 45);	55	std::vector<std::string> doc = stats->randomSentence(rand() % 35 + 15);
56	std::string hi;	56	std::string hi;
57	for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)	57	for (std::vector<std::string>::iterator it = doc.begin(); it != doc.end(); ++it)
58	{
59	hi += vars->parse(*it) + " ";
60	}
61
62	size_t firstperiod = hi.find_first_of(".!?");
63	if (firstperiod != std::string::npos)
64	{	58	{
65	hi = hi.substr(firstperiod+2);	59	hi += vars->parse(*it) + " ";
66	}	60	}
67		61
68	hi.resize(140);	62	hi.resize(140);
69		63
70	size_t lastperiod = hi.find_last_of(".!?");	64	size_t lastperiod = hi.find_last_of(".!?,");
71	if ((lastperiod != std::string::npos) && (rand() % 3 > 0))	65	if ((lastperiod != std::string::npos) && (rand() % 3 > 0))
72	{	66	{
73	hi = hi.substr(0, lastperiod+1);	67	hi = hi.substr(0, lastperiod+1);
74	}	68	}
75		69
76	std::cout << hi << std::endl;	70	std::cout << hi << std::endl;
77		71
78	getc(stdin);	72	getc(stdin);
79	}	73	}
80		74
81	return 0;	75	return 0;
82	}	76	}