diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-04 23:16:17 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-04 23:16:17 -0500 |
commit | 9e89002477d1358de9be9cabdc1edba26bd32836 (patch) | |
tree | 9afb52740fe4f618105d014a816df26b36ed83f6 /kgramstats.cpp | |
parent | 0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff) | |
download | rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2 rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip |
Rewrote quite a bit of kgramstats
The algorithm still treats most tokens literally, but now groups together tokens that terminate a clause in some way (i.e., contain one of .?!,), without distinguishing between the different terminating characters. For each word that can terminate a sentence, the algorithm creates a histogram of the terminating characters and the number of occurrences of those characters for that word (tracking the number of occurrences allows things like um???? and um,,,,, to still be folded down into um.). Then, when the terminating version of that token is invoked, a random terminating string is appended to that token based on the histogram for that word (again, to allow things like the desu-ly use of multiple commas to end clauses). The algorithm now also has a slightly more advanced kgram structure: a special "sentence wildcard" kgram value is set aside from normal strings of tokens and can match any terminating token. This kgram value is never printed — it is only ever present in query kgrams and cannot actually be present in the histograms, as it is of a different datatype — and is used at the beginning of sentence generation to make sure that the first couple of words generated actually form the beginning of a sentence instead of picking up somewhere in the middle of one. It is also used to reset sentence generation in the rare event that the end of the corpus is reached.
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- | kgramstats.cpp | 442 |
1 file changed, 165 insertions, 277 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index b0ec68a..c88d83c 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -5,237 +5,176 @@ | |||
5 | #include <algorithm> | 5 | #include <algorithm> |
6 | #include "malaprop.h" | 6 | #include "malaprop.h" |
7 | 7 | ||
8 | query wildcardQuery(querytype_sentence); | ||
9 | |||
8 | std::string canonize(std::string f); | 10 | std::string canonize(std::string f); |
9 | 11 | ||
10 | // runs in O(t^2) time where t is the number of tokens in the input corpus | 12 | // runs in O(t^2) time where t is the number of tokens in the input corpus |
11 | // We consider maxK to be fairly constant | 13 | // We consider maxK to be fairly constant |
12 | kgramstats::kgramstats(std::string corpus, int maxK) | 14 | kgramstats::kgramstats(std::string corpus, int maxK) |
13 | { | 15 | { |
14 | this->maxK = maxK; | 16 | this->maxK = maxK; |
15 | 17 | ||
16 | std::vector<std::string> tokens; | 18 | std::vector<std::string> tokens; |
17 | size_t start = 0; | 19 | size_t start = 0; |
18 | int end = 0; | 20 | int end = 0; |
19 | 21 | ||
20 | while (end != std::string::npos) | 22 | while (end != std::string::npos) |
21 | { | 23 | { |
22 | end = corpus.find(" ", start); | 24 | end = corpus.find(" ", start); |
23 | 25 | ||
24 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 26 | std::string token = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
25 | if (token[token.length()-1] == '\n') | 27 | if (token[token.length()-1] == '\n') |
26 | { | 28 | { |
27 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?')) | 29 | if ((token[token.length()-2] != '.') && (token[token.length()-2] != '!') && (token[token.length()-2] != '?') && (token[token.length()-2] != ',')) |
28 | { | 30 | { |
29 | token.insert(token.length()-1, "."); | 31 | token.insert(token.length()-1, "."); |
30 | } | 32 | } |
31 | 33 | ||
32 | token.resize(token.length()-1); | 34 | token.resize(token.length()-1); |
33 | } | 35 | } |
34 | 36 | ||
35 | if (token.compare("") && token.compare(".")) | 37 | if (token.compare("") && token.compare(".")) |
36 | { | 38 | { |
37 | mstats.addWord(token); | 39 | mstats.addWord(token); |
38 | tokens.push_back(token); | 40 | tokens.push_back(token); |
39 | } | 41 | } |
40 | 42 | ||
41 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 43 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
42 | } | 44 | } |
43 | 45 | ||
44 | std::map<kgram, std::map<std::string, token_data*>* > tstats; | 46 | std::map<kgram, std::map<token, token_data> > tstats; |
45 | bool newSentence = true; | 47 | std::map<token, std::map<termstats, int> > tendings; |
46 | bool newClause = false; | 48 | for (int k=1; k<maxK; k++) |
47 | for (int k=0; k<maxK; k++) | 49 | { |
48 | { | 50 | for (int i=0; i<(tokens.size() - k); i++) |
49 | for (int i=0; i<(tokens.size() - k); i++) | 51 | { |
50 | { | 52 | std::list<std::string> seq(tokens.begin()+i, tokens.begin()+i+k); |
51 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | 53 | kgram prefix; |
52 | std::transform(seq.begin(), seq.end(), seq.begin(), canonize); | ||
53 | std::string f = tokens[i+k]; | ||
54 | |||
55 | |||
56 | |||
57 | std::string canonical = canonize(f); | ||
58 | |||
59 | if (tstats[seq] == NULL) | ||
60 | { | ||
61 | tstats[seq] = new std::map<std::string, token_data*>(); | ||
62 | } | ||
63 | |||
64 | if ((*tstats[seq])[canonical] == NULL) | ||
65 | { | ||
66 | (*tstats[seq])[canonical] = (token_data*) calloc(1, sizeof(token_data)); | ||
67 | } | ||
68 | |||
69 | token_data* td = tstats[seq]->at(canonical); | ||
70 | td->token = new std::string(canonical); | ||
71 | td->all++; | ||
72 | 54 | ||
73 | /*if (newSentence) | 55 | for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++) |
74 | { | 56 | { |
75 | kgram newKgram(1, "."); | 57 | token word(canonize(*it)); |
76 | if (tstats[newKgram] == NULL) | 58 | |
59 | if (it->find_first_of(".?!,") != std::string::npos) | ||
77 | { | 60 | { |
78 | tstats[newKgram] = new std::map<std::string, token_data*>(); | 61 | word.terminating = true; |
79 | } | 62 | } |
80 | 63 | ||
81 | (*tstats[newKgram])[canonical] = td; | 64 | prefix.push_back(word); |
82 | |||
83 | newSentence = false; | ||
84 | } | 65 | } |
85 | 66 | ||
86 | if (newClause) | 67 | std::string f = tokens[i+k]; |
68 | std::string canonical = canonize(f); | ||
69 | |||
70 | token word(canonical); | ||
71 | if (f.find_first_of(".?!,") != std::string::npos) | ||
87 | { | 72 | { |
88 | kgram commaKgram(1, ","); | 73 | word.terminating = true; |
89 | if (tstats[commaKgram] == NULL) | ||
90 | { | ||
91 | tstats[commaKgram] = new std::map<std::string, token_data*>(); | ||
92 | } | ||
93 | 74 | ||
94 | (*tstats[commaKgram])[canonical] = td; | 75 | char terminator = f[f.find_last_of(".?!,")]; |
76 | int occurrences = std::count(f.begin(), f.end(), terminator); | ||
95 | 77 | ||
96 | newClause = false; | 78 | tendings[word][termstats(terminator, occurrences)]++; |
97 | } | ||
98 | |||
99 | if ((f.length() > 0) && (f[f.length()-1] == '\n')) | ||
100 | { | ||
101 | td->period++; | ||
102 | newSentence = true; | ||
103 | f.resize(f.length()-1); | ||
104 | } | 79 | } |
105 | 80 | ||
106 | if (f.length() > 0) | 81 | token_data& td = tstats[prefix][word]; |
82 | td.word = word; | ||
83 | td.all++; | ||
84 | |||
85 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
107 | { | 86 | { |
108 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) | 87 | td.uppercase++; |
109 | { | 88 | } else if (isupper(f[0])) |
110 | if (!newSentence) | 89 | { |
111 | { | 90 | td.titlecase++; |
112 | td->period++; | ||
113 | newSentence = true; | ||
114 | } | ||
115 | |||
116 | f.resize(f.length()-1); | ||
117 | } else if (f[f.length()-1] == ',') | ||
118 | { | ||
119 | if (!newSentence) | ||
120 | { | ||
121 | td->comma++; | ||
122 | newClause = true; | ||
123 | } | ||
124 | |||
125 | f.resize(f.length()-1); | ||
126 | } | ||
127 | } | 91 | } |
128 | 92 | ||
129 | if (f.length() > 0) | 93 | if (prefix.front().word.terminating) |
130 | { | 94 | { |
131 | if (f[0] == '"') | 95 | prefix.front() = wildcardQuery; |
132 | { | ||
133 | td->startquote++; | ||
134 | } | ||
135 | 96 | ||
136 | if (f[0] == '(') | 97 | token_data& td2 = tstats[prefix][word]; |
98 | td2.word = word; | ||
99 | td2.all++; | ||
100 | |||
101 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
137 | { | 102 | { |
138 | td->startparen++; | 103 | td2.uppercase++; |
139 | } | 104 | } else if (isupper(f[0])) |
140 | |||
141 | if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')')) | ||
142 | { | 105 | { |
143 | if (f[f.length()-1] == '"') | 106 | td2.titlecase++; |
144 | { | ||
145 | td->endquote++; | ||
146 | } else if (f[f.length()-1] == ')') | ||
147 | { | ||
148 | td->endparen++; | ||
149 | } | ||
150 | |||
151 | f.resize(f.length()-1); | ||
152 | |||
153 | if (f.length() > 0) | ||
154 | { | ||
155 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) | ||
156 | { | ||
157 | if (!newSentence) | ||
158 | { | ||
159 | td->period++; | ||
160 | newSentence = true; | ||
161 | } | ||
162 | } else if (f[f.length()-1] == ',') | ||
163 | { | ||
164 | if (!newSentence && !newClause) | ||
165 | { | ||
166 | td->comma++; | ||
167 | newClause = true; | ||
168 | } | ||
169 | } | ||
170 | } | ||
171 | } | ||
172 | }*/ | ||
173 | |||
174 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | ||
175 | { | ||
176 | td->uppercase++; | ||
177 | } else if (isupper(f[0])) | ||
178 | { | ||
179 | td->titlecase++; | ||
180 | } | ||
181 | |||
182 | /*if (k != 0) | ||
183 | { | ||
184 | if (newSentence) | ||
185 | { | ||
186 | i += k; | ||
187 | } | 107 | } |
188 | 108 | } | |
189 | newSentence = false; | 109 | } |
190 | newClause = false; | 110 | } |
191 | }*/ | ||
192 | } | ||
193 | } | ||
194 | 111 | ||
195 | stats = new std::map<kgram, std::map<int, token_data*>* >(); | 112 | for (std::map<kgram, std::map<token, token_data> >::iterator it = tstats.begin(); it != tstats.end(); it++) |
196 | for (std::map<kgram, std::map<std::string, token_data*>* >::iterator it = tstats.begin(); it != tstats.end(); it++) | 113 | { |
197 | { | 114 | kgram klist = it->first; |
198 | kgram klist = it->first; | 115 | std::map<token, token_data>& probtable = it->second; |
199 | std::map<std::string, token_data*>* probtable = it->second; | 116 | std::map<int, token_data>& distribution = stats[klist]; |
200 | std::map<int, token_data*>* distribution = new std::map<int, token_data*>(); | 117 | int max = 0; |
201 | int max = 0; | ||
202 | 118 | ||
203 | for (std::map<std::string, token_data*>::iterator kt = probtable->begin(); kt != probtable->end(); kt++) | 119 | for (std::map<token, token_data>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) |
204 | { | 120 | { |
205 | max += kt->second->all; | 121 | max += kt->second.all; |
206 | 122 | ||
207 | (*distribution)[max] = kt->second; | 123 | distribution[max] = kt->second; |
208 | } | 124 | } |
209 | 125 | } | |
210 | (*stats)[klist] = distribution; | 126 | |
211 | } | 127 | for (std::map<token, std::map<termstats, int> >::iterator it = tendings.begin(); it != tendings.end(); it++) |
128 | { | ||
129 | token word = it->first; | ||
130 | std::map<termstats, int>& probtable = it->second; | ||
131 | std::map<int, termstats>& distribution = endings[word]; | ||
132 | int max = 0; | ||
133 | |||
134 | for (std::map<termstats, int>::iterator kt = probtable.begin(); kt != probtable.end(); kt++) | ||
135 | { | ||
136 | max += kt->second; | ||
137 | |||
138 | distribution[max] = kt->first; | ||
139 | } | ||
140 | } | ||
212 | } | 141 | } |
213 | 142 | ||
214 | void printKgram(kgram k) | 143 | void printKgram(kgram k) |
215 | { | 144 | { |
216 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | 145 | for (kgram::iterator it = k.begin(); it != k.end(); it++) |
217 | { | 146 | { |
218 | std::cout << *it << " "; | 147 | query& q = *it; |
219 | } | 148 | if (q.type == querytype_sentence) |
149 | { | ||
150 | std::cout << "#.# "; | ||
151 | } else if (q.type == querytype_literal) | ||
152 | { | ||
153 | if (q.word.terminating) | ||
154 | { | ||
155 | std::cout << q.word.canon << ". "; | ||
156 | } else { | ||
157 | std::cout << q.word.canon << " "; | ||
158 | } | ||
159 | } | ||
160 | } | ||
220 | } | 161 | } |
221 | 162 | ||
222 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 163 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
223 | std::vector<std::string> kgramstats::randomSentence(int n) | 164 | std::vector<std::string> kgramstats::randomSentence(int n) |
224 | { | 165 | { |
225 | std::vector<std::string> result; | 166 | std::vector<std::string> result; |
226 | kgram newKgram(1, "."); | 167 | kgram cur(1, wildcardQuery); |
227 | kgram commaKgram(1, ","); | ||
228 | std::list<std::string> cur; | ||
229 | int cuts = 0; | 168 | int cuts = 0; |
230 | 169 | ||
231 | for (int i=0; i<n; i++) | 170 | for (int i=0; i<n; i++) |
232 | { | 171 | { |
233 | if (cur.size() == maxK) | 172 | if (cur.size() == maxK) |
234 | { | 173 | { |
235 | cur.pop_front(); | 174 | cur.pop_front(); |
236 | } | 175 | } |
237 | 176 | ||
238 | if ((cur.size() > 0) && (cur != newKgram)) | 177 | if (cur.size() > 0) |
239 | { | 178 | { |
240 | if (rand() % (maxK - cur.size() + 1) == 0) | 179 | if (rand() % (maxK - cur.size() + 1) == 0) |
241 | { | 180 | { |
@@ -253,20 +192,19 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
253 | 192 | ||
254 | cuts++; | 193 | cuts++; |
255 | } | 194 | } |
195 | |||
196 | // Gotta circumvent the last line of the input corpus | ||
197 | // https://twitter.com/starla4444/status/684222271339237376 | ||
198 | if (stats.count(cur) == 0) | ||
199 | { | ||
200 | cur = kgram(1, wildcardQuery); | ||
201 | } | ||
256 | 202 | ||
257 | std::map<int, token_data*> distribution = *(*stats)[cur]; | 203 | std::map<int, token_data>& distribution = stats[cur]; |
258 | int max = distribution.rbegin()->first; | 204 | int max = distribution.rbegin()->first; |
259 | int r = rand() % max; | 205 | int r = rand() % max; |
260 | token_data* next = distribution.upper_bound(r)->second; | 206 | token_data& next = distribution.upper_bound(r)->second; |
261 | 207 | std::string nextToken(next.word.canon); | |
262 | std::string nextToken(*(next->token)); | ||
263 | int casing = rand() % next->all; | ||
264 | /*int period = rand() % next->all; | ||
265 | int startparen = rand() % next->all; | ||
266 | int endparen = rand() % next->all; | ||
267 | int startquote = rand() % next->all; | ||
268 | int endquote = rand() % next->all; | ||
269 | int comma = rand() % next->all;*/ | ||
270 | 208 | ||
271 | bool mess = (rand() % 100) == 0; | 209 | bool mess = (rand() % 100) == 0; |
272 | if (mess) | 210 | if (mess) |
@@ -274,114 +212,64 @@ std::vector<std::string> kgramstats::randomSentence(int n) | |||
274 | nextToken = mstats.alternate(nextToken); | 212 | nextToken = mstats.alternate(nextToken); |
275 | } | 213 | } |
276 | 214 | ||
277 | if (casing < next->uppercase) | 215 | // Determine the casing of the next token. We randomly make the token all |
278 | { | 216 | // caps based on the markov chain. Otherwise, we check if the previous |
279 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | 217 | // token is the end of a sentence (terminating token or a wildcard query). |
280 | } | 218 | int casing = rand() % next.all; |
281 | 219 | if (casing < next.uppercase) | |
282 | if ((cur == newKgram) && (rand() % 15 > 0)) | 220 | { |
221 | std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper); | ||
222 | } else if ((((cur.rbegin()->type == querytype_sentence) | ||
223 | || ((cur.rbegin()->type == querytype_literal) | ||
224 | && (cur.rbegin()->word.terminating))) | ||
225 | && (rand() % 2 > 0)) | ||
226 | || (casing - next.uppercase < next.titlecase)) | ||
283 | { | 227 | { |
284 | nextToken[0] = toupper(nextToken[0]); | 228 | nextToken[0] = toupper(nextToken[0]); |
285 | } | 229 | } |
286 | 230 | ||
287 | /*if (startquote < next->startquote) | 231 | if (next.word.terminating) |
288 | { | ||
289 | nextToken = "\"" + nextToken; | ||
290 | } else if (startparen < next->startparen) | ||
291 | { | 232 | { |
292 | nextToken = "(" + nextToken; | 233 | std::map<int, termstats>& ending = endings[next.word]; |
234 | int emax = ending.rbegin()->first; | ||
235 | int er = rand() % emax; | ||
236 | termstats& nextend = ending.upper_bound(er)->second; | ||
237 | |||
238 | nextToken.append(std::string(nextend.occurrences, nextend.terminator)); | ||
293 | } | 239 | } |
294 | |||
295 | if (period < next->period) | ||
296 | { | ||
297 | if (endquote < next->endquote) | ||
298 | { | ||
299 | nextToken += "\""; | ||
300 | } else if (endparen < next->endparen) | ||
301 | { | ||
302 | nextToken += ")"; | ||
303 | } | ||
304 | |||
305 | int type = rand() % 6; | ||
306 | |||
307 | if (type < 3) | ||
308 | { | ||
309 | nextToken += "."; | ||
310 | } else if (type < 5) | ||
311 | { | ||
312 | nextToken += "!"; | ||
313 | } else { | ||
314 | nextToken += "?"; | ||
315 | } | ||
316 | } else if (comma < next->comma) | ||
317 | { | ||
318 | if (endquote < next->endquote) | ||
319 | { | ||
320 | nextToken += "\""; | ||
321 | } else if (endparen < next->endparen) | ||
322 | { | ||
323 | nextToken += ")"; | ||
324 | } | ||
325 | |||
326 | nextToken += ","; | ||
327 | }*/ | ||
328 | 240 | ||
329 | /* DEBUG */ | 241 | /* DEBUG */ |
330 | for (kgram::iterator it = cur.begin(); it != cur.end(); it++) | 242 | printKgram(cur); |
331 | { | ||
332 | std::cout << *it << " "; | ||
333 | } | ||
334 | 243 | ||
335 | std::cout << "-> \"" << nextToken << "\" (" << next->all << "/" << max << ")"; | 244 | std::cout << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")"; |
336 | 245 | ||
337 | if (mess) | 246 | if (mess) |
338 | { | 247 | { |
339 | std::cout << " mala " << *(next->token); | 248 | std::cout << " mala " << next.word.canon; |
340 | } | 249 | } |
341 | 250 | ||
342 | std::cout << std::endl; | 251 | std::cout << std::endl; |
343 | 252 | ||
344 | /*if ((cur == newKgram) || (cur == commaKgram)) | 253 | cur.push_back(next.word); |
345 | { | ||
346 | cur.pop_front(); | ||
347 | } | ||
348 | |||
349 | if (period < next->period)// && ((rand() % 3) != 0)) | ||
350 | { | ||
351 | cur = newKgram; | ||
352 | } else if ((comma < next->comma) && ((rand() % 3) == 0)) | ||
353 | { | ||
354 | cur = commaKgram; | ||
355 | } else {*/ | ||
356 | //if (mess && (rand() % 2 == 0)) | ||
357 | if (false) | ||
358 | { | ||
359 | // This doesn't work because sometimes the alternate token isn't actually present in the original corpus | ||
360 | cur.clear(); | ||
361 | cur.push_back(nextToken); | ||
362 | } else { | ||
363 | cur.push_back(*(next->token)); | ||
364 | } | ||
365 | //} | ||
366 | 254 | ||
367 | result.push_back(nextToken); | 255 | result.push_back(nextToken); |
368 | } | 256 | } |
369 | 257 | ||
370 | return result; | 258 | return result; |
371 | } | 259 | } |
372 | 260 | ||
373 | bool removeIf(char c) | 261 | bool removeIf(char c) |
374 | { | 262 | { |
375 | return !((c != '.') && (c != '?') && (c != '!') && (c != '"') && (c != '(') && (c != ')') && (c != ',') && (c != '\n')); | 263 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') /*&& (c != '"') && (c != '(') && (c != ')') && (c != '\n')*/); |
376 | } | 264 | } |
377 | 265 | ||
378 | std::string canonize(std::string f) | 266 | std::string canonize(std::string f) |
379 | { | 267 | { |
380 | std::string canonical(f); | 268 | std::string canonical(f); |
381 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); | 269 | std::transform(canonical.begin(), canonical.end(), canonical.begin(), ::tolower); |
382 | 270 | ||
383 | std::string result; | 271 | std::string result; |
384 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); | 272 | std::remove_copy_if(canonical.begin(), canonical.end(), std::back_inserter(result), removeIf); |
385 | 273 | ||
386 | return canonical; | 274 | return result; |
387 | } | 275 | } |