 kgramstats.cpp | 115 ++++++++++++++++++++++++++++++++++++++++++-------------
 kgramstats.h   |  24 ++++++++++++++++++------
 2 files changed, 102 insertions(+), 37 deletions(-)
diff --git a/kgramstats.cpp b/kgramstats.cpp
index 258e92a..4bb7f15 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -43,6 +43,30 @@ query wildcardQuery(querytype_sentence);
 
 std::string canonize(std::string f);
 
+token token_from_string(std::string in)
+{
+  if (in[0] == '#')
+  {
+    token word(tokentype_hashtag);
+
+    if (in.find_first_of(".?!,") != std::string::npos)
+    {
+      word.terminating = true;
+    }
+
+    return word;
+  } else {
+    token word(canonize(in));
+
+    if (in.find_first_of(".?!,") != std::string::npos)
+    {
+      word.terminating = true;
+    }
+
+    return word;
+  }
+}
+
 // runs in O(t^2) time where t is the number of tokens in the input corpus
 // We consider maxK to be fairly constant
 kgramstats::kgramstats(std::string corpus, int maxK)
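The new token_from_string() helper centralizes tokenization that the later hunks previously duplicated at each call site. A minimal standalone sketch of its behavior follows; the enum and struct mirror this patch, but the simplified canonize() here is an assumption (lowercase, strip non-alphanumerics), not the file's actual implementation:

#include <cassert>
#include <cctype>
#include <string>

enum tokentype { tokentype_literal, tokentype_hashtag };

struct token {
  tokentype type;
  std::string canon;
  bool terminating;
  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
  token(tokentype type) : type(type), canon(""), terminating(false) {}
};

// Assumed behavior only: lowercase and drop anything that isn't alphanumeric.
std::string canonize(std::string f)
{
  std::string result;
  for (std::string::size_type i = 0; i < f.length(); i++)
  {
    if (isalnum((unsigned char)f[i]))
    {
      result += tolower((unsigned char)f[i]);
    }
  }
  return result;
}

// Same logic as the patch, condensed: hashtags keep no canonical text, and
// either kind of token is marked terminating if it carries .?!, punctuation.
token token_from_string(std::string in)
{
  token word = (in[0] == '#') ? token(tokentype_hashtag) : token(canonize(in));

  if (in.find_first_of(".?!,") != std::string::npos)
  {
    word.terminating = true;
  }

  return word;
}

int main()
{
  token tag = token_from_string("#winning!");
  assert(tag.type == tokentype_hashtag && tag.terminating && tag.canon.empty());

  token lit = token_from_string("Hello,");
  assert(lit.type == tokentype_literal && lit.terminating && lit.canon == "hello");

  return 0;
}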
@@ -52,6 +76,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
   std::vector<std::string> tokens;
   size_t start = 0;
   int end = 0;
+  std::set<std::string> thashtags;
 
   while (end != std::string::npos)
   {
@@ -72,10 +97,20 @@ kgramstats::kgramstats(std::string corpus, int maxK)
     {
       mstats.addWord(token);
       tokens.push_back(token);
+
+      if (token[0] == '#')
+      {
+        thashtags.insert(canonize(token));
+      }
     }
 
     start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1);
   }
+
+  for (std::set<std::string>::iterator it = thashtags.begin(); it != thashtags.end(); it++)
+  {
+    hashtags.push_back(*it);
+  }
 
   std::map<kgram, std::map<token, token_data> > tstats;
   std::map<token, std::map<termstats, int> > tendings;
@@ -88,20 +123,13 @@ kgramstats::kgramstats(std::string corpus, int maxK)
 
     for (std::list<std::string>::iterator it = seq.begin(); it != seq.end(); it++)
     {
-      token word(canonize(*it));
-
-      if (it->find_first_of(".?!,") != std::string::npos)
-      {
-        word.terminating = true;
-      }
-
-      prefix.push_back(word);
+      prefix.push_back(token_from_string(*it));
     }
 
     std::string f = tokens[i+k];
     std::string canonical = canonize(f);
 
-    token word(canonical);
+    token word(token_from_string(canonical));
     if (f.find_first_of(".?!,") != std::string::npos)
     {
       word.terminating = true;
@@ -184,11 +212,22 @@ void printKgram(kgram k)
       std::cout << "#.# ";
     } else if (q.type == querytype_literal)
     {
-      if (q.word.terminating)
+      if (q.word.type == tokentype_hashtag)
+      {
+        if (q.word.terminating)
+        {
+          std::cout << "#hashtag. ";
+        } else {
+          std::cout << "#hashtag ";
+        }
+      } else if (q.word.type == tokentype_literal)
       {
-        std::cout << q.word.canon << ". ";
-      } else {
-        std::cout << q.word.canon << " ";
+        if (q.word.terminating)
+        {
+          std::cout << q.word.canon << ". ";
+        } else {
+          std::cout << q.word.canon << " ";
+        }
       }
     }
   }
@@ -238,37 +277,47 @@ std::vector<std::string> kgramstats::randomSentence(int n)
     int max = distribution.rbegin()->first;
     int r = rand() % max;
     token_data& next = distribution.upper_bound(r)->second;
-    std::string nextToken(next.word.canon);
+    std::string nextToken;
+    bool mess = false;
 
-    bool mess = (rand() % 100) == 0;
-    if (mess)
+    if (next.word.type == tokentype_literal)
     {
-      nextToken = mstats.alternate(nextToken);
-    }
+      nextToken = next.word.canon;
 
-    // Determine the casing of the next token. We randomly make the token all
-    // caps based on the markov chain. Otherwise, we check if the previous
-    // token is the end of a sentence (terminating token or a wildcard query).
-    int casing = rand() % next.all;
-    if (casing < next.uppercase)
-    {
-      std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
-    } else if ((((cur.rbegin()->type == querytype_sentence)
-        || ((cur.rbegin()->type == querytype_literal)
-          && (cur.rbegin()->word.terminating)))
-        && (rand() % 2 > 0))
-      || (casing - next.uppercase < next.titlecase))
+      mess = (rand() % 100) == 0;
+      if (mess)
+      {
+        nextToken = mstats.alternate(nextToken);
+      }
+
+      // Determine the casing of the next token. We randomly make the token all
+      // caps based on the markov chain. Otherwise, we check if the previous
+      // token is the end of a sentence (terminating token or a wildcard query).
+      int casing = rand() % next.all;
+      if (casing < next.uppercase)
+      {
+        std::transform(nextToken.begin(), nextToken.end(), nextToken.begin(), ::toupper);
+      } else if ((((cur.rbegin()->type == querytype_sentence)
+          || ((cur.rbegin()->type == querytype_literal)
+            && (cur.rbegin()->word.terminating)))
+          && (rand() % 2 > 0))
+        || (casing - next.uppercase < next.titlecase))
+      {
+        nextToken[0] = toupper(nextToken[0]);
+      }
+    } else if (next.word.type == tokentype_hashtag)
     {
-      nextToken[0] = toupper(nextToken[0]);
+      int rhash = rand() % hashtags.size();
+      nextToken = hashtags[rhash];
     }
 
     if (next.word.terminating)
     {
       std::map<int, termstats>& ending = endings[next.word];
       int emax = ending.rbegin()->first;
       int er = rand() % emax;
       termstats& nextend = ending.upper_bound(er)->second;
 
       nextToken.append(std::string(nextend.occurrences, nextend.terminator));
     }
 
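The generation loop above samples from std::map distributions keyed by cumulative weight, and the new tokentype_hashtag branch swaps in a uniformly random tag from the hashtags vector (collected in a std::set during parsing, then copied into a vector so it can be indexed in O(1)). A small sketch of that cumulative-weight sampling pattern, with illustrative weights and words rather than anything from this repository:

#include <cstdlib>
#include <ctime>
#include <iostream>
#include <map>
#include <string>

int main()
{
  srand(time(NULL));

  // Keys are running totals of each word's weight:
  // "cat" has weight 3, "dog" has weight 1, the hashtag marker has weight 2.
  std::map<int, std::string> distribution;
  distribution[3] = "cat";
  distribution[4] = "dog";
  distribution[6] = "#hashtag";

  int max = distribution.rbegin()->first; // total weight, 6
  int r = rand() % max;                   // uniform in [0, 5]

  // upper_bound(r) returns the first key strictly greater than r, so
  // r in [0,2] picks "cat", r == 3 picks "dog", r in [4,5] picks "#hashtag".
  std::cout << distribution.upper_bound(r)->second << std::endl;

  return 0;
}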
diff --git a/kgramstats.h b/kgramstats.h
index ca61df7..ff2fc66 100644
--- a/kgramstats.h
+++ b/kgramstats.h
@@ -7,19 +7,34 @@
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
 
+enum tokentype {
+  tokentype_literal,
+  tokentype_hashtag
+};
+
 struct token {
+  tokentype type;
   std::string canon;
   bool terminating;
 
-  token(std::string canon) : canon(canon), terminating(false) {}
+  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
+  token(tokentype type) : type(type), canon(""), terminating(false) {}
 
   bool operator<(const token& other) const
   {
-    if (canon == other.canon)
+    if (type != other.type)
     {
-      return !terminating && other.terminating;
+      return type < other.type;
+    } else if (type == tokentype_literal)
+    {
+      if (canon == other.canon)
+      {
+        return !terminating && other.terminating;
+      } else {
+        return canon < other.canon;
+      }
     } else {
-      return canon < other.canon;
+      return !terminating && other.terminating;
     }
   }
 };
@@ -94,6 +109,7 @@ private:
   std::map<kgram, std::map<int, token_data> > stats;
   malaprop mstats;
   std::map<token, std::map<int, termstats> > endings;
+  std::vector<std::string> hashtags;
 };
 
 void printKgram(kgram k);
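One consequence of the reworked operator< worth spelling out: because hashtag tokens compare on the terminating flag alone, every hashtag in the corpus collapses to the same map key, which is what lets randomSentence() treat tags as interchangeable and substitute any entry from the new hashtags member. A standalone sketch of that pooling behavior; the token struct is copied from this patch, while the main() driver and tag names are illustrative:

#include <cassert>
#include <map>
#include <string>

enum tokentype { tokentype_literal, tokentype_hashtag };

struct token {
  tokentype type;
  std::string canon;
  bool terminating;
  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
  token(tokentype type) : type(type), canon(""), terminating(false) {}

  bool operator<(const token& other) const
  {
    if (type != other.type)
    {
      return type < other.type;
    } else if (type == tokentype_literal)
    {
      if (canon == other.canon)
      {
        return !terminating && other.terminating;
      } else {
        return canon < other.canon;
      }
    } else {
      return !terminating && other.terminating;
    }
  }
};

int main()
{
  std::map<token, int> counts;
  counts[token(tokentype_hashtag)]++; // stands in for "#cats"
  counts[token(tokentype_hashtag)]++; // stands in for "#dogs": same key
  counts[token("bird")]++;

  // The two hashtags pooled into one entry; the literal got its own.
  assert(counts.size() == 2);
  assert(counts.find(token(tokentype_hashtag))->second == 2);

  return 0;
}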