1 files changed, 64 insertions, 60 deletions
diff --git a/kgramstats.h b/kgramstats.h
index ff2fc66..a97d7bf 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -2,61 +2,89 @@
 #include <map>
 #include <list>
 #include <vector>
-#include "malaprop.h"
+#include "histogram.h"
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
-enum tokentype {
+struct word {
-  tokentype_literal,
+  std::string canon;
-  tokentype_hashtag
+  histogram<std::string> forms;
+  histogram<std::string> terms;
+  
+  word(std::string canon) : canon(canon) {}
+  
+  bool operator<(const word& other) const
+  {
+    return canon < other.canon;
+  }
 };
-struct token {
+extern word blank_word;
-  tokentype type;
-  std::string canon;
+enum class suffixtype {
-  bool terminating;
+  none,
+  terminating,
+  comma
+};
+enum class parentype {
+  paren,
+  square_bracket,
+  asterisk,
+  quote
+};
+enum class doublestatus {
+  opening,
+  closing,
+  both
+};
+struct delimiter {
+  parentype type;
+  doublestatus status;
+  
+  delimiter(parentype type, doublestatus status) : type(type), status(status) {}
  
-  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
+  bool operator<(const delimiter& other) const
-  token(tokentype type) : type(type), canon(""), terminating(false) {}
+  {
+    return std::tie(type, status) < std::tie(other.type, other.status);
+  }
+};
+struct token {
+  const word& w;
+  std::map<delimiter, int> delimiters;
+  suffixtype suffix;
+  std::string raw;
+    
+  token(const word& w) : w(w), suffix(suffixtype::none) {}
  
  bool operator<(const token& other) const
  {
-    if (type != other.type)
+    return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
-    {
-      return type < other.type;
-    } else if (type == tokentype_literal)
-    {
-      if (canon == other.canon)
-      {
-        return !terminating && other.terminating;
-      } else {
-        return canon < other.canon;
-      }
-    } else {
-      return !terminating && other.terminating;
-    }
  }
 };
-enum querytype {
+enum class querytype {
-  querytype_literal,
+  literal,
-  querytype_sentence
+  sentence
 };
 struct query {
  querytype type;
-  token word;
+  token tok;
  
-  query(token word) : word(word), type(querytype_literal) {}
+  query(token tok) : tok(tok), type(querytype::literal) {}
  
-  query(querytype type) : word(""), type(type) {}
+  query(querytype type) : tok(blank_word), type(type) {}
  
  bool operator<(const query& other) const
  {
    if (type == other.type)
    {
-      return word < other.word;
+      return tok < other.tok;
    } else {
      return type < other.type;
    }
@@ -65,34 +93,11 @@ struct query {
 typedef std::list<query> kgram;
-struct termstats {
-  char terminator;
-  int occurrences;
-  
-  termstats() : terminator('.'), occurrences(1) {}
-  
-  termstats(char terminator, int occurrences)
-  {
-    this->terminator = terminator;
-    this->occurrences = occurrences;
-  }
-  
-  bool operator<(const termstats& other) const
-  {
-    if (terminator == other.terminator)
-    {
-      return occurrences < other.occurrences;
-    } else {
-      return terminator < other.terminator;
-    }
-  }
-};
 class kgramstats
 {
 public:
        kgramstats(std::string corpus, int maxK);
-        std::vector<std::string> randomSentence(int n);
+        std::string randomSentence(int n);
        
 private:
        struct token_data
@@ -100,16 +105,15 @@ private:
                int all;
                int titlecase;
                int uppercase;
-    token word;
+    token tok;
    
-    token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
+    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
        };
  
        int maxK;
        std::map<kgram, std::map<int, token_data> > stats;
-  malaprop mstats;
+  word hashtags {"#hashtag"};
-  std::map<token, std::map<int, termstats> > endings;
+  std::map<std::string, word> words;
-  std::vector<std::string> hashtags;
 };
 void printKgram(kgram k);

diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -2,61 +2,89 @@
2	#include <map>	2	#include <map>
3	#include <list>	3	#include <list>
4	#include <vector>	4	#include <vector>
5	#include "malaprop.h"	5	#include "histogram.h"
6		6
7	#ifndef KGRAMSTATS_H	7	#ifndef KGRAMSTATS_H
8	#define KGRAMSTATS_H	8	#define KGRAMSTATS_H
9		9
10	enum tokentype {	10	struct word {
11	tokentype_literal,	11	std::string canon;
12	tokentype_hashtag	12	histogram<std::string> forms;
		13	histogram<std::string> terms;
		14
		15	word(std::string canon) : canon(canon) {}
		16
		17	bool operator<(const word& other) const
		18	{
		19	return canon < other.canon;
		20	}
13	};	21	};
14		22
15	struct token {	23	extern word blank_word;
16	tokentype type;	24
17	std::string canon;	25	enum class suffixtype {
18	bool terminating;	26	none,
		27	terminating,
		28	comma
		29	};
		30
		31	enum class parentype {
		32	paren,
		33	square_bracket,
		34	asterisk,
		35	quote
		36	};
		37
		38	enum class doublestatus {
		39	opening,
		40	closing,
		41	both
		42	};
		43
		44	struct delimiter {
		45	parentype type;
		46	doublestatus status;
		47
		48	delimiter(parentype type, doublestatus status) : type(type), status(status) {}
19		49
20	token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}	50	bool operator<(const delimiter& other) const
21	token(tokentype type) : type(type), canon(""), terminating(false) {}	51	{
		52	return std::tie(type, status) < std::tie(other.type, other.status);
		53	}
		54	};
		55
		56	struct token {
		57	const word& w;
		58	std::map<delimiter, int> delimiters;
		59	suffixtype suffix;
		60	std::string raw;
		61
		62	token(const word& w) : w(w), suffix(suffixtype::none) {}
22		63
23	bool operator<(const token& other) const	64	bool operator<(const token& other) const
24	{	65	{
25	if (type != other.type)	66	return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
26	{
27	return type < other.type;
28	} else if (type == tokentype_literal)
29	{
30	if (canon == other.canon)
31	{
32	return !terminating && other.terminating;
33	} else {
34	return canon < other.canon;
35	}
36	} else {
37	return !terminating && other.terminating;
38	}
39	}	67	}
40	};	68	};
41		69
42	enum querytype {	70	enum class querytype {
43	querytype_literal,	71	literal,
44	querytype_sentence	72	sentence
45	};	73	};
46		74
47	struct query {	75	struct query {
48	querytype type;	76	querytype type;
49	token word;	77	token tok;
50		78
51	query(token word) : word(word), type(querytype_literal) {}	79	query(token tok) : tok(tok), type(querytype::literal) {}
52		80
53	query(querytype type) : word(""), type(type) {}	81	query(querytype type) : tok(blank_word), type(type) {}
54		82
55	bool operator<(const query& other) const	83	bool operator<(const query& other) const
56	{	84	{
57	if (type == other.type)	85	if (type == other.type)
58	{	86	{
59	return word < other.word;	87	return tok < other.tok;
60	} else {	88	} else {
61	return type < other.type;	89	return type < other.type;
62	}	90	}
@@ -65,34 +93,11 @@ struct query {
65		93
66	typedef std::list<query> kgram;	94	typedef std::list<query> kgram;
67		95
68	struct termstats {
69	char terminator;
70	int occurrences;
71
72	termstats() : terminator('.'), occurrences(1) {}
73
74	termstats(char terminator, int occurrences)
75	{
76	this->terminator = terminator;
77	this->occurrences = occurrences;
78	}
79
80	bool operator<(const termstats& other) const
81	{
82	if (terminator == other.terminator)
83	{
84	return occurrences < other.occurrences;
85	} else {
86	return terminator < other.terminator;
87	}
88	}
89	};
90
91	class kgramstats	96	class kgramstats
92	{	97	{
93	public:	98	public:
94	kgramstats(std::string corpus, int maxK);	99	kgramstats(std::string corpus, int maxK);
95	std::vector<std::string> randomSentence(int n);	100	std::string randomSentence(int n);
96		101
97	private:	102	private:
98	struct token_data	103	struct token_data
@@ -100,16 +105,15 @@ private:
100	int all;	105	int all;
101	int titlecase;	106	int titlecase;
102	int uppercase;	107	int uppercase;
103	token word;	108	token tok;
104		109
105	token_data() : word(""), all(0), titlecase(0), uppercase(0) {}	110	token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
106	};	111	};
107		112
108	int maxK;	113	int maxK;
109	std::map<kgram, std::map<int, token_data> > stats;	114	std::map<kgram, std::map<int, token_data> > stats;
110	malaprop mstats;	115	word hashtags {"#hashtag"};
111	std::map<token, std::map<int, termstats> > endings;	116	std::map<std::string, word> words;
112	std::vector<std::string> hashtags;
113	};	117	};
114		118
115	void printKgram(kgram k);	119	void printKgram(kgram k);