Rewrote how tokens are handled

A 'word' is now an object that contains a distribution of forms that word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags. Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace them.
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-29 12:43:00 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-01-29 12:43:00 -0500
commit: b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch)
tree: f21bd883ef7c4255a91d096ea105feaad135ee52 /kgramstats.h
parent: fd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff)
download: rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip
1 files changed, 64 insertions, 60 deletions
diff --git a/kgramstats.h b/kgramstats.h
index ff2fc66..a97d7bf 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -2,61 +2,89 @@
 #include <map>
 #include <list>
 #include <vector>
-#include "malaprop.h"
+#include "histogram.h"
 #ifndef KGRAMSTATS_H
 #define KGRAMSTATS_H
-enum tokentype {
+struct word {
-  tokentype_literal,
+  std::string canon;
-  tokentype_hashtag
+  histogram<std::string> forms;
+  histogram<std::string> terms;
+  
+  word(std::string canon) : canon(canon) {}
+  
+  bool operator<(const word& other) const
+  {
+    return canon < other.canon;
+  }
 };
-struct token {
+extern word blank_word;
-  tokentype type;
-  std::string canon;
+enum class suffixtype {
-  bool terminating;
+  none,
+  terminating,
+  comma
+};
+enum class parentype {
+  paren,
+  square_bracket,
+  asterisk,
+  quote
+};
+enum class doublestatus {
+  opening,
+  closing,
+  both
+};
+struct delimiter {
+  parentype type;
+  doublestatus status;
+  
+  delimiter(parentype type, doublestatus status) : type(type), status(status) {}
  
-  token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}
+  bool operator<(const delimiter& other) const
-  token(tokentype type) : type(type), canon(""), terminating(false) {}
+  {
+    return std::tie(type, status) < std::tie(other.type, other.status);
+  }
+};
+struct token {
+  const word& w;
+  std::map<delimiter, int> delimiters;
+  suffixtype suffix;
+  std::string raw;
+    
+  token(const word& w) : w(w), suffix(suffixtype::none) {}
  
  bool operator<(const token& other) const
  {
-    if (type != other.type)
+    return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
-    {
-      return type < other.type;
-    } else if (type == tokentype_literal)
-    {
-      if (canon == other.canon)
-      {
-        return !terminating && other.terminating;
-      } else {
-        return canon < other.canon;
-      }
-    } else {
-      return !terminating && other.terminating;
-    }
  }
 };
-enum querytype {
+enum class querytype {
-  querytype_literal,
+  literal,
-  querytype_sentence
+  sentence
 };
 struct query {
  querytype type;
-  token word;
+  token tok;
  
-  query(token word) : word(word), type(querytype_literal) {}
+  query(token tok) : tok(tok), type(querytype::literal) {}
  
-  query(querytype type) : word(""), type(type) {}
+  query(querytype type) : tok(blank_word), type(type) {}
  
  bool operator<(const query& other) const
  {
    if (type == other.type)
    {
-      return word < other.word;
+      return tok < other.tok;
    } else {
      return type < other.type;
    }
@@ -65,34 +93,11 @@ struct query {
 typedef std::list<query> kgram;
-struct termstats {
-  char terminator;
-  int occurrences;
-  
-  termstats() : terminator('.'), occurrences(1) {}
-  
-  termstats(char terminator, int occurrences)
-  {
-    this->terminator = terminator;
-    this->occurrences = occurrences;
-  }
-  
-  bool operator<(const termstats& other) const
-  {
-    if (terminator == other.terminator)
-    {
-      return occurrences < other.occurrences;
-    } else {
-      return terminator < other.terminator;
-    }
-  }
-};
 class kgramstats
 {
 public:
        kgramstats(std::string corpus, int maxK);
-        std::vector<std::string> randomSentence(int n);
+        std::string randomSentence(int n);
        
 private:
        struct token_data
@@ -100,16 +105,15 @@ private:
                int all;
                int titlecase;
                int uppercase;
-    token word;
+    token tok;
    
-    token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
+    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
        };
  
        int maxK;
        std::map<kgram, std::map<int, token_data> > stats;
-  malaprop mstats;
+  word hashtags {"#hashtag"};
-  std::map<token, std::map<int, termstats> > endings;
+  std::map<std::string, word> words;
-  std::vector<std::string> hashtags;
 };
 void printKgram(kgram k);
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-29 12:43:00 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-01-29 12:43:00 -0500
commit	b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch)
tree	f21bd883ef7c4255a91d096ea105feaad135ee52 /kgramstats.h
parent	fd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff)
download	rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2 rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip

diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -2,61 +2,89 @@
2	#include <map>	2	#include <map>
3	#include <list>	3	#include <list>
4	#include <vector>	4	#include <vector>
5	#include "malaprop.h"	5	#include "histogram.h"
6		6
7	#ifndef KGRAMSTATS_H	7	#ifndef KGRAMSTATS_H
8	#define KGRAMSTATS_H	8	#define KGRAMSTATS_H
9		9
10	enum tokentype {	10	struct word {
11	tokentype_literal,	11	std::string canon;
12	tokentype_hashtag	12	histogram<std::string> forms;
		13	histogram<std::string> terms;
		14
		15	word(std::string canon) : canon(canon) {}
		16
		17	bool operator<(const word& other) const
		18	{
		19	return canon < other.canon;
		20	}
13	};	21	};
14		22
15	struct token {	23	extern word blank_word;
16	tokentype type;	24
17	std::string canon;	25	enum class suffixtype {
18	bool terminating;	26	none,
		27	terminating,
		28	comma
		29	};
		30
		31	enum class parentype {
		32	paren,
		33	square_bracket,
		34	asterisk,
		35	quote
		36	};
		37
		38	enum class doublestatus {
		39	opening,
		40	closing,
		41	both
		42	};
		43
		44	struct delimiter {
		45	parentype type;
		46	doublestatus status;
		47
		48	delimiter(parentype type, doublestatus status) : type(type), status(status) {}
19		49
20	token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {}	50	bool operator<(const delimiter& other) const
21	token(tokentype type) : type(type), canon(""), terminating(false) {}	51	{
		52	return std::tie(type, status) < std::tie(other.type, other.status);
		53	}
		54	};
		55
		56	struct token {
		57	const word& w;
		58	std::map<delimiter, int> delimiters;
		59	suffixtype suffix;
		60	std::string raw;
		61
		62	token(const word& w) : w(w), suffix(suffixtype::none) {}
22		63
23	bool operator<(const token& other) const	64	bool operator<(const token& other) const
24	{	65	{
25	if (type != other.type)	66	return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
26	{
27	return type < other.type;
28	} else if (type == tokentype_literal)
29	{
30	if (canon == other.canon)
31	{
32	return !terminating && other.terminating;
33	} else {
34	return canon < other.canon;
35	}
36	} else {
37	return !terminating && other.terminating;
38	}
39	}	67	}
40	};	68	};
41		69
42	enum querytype {	70	enum class querytype {
43	querytype_literal,	71	literal,
44	querytype_sentence	72	sentence
45	};	73	};
46		74
47	struct query {	75	struct query {
48	querytype type;	76	querytype type;
49	token word;	77	token tok;
50		78
51	query(token word) : word(word), type(querytype_literal) {}	79	query(token tok) : tok(tok), type(querytype::literal) {}
52		80
53	query(querytype type) : word(""), type(type) {}	81	query(querytype type) : tok(blank_word), type(type) {}
54		82
55	bool operator<(const query& other) const	83	bool operator<(const query& other) const
56	{	84	{
57	if (type == other.type)	85	if (type == other.type)
58	{	86	{
59	return word < other.word;	87	return tok < other.tok;
60	} else {	88	} else {
61	return type < other.type;	89	return type < other.type;
62	}	90	}
@@ -65,34 +93,11 @@ struct query {
65		93
66	typedef std::list<query> kgram;	94	typedef std::list<query> kgram;
67		95
68	struct termstats {
69	char terminator;
70	int occurrences;
71
72	termstats() : terminator('.'), occurrences(1) {}
73
74	termstats(char terminator, int occurrences)
75	{
76	this->terminator = terminator;
77	this->occurrences = occurrences;
78	}
79
80	bool operator<(const termstats& other) const
81	{
82	if (terminator == other.terminator)
83	{
84	return occurrences < other.occurrences;
85	} else {
86	return terminator < other.terminator;
87	}
88	}
89	};
90
91	class kgramstats	96	class kgramstats
92	{	97	{
93	public:	98	public:
94	kgramstats(std::string corpus, int maxK);	99	kgramstats(std::string corpus, int maxK);
95	std::vector<std::string> randomSentence(int n);	100	std::string randomSentence(int n);
96		101
97	private:	102	private:
98	struct token_data	103	struct token_data
@@ -100,16 +105,15 @@ private:
100	int all;	105	int all;
101	int titlecase;	106	int titlecase;
102	int uppercase;	107	int uppercase;
103	token word;	108	token tok;
104		109
105	token_data() : word(""), all(0), titlecase(0), uppercase(0) {}	110	token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
106	};	111	};
107		112
108	int maxK;	113	int maxK;
109	std::map<kgram, std::map<int, token_data> > stats;	114	std::map<kgram, std::map<int, token_data> > stats;
110	malaprop mstats;	115	word hashtags {"#hashtag"};
111	std::map<token, std::map<int, termstats> > endings;	116	std::map<std::string, word> words;
112	std::vector<std::string> hashtags;
113	};	117	};
114		118
115	void printKgram(kgram k);	119	void printKgram(kgram k);