Pulled the ebooks functionality out into a library

author: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-20 23:14:06 -0400
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2016-05-20 23:15:10 -0400
commit: 8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch)
tree: 0d9a8a12616d6ea335fdc687049b05f679e8ccc6 /kgramstats.h
parent: a9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff)
download: rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz
          rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2
          rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip
1 file changed, 106 insertions, 95 deletions
diff --git a/kgramstats.h b/kgramstats.h
index 5fad37d..ee75ada 100644
--- a/kgramstats.h
+++ b/kgramstats.h

@@ -1,124 +1,135 @@
+#ifndef KGRAMSTATS_H
+#define KGRAMSTATS_H
 #include <string>
 #include <map>
 #include <list>
 #include <vector>
 #include "histogram.h"
+#include <functional>
-#ifndef KGRAMSTATS_H
+class rawr {
-#define KGRAMSTATS_H
+  public:
+    typedef std::function<std::string(std::string, std::string)> transform_callback;
-struct word {
+    
-  std::string canon;
+    void addCorpus(std::string corpus);
-  histogram<std::string> forms;
+    void compile(int maxK);
-  histogram<std::string> terms;
+    
+    void setTransformCallback(transform_callback _arg);
+        std::string randomSentence(int maxL);
+        
+  private:
+    struct word {
+      std::string canon;
+      histogram<std::string> forms;
+      histogram<std::string> terms;
  
-  word(std::string canon) : canon(canon) {}
+      word(std::string canon) : canon(canon) {}
  
-  bool operator<(const word& other) const
+      bool operator<(const word& other) const
-  {
+      {
-    return canon < other.canon;
+        return canon < other.canon;
-  }
+      }
-};
+    };
-extern word blank_word;
-enum class suffixtype {
+    enum class suffixtype {
-  none,
+      none,
-  terminating,
+      terminating,
-  comma
+      comma
-};
+    };
-enum class parentype {
+    enum class parentype {
-  paren,
+      paren,
-  square_bracket,
+      square_bracket,
-  asterisk,
+      asterisk,
-  quote
+      quote
-};
+    };
-enum class doublestatus {
+    enum class doublestatus {
-  opening,
+      opening,
-  closing,
+      closing,
-  both
+      both
-};
+    };
-struct delimiter {
+    struct delimiter {
-  parentype type;
+      parentype type;
-  doublestatus status;
+      doublestatus status;
  
-  delimiter(parentype type, doublestatus status) : type(type), status(status) {}
+      delimiter(parentype type, doublestatus status) : type(type), status(status) {}
  
-  bool operator<(const delimiter& other) const
+      bool operator<(const delimiter& other) const
-  {
+      {
-    return std::tie(type, status) < std::tie(other.type, other.status);
+        return std::tie(type, status) < std::tie(other.type, other.status);
-  }
+      }
-};
+    };
-struct token {
+    struct token {
-  const word& w;
+      const word& w;
-  std::map<delimiter, int> delimiters;
+      std::map<delimiter, int> delimiters;
-  suffixtype suffix;
+      suffixtype suffix;
-  std::string raw;
+      std::string raw;
    
-  token(const word& w) : w(w), suffix(suffixtype::none) {}
+      token(const word& w) : w(w), suffix(suffixtype::none) {}
  
-  bool operator<(const token& other) const
+      bool operator<(const token& other) const
-  {
+      {
-    return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
+        return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
-  }
+      }
-};
+    };
-enum class querytype {
+    enum class querytype {
-  literal,
+      literal,
-  sentence
+      sentence
-};
+    };
-struct query {
+    struct query {
-  querytype type;
+      querytype type;
-  token tok;
+      token tok;
  
-  query(token tok) : tok(tok), type(querytype::literal) {}
+      query(token tok) : tok(tok), type(querytype::literal) {}
  
-  query(querytype type) : tok(blank_word), type(type) {}
+      query(querytype type) : tok(blank_word), type(type) {}
  
-  bool operator<(const query& other) const
+      bool operator<(const query& other) const
-  {
+      {
-    if (type == other.type)
+        if (type == other.type)
-    {
+        {
-      return tok < other.tok;
+          return tok < other.tok;
-    } else {
+        } else {
-      return type < other.type;
+          return type < other.type;
-    }
+        }
-  }
+      }
-};
+    };
+    
-typedef std::list<query> kgram;
+    static const query wildcardQuery;
+    static const word blank_word;
-class kgramstats
+    typedef std::list<query> kgram;
-{
-public:
-        kgramstats(std::string corpus, int maxK);
-        std::string randomSentence(int maxL);
-        
-private:
-        struct token_data
-        {
-                int all;
-                int titlecase;
-                int uppercase;
-    token tok;
    
-    token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
+        struct token_data
-        };
+        {
+                int all;
+                int titlecase;
+                int uppercase;
+      token tok;
+    
+      token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
+        };
+    
+    friend std::ostream& operator<<(std::ostream& os, kgram k);
+    friend std::ostream& operator<<(std::ostream& os, query q);
+    friend std::ostream& operator<<(std::ostream& os, token t);
  
-        int maxK;
+        int _maxK;
-        std::map<kgram, std::map<int, token_data> > stats;
+    bool _compiled = false; 
+    std::vector<std::string> _corpora;
+        std::map<kgram, std::map<int, token_data>> _stats;
+    transform_callback _transform;
  
-  // Words
+    // Words
-  std::map<std::string, word> words;
+    std::map<std::string, word> words;
-  word hashtags {"#hashtag"};
+    word hashtags {"#hashtag"};
-  word emoticons {"👌"};
+    word emoticons {"👌"};
 };
-void printKgram(kgram k);
 #endif
 \ No newline at end of file
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-05-20 23:14:06 -0400
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2016-05-20 23:15:10 -0400
commit	8c3022e759191e90b5e12bcb6b0b5a6a48b37840 (patch)
tree	0d9a8a12616d6ea335fdc687049b05f679e8ccc6 /kgramstats.h
parent	a9c391efd5f0f73b5374dcfd807cdf59ed663e6b (diff)
download	rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.gz rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.tar.bz2 rawr-ebooks-8c3022e759191e90b5e12bcb6b0b5a6a48b37840.zip

diff --git a/kgramstats.h b/kgramstats.h index 5fad37d..ee75ada 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -1,124 +1,135 @@
		1	#ifndef KGRAMSTATS_H
		2	#define KGRAMSTATS_H
		3
1	#include <string>	4	#include <string>
2	#include <map>	5	#include <map>
3	#include <list>	6	#include <list>
4	#include <vector>	7	#include <vector>
5	#include "histogram.h"	8	#include "histogram.h"
		9	#include <functional>
6		10
7	#ifndef KGRAMSTATS_H	11	class rawr {
8	#define KGRAMSTATS_H	12	public:
9		13	typedef std::function<std::string(std::string, std::string)> transform_callback;
10	struct word {	14
11	std::string canon;	15	void addCorpus(std::string corpus);
12	histogram<std::string> forms;	16	void compile(int maxK);
13	histogram<std::string> terms;	17
		18	void setTransformCallback(transform_callback _arg);
		19	std::string randomSentence(int maxL);
		20
		21	private:
		22	struct word {
		23	std::string canon;
		24	histogram<std::string> forms;
		25	histogram<std::string> terms;
14		26
15	word(std::string canon) : canon(canon) {}	27	word(std::string canon) : canon(canon) {}
16		28
17	bool operator<(const word& other) const	29	bool operator<(const word& other) const
18	{	30	{
19	return canon < other.canon;	31	return canon < other.canon;
20	}	32	}
21	};	33	};
22
23	extern word blank_word;
24		34
25	enum class suffixtype {	35	enum class suffixtype {
26	none,	36	none,
27	terminating,	37	terminating,
28	comma	38	comma
29	};	39	};
30		40
31	enum class parentype {	41	enum class parentype {
32	paren,	42	paren,
33	square_bracket,	43	square_bracket,
34	asterisk,	44	asterisk,
35	quote	45	quote
36	};	46	};
37		47
38	enum class doublestatus {	48	enum class doublestatus {
39	opening,	49	opening,
40	closing,	50	closing,
41	both	51	both
42	};	52	};
43		53
44	struct delimiter {	54	struct delimiter {
45	parentype type;	55	parentype type;
46	doublestatus status;	56	doublestatus status;
47		57
48	delimiter(parentype type, doublestatus status) : type(type), status(status) {}	58	delimiter(parentype type, doublestatus status) : type(type), status(status) {}
49		59
50	bool operator<(const delimiter& other) const	60	bool operator<(const delimiter& other) const
51	{	61	{
52	return std::tie(type, status) < std::tie(other.type, other.status);	62	return std::tie(type, status) < std::tie(other.type, other.status);
53	}	63	}
54	};	64	};
55		65
56	struct token {	66	struct token {
57	const word& w;	67	const word& w;
58	std::map<delimiter, int> delimiters;	68	std::map<delimiter, int> delimiters;
59	suffixtype suffix;	69	suffixtype suffix;
60	std::string raw;	70	std::string raw;
61		71
62	token(const word& w) : w(w), suffix(suffixtype::none) {}	72	token(const word& w) : w(w), suffix(suffixtype::none) {}
63		73
64	bool operator<(const token& other) const	74	bool operator<(const token& other) const
65	{	75	{
66	return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);	76	return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
67	}	77	}
68	};	78	};
69		79
70	enum class querytype {	80	enum class querytype {
71	literal,	81	literal,
72	sentence	82	sentence
73	};	83	};
74		84
75	struct query {	85	struct query {
76	querytype type;	86	querytype type;
77	token tok;	87	token tok;
78		88
79	query(token tok) : tok(tok), type(querytype::literal) {}	89	query(token tok) : tok(tok), type(querytype::literal) {}
80		90
81	query(querytype type) : tok(blank_word), type(type) {}	91	query(querytype type) : tok(blank_word), type(type) {}
82		92
83	bool operator<(const query& other) const	93	bool operator<(const query& other) const
84	{	94	{
85	if (type == other.type)	95	if (type == other.type)
86	{	96	{
87	return tok < other.tok;	97	return tok < other.tok;
88	} else {	98	} else {
89	return type < other.type;	99	return type < other.type;
90	}	100	}
91	}	101	}
92	};	102	};
93		103
94	typedef std::list<query> kgram;	104	static const query wildcardQuery;
		105	static const word blank_word;
95		106
96	class kgramstats	107	typedef std::list<query> kgram;
97	{
98	public:
99	kgramstats(std::string corpus, int maxK);
100	std::string randomSentence(int maxL);
101
102	private:
103	struct token_data
104	{
105	int all;
106	int titlecase;
107	int uppercase;
108	token tok;
109		108
110	token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}	109	struct token_data
111	};	110	{
		111	int all;
		112	int titlecase;
		113	int uppercase;
		114	token tok;
		115
		116	token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
		117	};
		118
		119	friend std::ostream& operator<<(std::ostream& os, kgram k);
		120	friend std::ostream& operator<<(std::ostream& os, query q);
		121	friend std::ostream& operator<<(std::ostream& os, token t);
112		122
113	int maxK;	123	int _maxK;
114	std::map<kgram, std::map<int, token_data> > stats;	124	bool _compiled = false;
		125	std::vector<std::string> _corpora;
		126	std::map<kgram, std::map<int, token_data>> _stats;
		127	transform_callback _transform;
115		128
116	// Words	129	// Words
117	std::map<std::string, word> words;	130	std::map<std::string, word> words;
118	word hashtags {"#hashtag"};	131	word hashtags {"#hashtag"};
119	word emoticons {"👌"};	132	word emoticons {"👌"};
120	};	133	};
121		134
122	void printKgram(kgram k);
123
124	#endif \ No newline at end of file	135	#endif \ No newline at end of file