about summary refs log tree commit diff stats
path: root/kgramstats.h
diff options
context:
space:
mode:
authorKelly Rauchenberger <fefferburbia@gmail.com>2016-01-29 12:43:00 -0500
committerKelly Rauchenberger <fefferburbia@gmail.com>2016-01-29 12:43:00 -0500
commitb316e309559d7176af6cf0bb7dcd6dbaa83c01cd (patch)
treef21bd883ef7c4255a91d096ea105feaad135ee52 /kgramstats.h
parentfd1e9d59694c8a6ba201d2cdffec50f4f590841d (diff)
downloadrawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.gz
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.tar.bz2
rawr-ebooks-b316e309559d7176af6cf0bb7dcd6dbaa83c01cd.zip
Rewrote how tokens are handled
A 'word' is now an object that contains a distribution of forms that the word can take. For now, most words just contain one form, the canonical one. The only special use is currently hashtags.

Malapropisms have been disabled because of compatibility issues and because an upcoming feature is planned to replace them.
Diffstat (limited to 'kgramstats.h')
-rw-r--r--kgramstats.h124
1 files changed, 64 insertions, 60 deletions
diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -2,61 +2,89 @@
2#include <map> 2#include <map>
3#include <list> 3#include <list>
4#include <vector> 4#include <vector>
5#include "malaprop.h" 5#include "histogram.h"
6 6
7#ifndef KGRAMSTATS_H 7#ifndef KGRAMSTATS_H
8#define KGRAMSTATS_H 8#define KGRAMSTATS_H
9 9
10enum tokentype { 10struct word {
11 tokentype_literal, 11 std::string canon;
12 tokentype_hashtag 12 histogram<std::string> forms;
13 histogram<std::string> terms;
14
15 word(std::string canon) : canon(canon) {}
16
17 bool operator<(const word& other) const
18 {
19 return canon < other.canon;
20 }
13}; 21};
14 22
15struct token { 23extern word blank_word;
16 tokentype type; 24
17 std::string canon; 25enum class suffixtype {
18 bool terminating; 26 none,
27 terminating,
28 comma
29};
30
31enum class parentype {
32 paren,
33 square_bracket,
34 asterisk,
35 quote
36};
37
38enum class doublestatus {
39 opening,
40 closing,
41 both
42};
43
44struct delimiter {
45 parentype type;
46 doublestatus status;
47
48 delimiter(parentype type, doublestatus status) : type(type), status(status) {}
19 49
20 token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} 50 bool operator<(const delimiter& other) const
21 token(tokentype type) : type(type), canon(""), terminating(false) {} 51 {
52 return std::tie(type, status) < std::tie(other.type, other.status);
53 }
54};
55
56struct token {
57 const word& w;
58 std::map<delimiter, int> delimiters;
59 suffixtype suffix;
60 std::string raw;
61
62 token(const word& w) : w(w), suffix(suffixtype::none) {}
22 63
23 bool operator<(const token& other) const 64 bool operator<(const token& other) const
24 { 65 {
25 if (type != other.type) 66 return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
26 {
27 return type < other.type;
28 } else if (type == tokentype_literal)
29 {
30 if (canon == other.canon)
31 {
32 return !terminating && other.terminating;
33 } else {
34 return canon < other.canon;
35 }
36 } else {
37 return !terminating && other.terminating;
38 }
39 } 67 }
40}; 68};
41 69
42enum querytype { 70enum class querytype {
43 querytype_literal, 71 literal,
44 querytype_sentence 72 sentence
45}; 73};
46 74
47struct query { 75struct query {
48 querytype type; 76 querytype type;
49 token word; 77 token tok;
50 78
51 query(token word) : word(word), type(querytype_literal) {} 79 query(token tok) : tok(tok), type(querytype::literal) {}
52 80
53 query(querytype type) : word(""), type(type) {} 81 query(querytype type) : tok(blank_word), type(type) {}
54 82
55 bool operator<(const query& other) const 83 bool operator<(const query& other) const
56 { 84 {
57 if (type == other.type) 85 if (type == other.type)
58 { 86 {
59 return word < other.word; 87 return tok < other.tok;
60 } else { 88 } else {
61 return type < other.type; 89 return type < other.type;
62 } 90 }
@@ -65,34 +93,11 @@ struct query {
65 93
66typedef std::list<query> kgram; 94typedef std::list<query> kgram;
67 95
68struct termstats {
69 char terminator;
70 int occurrences;
71
72 termstats() : terminator('.'), occurrences(1) {}
73
74 termstats(char terminator, int occurrences)
75 {
76 this->terminator = terminator;
77 this->occurrences = occurrences;
78 }
79
80 bool operator<(const termstats& other) const
81 {
82 if (terminator == other.terminator)
83 {
84 return occurrences < other.occurrences;
85 } else {
86 return terminator < other.terminator;
87 }
88 }
89};
90
91class kgramstats 96class kgramstats
92{ 97{
93public: 98public:
94 kgramstats(std::string corpus, int maxK); 99 kgramstats(std::string corpus, int maxK);
95 std::vector<std::string> randomSentence(int n); 100 std::string randomSentence(int n);
96 101
97private: 102private:
98 struct token_data 103 struct token_data
@@ -100,16 +105,15 @@ private:
100 int all; 105 int all;
101 int titlecase; 106 int titlecase;
102 int uppercase; 107 int uppercase;
103 token word; 108 token tok;
104 109
105 token_data() : word(""), all(0), titlecase(0), uppercase(0) {} 110 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
106 }; 111 };
107 112
108 int maxK; 113 int maxK;
109 std::map<kgram, std::map<int, token_data> > stats; 114 std::map<kgram, std::map<int, token_data> > stats;
110 malaprop mstats; 115 word hashtags {"#hashtag"};
111 std::map<token, std::map<int, termstats> > endings; 116 std::map<std::string, word> words;
112 std::vector<std::string> hashtags;
113}; 117};
114 118
115void printKgram(kgram k); 119void printKgram(kgram k);