about summary refs log tree commit diff stats
path: root/kgramstats.h
diff options
context:
space:
mode:
Diffstat (limited to 'kgramstats.h')
-rw-r--r--kgramstats.h124
1 files changed, 64 insertions, 60 deletions
diff --git a/kgramstats.h b/kgramstats.h index ff2fc66..a97d7bf 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -2,61 +2,89 @@
2#include <map> 2#include <map>
3#include <list> 3#include <list>
4#include <vector> 4#include <vector>
5#include "malaprop.h" 5#include "histogram.h"
6 6
7#ifndef KGRAMSTATS_H 7#ifndef KGRAMSTATS_H
8#define KGRAMSTATS_H 8#define KGRAMSTATS_H
9 9
10enum tokentype { 10struct word {
11 tokentype_literal, 11 std::string canon;
12 tokentype_hashtag 12 histogram<std::string> forms;
13 histogram<std::string> terms;
14
15 word(std::string canon) : canon(canon) {}
16
17 bool operator<(const word& other) const
18 {
19 return canon < other.canon;
20 }
13}; 21};
14 22
15struct token { 23extern word blank_word;
16 tokentype type; 24
17 std::string canon; 25enum class suffixtype {
18 bool terminating; 26 none,
27 terminating,
28 comma
29};
30
31enum class parentype {
32 paren,
33 square_bracket,
34 asterisk,
35 quote
36};
37
38enum class doublestatus {
39 opening,
40 closing,
41 both
42};
43
44struct delimiter {
45 parentype type;
46 doublestatus status;
47
48 delimiter(parentype type, doublestatus status) : type(type), status(status) {}
19 49
20 token(std::string canon) : type(tokentype_literal), canon(canon), terminating(false) {} 50 bool operator<(const delimiter& other) const
21 token(tokentype type) : type(type), canon(""), terminating(false) {} 51 {
52 return std::tie(type, status) < std::tie(other.type, other.status);
53 }
54};
55
56struct token {
57 const word& w;
58 std::map<delimiter, int> delimiters;
59 suffixtype suffix;
60 std::string raw;
61
62 token(const word& w) : w(w), suffix(suffixtype::none) {}
22 63
23 bool operator<(const token& other) const 64 bool operator<(const token& other) const
24 { 65 {
25 if (type != other.type) 66 return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
26 {
27 return type < other.type;
28 } else if (type == tokentype_literal)
29 {
30 if (canon == other.canon)
31 {
32 return !terminating && other.terminating;
33 } else {
34 return canon < other.canon;
35 }
36 } else {
37 return !terminating && other.terminating;
38 }
39 } 67 }
40}; 68};
41 69
42enum querytype { 70enum class querytype {
43 querytype_literal, 71 literal,
44 querytype_sentence 72 sentence
45}; 73};
46 74
47struct query { 75struct query {
48 querytype type; 76 querytype type;
49 token word; 77 token tok;
50 78
51 query(token word) : word(word), type(querytype_literal) {} 79 query(token tok) : tok(tok), type(querytype::literal) {}
52 80
53 query(querytype type) : word(""), type(type) {} 81 query(querytype type) : tok(blank_word), type(type) {}
54 82
55 bool operator<(const query& other) const 83 bool operator<(const query& other) const
56 { 84 {
57 if (type == other.type) 85 if (type == other.type)
58 { 86 {
59 return word < other.word; 87 return tok < other.tok;
60 } else { 88 } else {
61 return type < other.type; 89 return type < other.type;
62 } 90 }
@@ -65,34 +93,11 @@ struct query {
65 93
66typedef std::list<query> kgram; 94typedef std::list<query> kgram;
67 95
68struct termstats {
69 char terminator;
70 int occurrences;
71
72 termstats() : terminator('.'), occurrences(1) {}
73
74 termstats(char terminator, int occurrences)
75 {
76 this->terminator = terminator;
77 this->occurrences = occurrences;
78 }
79
80 bool operator<(const termstats& other) const
81 {
82 if (terminator == other.terminator)
83 {
84 return occurrences < other.occurrences;
85 } else {
86 return terminator < other.terminator;
87 }
88 }
89};
90
91class kgramstats 96class kgramstats
92{ 97{
93public: 98public:
94 kgramstats(std::string corpus, int maxK); 99 kgramstats(std::string corpus, int maxK);
95 std::vector<std::string> randomSentence(int n); 100 std::string randomSentence(int n);
96 101
97private: 102private:
98 struct token_data 103 struct token_data
@@ -100,16 +105,15 @@ private:
100 int all; 105 int all;
101 int titlecase; 106 int titlecase;
102 int uppercase; 107 int uppercase;
103 token word; 108 token tok;
104 109
105 token_data() : word(""), all(0), titlecase(0), uppercase(0) {} 110 token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {}
106 }; 111 };
107 112
108 int maxK; 113 int maxK;
109 std::map<kgram, std::map<int, token_data> > stats; 114 std::map<kgram, std::map<int, token_data> > stats;
110 malaprop mstats; 115 word hashtags {"#hashtag"};
111 std::map<token, std::map<int, termstats> > endings; 116 std::map<std::string, word> words;
112 std::vector<std::string> hashtags;
113}; 117};
114 118
115void printKgram(kgram k); 119void printKgram(kgram k);