about summary refs log tree commit diff stats
path: root/kgramstats.h
diff options
context:
space:
mode:
Diffstat (limited to 'kgramstats.h')
-rw-r--r--kgramstats.h84
1 files changed, 73 insertions, 11 deletions
diff --git a/kgramstats.h b/kgramstats.h index b01dece..ca61df7 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -7,7 +7,71 @@
7#ifndef KGRAMSTATS_H 7#ifndef KGRAMSTATS_H
8#define KGRAMSTATS_H 8#define KGRAMSTATS_H
9 9
/**
 * A token identified by its canonical string plus a `terminating` flag.
 *
 * operator< orders tokens by `canon` first and, for equal text, places
 * non-terminating tokens before terminating ones, so a token can be used as
 * the key of an ordered container such as std::map.
 *
 * NOTE(review): `terminating` presumably marks a token that ends a
 * sentence -- confirm against the implementation file.
 */
struct token {
  std::string canon;  // canonical text of the token
  bool terminating;   // always false right after construction

  /**
   * Builds a non-terminating token from its canonical text.
   *
   * The text is taken by const reference so it is copied exactly once (into
   * the member). The constructor is deliberately left non-explicit so that
   * existing implicit std::string -> token conversions keep compiling.
   */
  token(const std::string& canon) : canon(canon), terminating(false) {}

  /**
   * Strict ordering: by canonical text, then non-terminating < terminating.
   */
  bool operator<(const token& other) const
  {
    if (canon != other.canon)
    {
      return canon < other.canon;
    }

    // Same text: only "non-terminating vs. terminating" counts as less-than.
    return !terminating && other.terminating;
  }
};
26
/**
 * Discriminates the kinds of query element.
 *
 * The numeric order of the enumerators is significant: query::operator<
 * compares `type` values directly, so literal queries sort before sentence
 * queries.
 */
enum querytype {
  querytype_literal = 0,  // query built from a concrete token
  querytype_sentence = 1  // query built from a type only (word left empty)
};
31
32struct query {
33 querytype type;
34 token word;
35
36 query(token word) : word(word), type(querytype_literal) {}
37
38 query(querytype type) : word(""), type(type) {}
39
40 bool operator<(const query& other) const
41 {
42 if (type == other.type)
43 {
44 return word < other.word;
45 } else {
46 return type < other.type;
47 }
48 }
49};
50
51typedef std::list<query> kgram;
52
/**
 * A terminator character paired with an occurrence count.
 *
 * Instances are ordered by terminator character first and by occurrence
 * count second.
 */
struct termstats {
  char terminator;
  int occurrences;

  /**
   * Default: a '.' terminator with an occurrence count of 1.
   */
  termstats() : terminator('.'), occurrences(1) {}

  /**
   * Builds an entry from an explicit terminator and count.
   *
   * Uses a member initializer list, matching the default constructor,
   * instead of assigning through `this->` in the body.
   */
  termstats(char terminator, int occurrences)
    : terminator(terminator), occurrences(occurrences) {}

  /**
   * Strict ordering: by terminator, then by occurrences.
   */
  bool operator<(const termstats& other) const
  {
    if (terminator != other.terminator)
    {
      return terminator < other.terminator;
    }

    return occurrences < other.occurrences;
  }
};
11 75
12class kgramstats 76class kgramstats
13{ 77{
@@ -16,22 +80,20 @@ public:
16 std::vector<std::string> randomSentence(int n); 80 std::vector<std::string> randomSentence(int n);
17 81
18private: 82private:
19 typedef struct 83 struct token_data
20 { 84 {
21 int all; 85 int all;
22 int titlecase; 86 int titlecase;
23 int uppercase; 87 int uppercase;
24 int period; 88 token word;
25 int startquote; 89
26 int endquote; 90 token_data() : word(""), all(0), titlecase(0), uppercase(0) {}
27 int startparen; 91 };
28 int endparen; 92
29 int comma;
30 std::string* token;
31 } token_data;
32 int maxK; 93 int maxK;
33 std::map<kgram, std::map<int, token_data*>* >* stats; 94 std::map<kgram, std::map<int, token_data> > stats;
34 malaprop mstats; 95 malaprop mstats;
96 std::map<token, std::map<int, termstats> > endings;
35}; 97};
36 98
37void printKgram(kgram k); 99void printKgram(kgram k);