1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
#include <string>
#include <map>
#include <list>
#include <vector>
#include "malaprop.h"
#ifndef KGRAMSTATS_H
#define KGRAMSTATS_H
// A word as stored in the statistics tables: its `canon` string plus a flag
// saying whether this occurrence is marked as terminating.
// NOTE(review): `canon` is presumably the canonicalised spelling of the word;
// confirm against the code that builds tokens.
struct token {
  std::string canon;
  bool terminating;  // always false right after construction

  // Not marked explicit, so a std::string converts implicitly to a token.
  token(std::string canon) : canon(canon), terminating(false) {}

  // Orders tokens by `canon`; when the strings are equal, a non-terminating
  // token sorts before a terminating one.
  bool operator<(const token& other) const
  {
    if (canon != other.canon)
    {
      return canon < other.canon;
    }

    return !terminating && other.terminating;
  }
};
// Kind tag carried by every `query`. The numeric order of the enumerators is
// significant: query::operator< compares tags with `<` when they differ.
enum querytype {
  // Tag assigned when a query is built from a token.
  querytype_literal = 0,
  // Tag for a query that carries no real word (its word is the empty token).
  // NOTE(review): presumably marks a sentence boundary; confirm in the .cpp.
  querytype_sentence = 1
};
// One element of a kgram: a type tag plus the token it refers to.
//
// Members are initialised in declaration order (`type`, then `word`), so the
// constructor initialiser lists below are written in that same order to avoid
// -Wreorder warnings and any confusion about which member is set first.
struct query {
  querytype type;
  token word;

  // Wraps a token as a literal query. Not explicit, so a token converts
  // implicitly to a query.
  query(token word) : type(querytype_literal), word(word) {}

  // Builds a query of the given type whose word is the empty token.
  query(querytype type) : type(type), word("") {}

  // Orders queries by type tag first; queries with the same tag are ordered
  // by their tokens (see token::operator<).
  bool operator<(const query& other) const
  {
    if (type != other.type)
    {
      return type < other.type;
    }

    return word < other.word;
  }
};
// A k-gram: an ordered sequence of query elements. It is used as the key of
// kgramstats::stats, so its ordering is std::list's lexicographic comparison
// built on query::operator<.
typedef std::list<query> kgram;
// A terminator character paired with an occurrence count.
// NOTE(review): presumably records how often a word was followed by a given
// sentence-ending punctuation mark; confirm against the code that fills
// kgramstats::endings.
struct termstats {
  char terminator;
  int occurrences;

  // Default entry: a '.' terminator with a count of one.
  termstats() : terminator('.'), occurrences(1) {}

  // Entry with an explicit terminator and count.
  termstats(char terminator, int occurrences)
    : terminator(terminator), occurrences(occurrences)
  {
  }

  // Orders by terminator character first, then by occurrence count.
  bool operator<(const termstats& other) const
  {
    if (terminator != other.terminator)
    {
      return terminator < other.terminator;
    }

    return occurrences < other.occurrences;
  }
};
// Holds k-gram statistics and exposes sentence generation.
// Both public member functions are only declared here; their definitions live
// outside this header.
// NOTE(review): presumably the constructor analyses `corpus` for k-grams up to
// length `maxK`, and randomSentence(n) generates text from those statistics.
// The meaning of `n` and of the returned strings is not visible here; confirm
// against the implementation.
class kgramstats
{
public:
  kgramstats(std::string corpus, int maxK);
  std::vector<std::string> randomSentence(int n);

private:
  // Per-token record stored in `stats`. All counters start at zero and the
  // word starts as the empty token.
  // NOTE(review): `titlecase`/`uppercase` presumably count capitalisation
  // variants of the word and `all` the total; confirm in the .cpp.
  struct token_data
  {
    int all;
    int titlecase;
    int uppercase;
    token word;

    // Initialisers are listed in declaration order; members are always
    // initialised in that order regardless of how the list is written, and a
    // mismatched list triggers -Wreorder.
    token_data() : all(0), titlecase(0), uppercase(0), word("") {}
  };

  int maxK;

  // Maps each k-gram to an int-keyed table of token records.
  // NOTE(review): the meaning of the int key is not visible in this header.
  std::map<kgram, std::map<int, token_data> > stats;

  // Project-defined helper from "malaprop.h".
  malaprop mstats;

  // Maps each token to an int-keyed table of terminator statistics.
  // NOTE(review): the meaning of the int key is not visible in this header.
  std::map<token, std::map<int, termstats> > endings;
};
// Declared here, defined elsewhere. Takes the kgram by value, so the list is
// copied on each call.
// NOTE(review): presumably writes a human-readable form of `k` for debugging;
// the output destination and format are not visible from this header.
void printKgram(kgram k);
#endif
|