diff options
author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-04 23:16:17 -0500 |
---|---|---|
committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-01-04 23:16:17 -0500 |
commit | 9e89002477d1358de9be9cabdc1edba26bd32836 (patch) | |
tree | 9afb52740fe4f618105d014a816df26b36ed83f6 /kgramstats.h | |
parent | 0a5c6bd740aff9be53e7ef117e9e926fde3c289e (diff) | |
download | rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.gz rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.tar.bz2 rawr-ebooks-9e89002477d1358de9be9cabdc1edba26bd32836.zip |
Rewrote quite a bit of kgramstats
The algorithm still treats most tokens literally, but now groups together tokens that terminate a clause in some way (i.e., tokens that contain any of the characters .?!,), without distinguishing between the different terminating characters. For each word that can terminate a sentence, the algorithm creates a histogram of the terminating characters and the number of occurrences of those characters for that word (the number of occurrences is tracked so that things like um???? and um,,,,, can still be folded down into um.). Then, when the terminating version of that token is invoked, a random terminating string is added to that token based on the histogram for that word (again, to allow things like the desu-ly use of multiple commas to end clauses). The algorithm now also has a slightly more advanced kgram structure: a special "sentence wildcard" kgram value, which can match any terminating token, is set aside from normal strings of tokens. This kgram value is never printed; it is only ever present in the query kgrams and cannot actually be present in the histograms, because it is of a different datatype. It is used at the beginning of sentence generation to make sure that the first couple of words generated actually form the beginning of a sentence instead of picking up somewhere in the middle of a sentence. It is also used to reset sentence generation on the rare occasion that the end of the corpus is reached.
Diffstat (limited to 'kgramstats.h')
-rw-r--r-- | kgramstats.h | 84 |
1 files changed, 73 insertions, 11 deletions
diff --git a/kgramstats.h b/kgramstats.h index b01dece..ca61df7 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
@@ -7,7 +7,71 @@ | |||
7 | #ifndef KGRAMSTATS_H | 7 | #ifndef KGRAMSTATS_H |
8 | #define KGRAMSTATS_H | 8 | #define KGRAMSTATS_H |
9 | 9 | ||
10 | typedef std::list<std::string> kgram; | 10 | struct token { |
11 | std::string canon; | ||
12 | bool terminating; | ||
13 | |||
14 | token(std::string canon) : canon(canon), terminating(false) {} | ||
15 | |||
16 | bool operator<(const token& other) const | ||
17 | { | ||
18 | if (canon == other.canon) | ||
19 | { | ||
20 | return !terminating && other.terminating; | ||
21 | } else { | ||
22 | return canon < other.canon; | ||
23 | } | ||
24 | } | ||
25 | }; | ||
26 | |||
27 | enum querytype { | ||
28 | querytype_literal, | ||
29 | querytype_sentence | ||
30 | }; | ||
31 | |||
32 | struct query { | ||
33 | querytype type; | ||
34 | token word; | ||
35 | |||
36 | query(token word) : word(word), type(querytype_literal) {} | ||
37 | |||
38 | query(querytype type) : word(""), type(type) {} | ||
39 | |||
40 | bool operator<(const query& other) const | ||
41 | { | ||
42 | if (type == other.type) | ||
43 | { | ||
44 | return word < other.word; | ||
45 | } else { | ||
46 | return type < other.type; | ||
47 | } | ||
48 | } | ||
49 | }; | ||
50 | |||
51 | typedef std::list<query> kgram; | ||
52 | |||
53 | struct termstats { | ||
54 | char terminator; | ||
55 | int occurrences; | ||
56 | |||
57 | termstats() : terminator('.'), occurrences(1) {} | ||
58 | |||
59 | termstats(char terminator, int occurrences) | ||
60 | { | ||
61 | this->terminator = terminator; | ||
62 | this->occurrences = occurrences; | ||
63 | } | ||
64 | |||
65 | bool operator<(const termstats& other) const | ||
66 | { | ||
67 | if (terminator == other.terminator) | ||
68 | { | ||
69 | return occurrences < other.occurrences; | ||
70 | } else { | ||
71 | return terminator < other.terminator; | ||
72 | } | ||
73 | } | ||
74 | }; | ||
11 | 75 | ||
12 | class kgramstats | 76 | class kgramstats |
13 | { | 77 | { |
@@ -16,22 +80,20 @@ public: | |||
16 | std::vector<std::string> randomSentence(int n); | 80 | std::vector<std::string> randomSentence(int n); |
17 | 81 | ||
18 | private: | 82 | private: |
19 | typedef struct | 83 | struct token_data |
20 | { | 84 | { |
21 | int all; | 85 | int all; |
22 | int titlecase; | 86 | int titlecase; |
23 | int uppercase; | 87 | int uppercase; |
24 | int period; | 88 | token word; |
25 | int startquote; | 89 | |
26 | int endquote; | 90 | token_data() : word(""), all(0), titlecase(0), uppercase(0) {} |
27 | int startparen; | 91 | }; |
28 | int endparen; | 92 | |
29 | int comma; | ||
30 | std::string* token; | ||
31 | } token_data; | ||
32 | int maxK; | 93 | int maxK; |
33 | std::map<kgram, std::map<int, token_data*>* >* stats; | 94 | std::map<kgram, std::map<int, token_data> > stats; |
34 | malaprop mstats; | 95 | malaprop mstats; |
96 | std::map<token, std::map<int, termstats> > endings; | ||
35 | }; | 97 | }; |
36 | 98 | ||
37 | void printKgram(kgram k); | 99 | void printKgram(kgram k); |