diff options
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- | kgramstats.cpp | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp new file mode 100644 index 0000000..142b5aa --- /dev/null +++ b/kgramstats.cpp | |||
@@ -0,0 +1,110 @@ | |||
1 | #include "kgramstats.h" | ||
2 | #include <vector> | ||
3 | #include <iostream> | ||
4 | #include <cstdlib> | ||
5 | |||
6 | kgramstats::kgramstats(string corpus, int maxK) | ||
7 | { | ||
8 | this->maxK = maxK; | ||
9 | |||
10 | vector<string> tokens; | ||
11 | int start = 0; | ||
12 | int end = 0; | ||
13 | |||
14 | while (end != string::npos) | ||
15 | { | ||
16 | end = corpus.find(" ", start); | ||
17 | |||
18 | tokens.push_back(corpus.substr(start, (end == string::npos) ? string::npos : end - start)); | ||
19 | |||
20 | start = ((end > (string::npos - 1) ) ? string::npos : end + 1); | ||
21 | } | ||
22 | |||
23 | stats = new map<kgram, map<string, int>* >(); | ||
24 | for (int k=0; k<=maxK; k++) | ||
25 | { | ||
26 | for (int i=0; i<(tokens.size() - k); i++) | ||
27 | { | ||
28 | kgram seq(tokens.begin()+i, tokens.begin()+i+k); | ||
29 | string f = tokens[i+k]; | ||
30 | |||
31 | if ((*stats)[seq] == NULL) | ||
32 | { | ||
33 | (*stats)[seq] = new map<string, int>(); | ||
34 | } | ||
35 | |||
36 | (*((*stats)[seq]))[f]++; | ||
37 | } | ||
38 | } | ||
39 | } | ||
40 | |||
41 | map<string, int>* kgramstats::lookupExts(kgram tk) | ||
42 | { | ||
43 | return (*stats)[tk]; | ||
44 | } | ||
45 | |||
46 | int kgramstats::getMaxK() | ||
47 | { | ||
48 | return maxK; | ||
49 | } | ||
50 | |||
51 | void printKgram(kgram k) | ||
52 | { | ||
53 | for (kgram::iterator it = k.begin(); it != k.end(); it++) | ||
54 | { | ||
55 | cout << *it << " "; | ||
56 | } | ||
57 | cout << endl; | ||
58 | } | ||
59 | |||
60 | vector<string> kgramstats::randomSentence(int n) | ||
61 | { | ||
62 | vector<string> result; | ||
63 | list<string> cur; | ||
64 | |||
65 | for (int i=0; i<n; i++) | ||
66 | { | ||
67 | if ((rand() % 4) != 0) | ||
68 | { | ||
69 | for (int i=0; i<cur.size(); i++) | ||
70 | { | ||
71 | if ((rand() % 3) != 0) | ||
72 | { | ||
73 | cur.pop_front(); | ||
74 | } else { | ||
75 | break; | ||
76 | } | ||
77 | } | ||
78 | } | ||
79 | |||
80 | map<string, int>* probtable = lookupExts(cur); | ||
81 | int max = 0; | ||
82 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | ||
83 | { | ||
84 | max += it->second; | ||
85 | } | ||
86 | |||
87 | int r = rand() % (max+1); | ||
88 | string next = probtable->begin()->first; | ||
89 | for (map<string, int>::iterator it = probtable->begin(); it != probtable->end(); ++it) | ||
90 | { | ||
91 | if (it->second > r) | ||
92 | { | ||
93 | break; | ||
94 | } else { | ||
95 | next = it->first; | ||
96 | r -= it->second; | ||
97 | } | ||
98 | } | ||
99 | |||
100 | if (cur.size() == maxK) | ||
101 | { | ||
102 | cur.pop_front(); | ||
103 | } | ||
104 | |||
105 | cur.push_back(next); | ||
106 | result.push_back(next); | ||
107 | } | ||
108 | |||
109 | return result; | ||
110 | } \ No newline at end of file | ||