diff options
-rw-r--r-- | kgramstats.cpp | 86 |
1 files changed, 55 insertions, 31 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp index b0c3940..16bf598 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
@@ -66,7 +66,7 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
66 | newSentence = false; | 66 | newSentence = false; |
67 | } | 67 | } |
68 | 68 | ||
69 | if (newClause) | 69 | if (newClause || newSentence) |
70 | { | 70 | { |
71 | kgram commaKgram(1, ","); | 71 | kgram commaKgram(1, ","); |
72 | if (tstats[commaKgram] == NULL) | 72 | if (tstats[commaKgram] == NULL) |
@@ -78,12 +78,36 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
78 | 78 | ||
79 | newClause = false; | 79 | newClause = false; |
80 | } | 80 | } |
81 | 81 | ||
82 | if ((f.length() > 0) && ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))) | 82 | if ((f.length() > 0) && (f[f.length()-1] == '\n')) |
83 | { | 83 | { |
84 | td->period++; | 84 | td->period++; |
85 | newSentence = true; | 85 | newSentence = true; |
86 | } | 86 | f.resize(f.length()-1); |
87 | } | ||
88 | |||
89 | if (f.length() > 0) | ||
90 | { | ||
91 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) | ||
92 | { | ||
93 | if (!newSentence) | ||
94 | { | ||
95 | td->period++; | ||
96 | newSentence = true; | ||
97 | } | ||
98 | |||
99 | f.resize(f.length()-1); | ||
100 | } else if (f[f.length()-1] == ',') | ||
101 | { | ||
102 | if (!newSentence) | ||
103 | { | ||
104 | td->comma++; | ||
105 | newClause = true; | ||
106 | } | ||
107 | |||
108 | f.resize(f.length()-1); | ||
109 | } | ||
110 | } | ||
87 | 111 | ||
88 | if (f.length() > 0) | 112 | if (f.length() > 0) |
89 | { | 113 | { |
@@ -92,42 +116,42 @@ kgramstats::kgramstats(string corpus, int maxK) | |||
92 | td->startquote++; | 116 | td->startquote++; |
93 | } | 117 | } |
94 | 118 | ||
95 | if (f[f.length()-1] == '"') | 119 | if (f[0] == '(') |
96 | { | 120 | { |
97 | td->endquote++; | 121 | td->startparen++; |
98 | |||
99 | if ((f.length() > 1) && (f[f.length()-2] == ',')) | ||
100 | { | ||
101 | td->comma++; | ||
102 | newClause = true; | ||
103 | } | ||
104 | } | 122 | } |
105 | 123 | ||
106 | if (f[f.length()-1] == ',') | 124 | if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')')) |
107 | { | 125 | { |
108 | td->comma++; | 126 | if (f[f.length()-1] == '"') |
109 | newClause = true; | ||
110 | |||
111 | if ((f.length() > 1) && (f[f.length()-2] == '"')) | ||
112 | { | 127 | { |
113 | td->endquote++; | 128 | td->endquote++; |
129 | } else if (f[f.length()-1] == ')') | ||
130 | { | ||
131 | td->endparen++; | ||
114 | } | 132 | } |
115 | 133 | ||
116 | if ((f.length() > 1) && (f[f.length()-2] == ')')) | 134 | f.resize(f.length()-1); |
135 | |||
136 | if (f.length() > 0) | ||
117 | { | 137 | { |
118 | td->endparen++; | 138 | if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?')) |
139 | { | ||
140 | if (!newSentence) | ||
141 | { | ||
142 | td->period++; | ||
143 | newSentence = true; | ||
144 | } | ||
145 | } else if (f[f.length()-1] == ',') | ||
146 | { | ||
147 | if (!newSentence && !newClause) | ||
148 | { | ||
149 | td->comma++; | ||
150 | newClause = true; | ||
151 | } | ||
152 | } | ||
119 | } | 153 | } |
120 | } | 154 | } |
121 | |||
122 | if (f[0] == '(') | ||
123 | { | ||
124 | td->startparen++; | ||
125 | } | ||
126 | |||
127 | if (f[f.length()-1] == ')') | ||
128 | { | ||
129 | td->endparen++; | ||
130 | } | ||
131 | } | 155 | } |
132 | 156 | ||
133 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) | 157 | if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) |