about summary refs log tree commit diff stats
path: root/kgramstats.cpp
diff options
context:
space:
mode:
author Kelly Rauchenberger <fefferburbia@gmail.com> 2015-07-24 16:17:42 -0400
committer Kelly Rauchenberger <fefferburbia@gmail.com> 2015-07-24 16:17:42 -0400
commit 1a1d503dbb9530a9d3518deef1ad40727edc1736 (patch)
tree 4757b2cd0fd3a979fa1fad590fdbb1bba2ae9a8d /kgramstats.cpp
parent d4db2fd99715dd731327cf485bd219331b6af781 (diff)
download rawr-ebooks-1a1d503dbb9530a9d3518deef1ad40727edc1736.tar.gz
rawr-ebooks-1a1d503dbb9530a9d3518deef1ad40727edc1736.tar.bz2
rawr-ebooks-1a1d503dbb9530a9d3518deef1ad40727edc1736.zip
Added some newline recognition
Diffstat (limited to 'kgramstats.cpp')
-rw-r--r-- kgramstats.cpp 86
1 file changed, 55 insertions, 31 deletions
diff --git a/kgramstats.cpp b/kgramstats.cpp
index b0c3940..16bf598 100644
--- a/kgramstats.cpp
+++ b/kgramstats.cpp
@@ -66,7 +66,7 @@ kgramstats::kgramstats(string corpus, int maxK)
66 newSentence = false; 66 newSentence = false;
67 } 67 }
68 68
69 if (newClause) 69 if (newClause || newSentence)
70 { 70 {
71 kgram commaKgram(1, ","); 71 kgram commaKgram(1, ",");
72 if (tstats[commaKgram] == NULL) 72 if (tstats[commaKgram] == NULL)
@@ -78,12 +78,36 @@ kgramstats::kgramstats(string corpus, int maxK)
78 78
79 newClause = false; 79 newClause = false;
80 } 80 }
81 81
82 if ((f.length() > 0) && ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))) 82 if ((f.length() > 0) && (f[f.length()-1] == '\n'))
83 { 83 {
84 td->period++; 84 td->period++;
85 newSentence = true; 85 newSentence = true;
86 } 86 f.resize(f.length()-1);
87 }
88
89 if (f.length() > 0)
90 {
91 if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
92 {
93 if (!newSentence)
94 {
95 td->period++;
96 newSentence = true;
97 }
98
99 f.resize(f.length()-1);
100 } else if (f[f.length()-1] == ',')
101 {
102 if (!newSentence)
103 {
104 td->comma++;
105 newClause = true;
106 }
107
108 f.resize(f.length()-1);
109 }
110 }
87 111
88 if (f.length() > 0) 112 if (f.length() > 0)
89 { 113 {
@@ -92,42 +116,42 @@ kgramstats::kgramstats(string corpus, int maxK)
92 td->startquote++; 116 td->startquote++;
93 } 117 }
94 118
95 if (f[f.length()-1] == '"') 119 if (f[0] == '(')
96 { 120 {
97 td->endquote++; 121 td->startparen++;
98
99 if ((f.length() > 1) && (f[f.length()-2] == ','))
100 {
101 td->comma++;
102 newClause = true;
103 }
104 } 122 }
105 123
106 if (f[f.length()-1] == ',') 124 if ((f[f.length()-1] == '"') || (f[f.length()-1] == ')'))
107 { 125 {
108 td->comma++; 126 if (f[f.length()-1] == '"')
109 newClause = true;
110
111 if ((f.length() > 1) && (f[f.length()-2] == '"'))
112 { 127 {
113 td->endquote++; 128 td->endquote++;
129 } else if (f[f.length()-1] == ')')
130 {
131 td->endparen++;
114 } 132 }
115 133
116 if ((f.length() > 1) && (f[f.length()-2] == ')')) 134 f.resize(f.length()-1);
135
136 if (f.length() > 0)
117 { 137 {
118 td->endparen++; 138 if ((f[f.length()-1] == '.') || (f[f.length()-1] == '!') || (f[f.length()-1] == '?'))
139 {
140 if (!newSentence)
141 {
142 td->period++;
143 newSentence = true;
144 }
145 } else if (f[f.length()-1] == ',')
146 {
147 if (!newSentence && !newClause)
148 {
149 td->comma++;
150 newClause = true;
151 }
152 }
119 } 153 }
120 } 154 }
121
122 if (f[0] == '(')
123 {
124 td->startparen++;
125 }
126
127 if (f[f.length()-1] == ')')
128 {
129 td->endparen++;
130 }
131 } 155 }
132 156
133 if (std::find_if(f.begin(), f.end(), ::islower) == f.end()) 157 if (std::find_if(f.begin(), f.end(), ::islower) == f.end())