diff options
-rw-r--r-- | .gitignore | 5 | ||||
-rw-r--r-- | .gitmodules | 3 | ||||
-rw-r--r-- | CMakeLists.txt | 11 | ||||
-rw-r--r-- | dialogue.cpp | 122 | ||||
-rw-r--r-- | get.rb | 28 | ||||
-rw-r--r-- | histogram.cpp | 44 | ||||
-rw-r--r-- | histogram.h | 20 | ||||
-rw-r--r-- | identifier.h | 59 | ||||
-rw-r--r-- | vendor/csv.h | 1268 | ||||
m--------- | vendor/rawr-ebooks | 0 |
10 files changed, 1560 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..934d9ac --- /dev/null +++ b/.gitignore | |||
@@ -0,0 +1,5 @@ | |||
1 | CMakeFiles | ||
2 | CMakeCache.txt | ||
3 | build | ||
4 | cmake_install.cmake | ||
5 | Makefile | ||
diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..57f0e20 --- /dev/null +++ b/.gitmodules | |||
@@ -0,0 +1,3 @@ | |||
1 | [submodule "vendor/rawr-ebooks"] | ||
2 | path = vendor/rawr-ebooks | ||
3 | url = git@github.com:hatkirby/rawr-ebooks | ||
diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..9fe3ba2 --- /dev/null +++ b/CMakeLists.txt | |||
@@ -0,0 +1,11 @@ | |||
# Build script for the `garnet` executable.
cmake_minimum_required (VERSION 3.1)
project (rawr-ebooks)

# The vendored submodule is built as a subproject; its directory is also put
# on the include path so that dialogue.cpp can `#include <rawr.h>`.
add_subdirectory(vendor/rawr-ebooks)
include_directories(vendor/rawr-ebooks)

add_executable(garnet dialogue.cpp histogram.cpp)

# Require C++17 for the executable.
set_target_properties(garnet PROPERTIES
  CXX_STANDARD 17
  CXX_STANDARD_REQUIRED ON)

target_link_libraries(garnet rawr)
diff --git a/dialogue.cpp b/dialogue.cpp new file mode 100644 index 0000000..dd34ee5 --- /dev/null +++ b/dialogue.cpp | |||
@@ -0,0 +1,122 @@ | |||
1 | #include "vendor/csv.h" | ||
2 | #include "identifier.h" | ||
3 | #include "histogram.h" | ||
4 | #include <rawr.h> | ||
5 | #include <cstdlib> | ||
6 | #include <ctime> | ||
7 | #include <map> | ||
8 | #include <string> | ||
9 | #include <iostream> | ||
10 | #include <sstream> | ||
11 | |||
12 | |||
13 | |||
14 | using speakerstore = identifier<std::string>; | ||
15 | using speaker_id = speakerstore::key_type; | ||
16 | |||
17 | |||
18 | struct speaker_data { | ||
19 | |||
20 | std::string name; | ||
21 | histogram<speaker_id> nextSpeaker; | ||
22 | rawr chain; | ||
23 | |||
24 | }; | ||
25 | |||
26 | |||
27 | |||
28 | |||
29 | int main(int, char**) | ||
30 | { | ||
31 | srand(time(NULL)); | ||
32 | rand(); rand(); rand(); rand(); | ||
33 | |||
34 | speakerstore speakers; | ||
35 | std::map<speaker_id, speaker_data> speakerData; | ||
36 | histogram<speaker_id> allSpeakers; | ||
37 | |||
38 | |||
39 | |||
40 | io::CSVReader<2,io::trim_chars<' ', '\t'>,io::double_quote_escape<',', '"'>> in("../dialogue.csv"); | ||
41 | std::string speaker; | ||
42 | std::string line; | ||
43 | |||
44 | bool hadPrev = false; | ||
45 | speaker_id prevSpeaker; | ||
46 | |||
47 | while (in.read_row(speaker, line)) | ||
48 | { | ||
49 | speaker_id spId = speakers.add(speaker); | ||
50 | speaker_data& myData = speakerData[spId]; | ||
51 | myData.name = speaker; | ||
52 | |||
53 | allSpeakers.add(spId); | ||
54 | |||
55 | if (hadPrev && prevSpeaker != spId) | ||
56 | { | ||
57 | speaker_data& psd = speakerData[prevSpeaker]; | ||
58 | psd.nextSpeaker.add(spId); | ||
59 | } | ||
60 | |||
61 | myData.chain.addCorpus(line); | ||
62 | |||
63 | hadPrev = true; | ||
64 | prevSpeaker = spId; | ||
65 | } | ||
66 | |||
67 | for (auto& sp : speakerData) | ||
68 | { | ||
69 | sp.second.chain.compile(4); | ||
70 | sp.second.nextSpeaker.compile(); | ||
71 | } | ||
72 | |||
73 | std::cout << "Speakers:" << std::endl; | ||
74 | for (auto& sp : speakerData) | ||
75 | { | ||
76 | std::cout << " " << sp.second.name << std::endl; | ||
77 | } | ||
78 | std::cout << std::endl; | ||
79 | |||
80 | allSpeakers.compile(); | ||
81 | |||
82 | for (;;) | ||
83 | { | ||
84 | speaker_id curSpeaker = allSpeakers.next(); | ||
85 | |||
86 | std::ostringstream theEnd; | ||
87 | |||
88 | for (int i = 0; i < 5; i++) | ||
89 | { | ||
90 | speaker_data& curSd = speakerData.at(curSpeaker); | ||
91 | |||
92 | //std::ostringstream thisLine; | ||
93 | |||
94 | if (curSd.name != "") | ||
95 | { | ||
96 | theEnd << curSd.name << ": "; | ||
97 | } | ||
98 | |||
99 | theEnd << curSd.chain.randomSentence(1); | ||
100 | |||
101 | /*if (i > 0 && theEnd.str().length() + thisLine.str().length() > 280) | ||
102 | { | ||
103 | break; | ||
104 | }*/ | ||
105 | |||
106 | theEnd << std::endl; | ||
107 | //theEnd << thisLine.str(); | ||
108 | |||
109 | curSpeaker = curSd.nextSpeaker.next(); | ||
110 | } | ||
111 | |||
112 | std::string output = theEnd.str(); | ||
113 | output.resize(280); | ||
114 | output = output.substr(0, output.find_last_of('\n')); | ||
115 | std::cout << output; | ||
116 | |||
117 | std::cout << std::endl; | ||
118 | std::cout << std::endl; | ||
119 | |||
120 | getc(stdin); | ||
121 | } | ||
122 | } | ||
diff --git a/get.rb b/get.rb new file mode 100644 index 0000000..a2a213e --- /dev/null +++ b/get.rb | |||
@@ -0,0 +1,28 @@ | |||
require 'open-uri'
require 'nokogiri'
require 'csv'

# Downloads every page linked from the wiki's transcript category, pulls the
# rows out of each page's ".bgrevo" table and writes them to dialogue.csv as
# [speaker, text] pairs. Rows without a speaker cell get an empty speaker.

result = []

# URI.open is used instead of Kernel#open: opening URLs through Kernel#open
# was deprecated in Ruby 2.7 and removed in Ruby 3.0.
transcripts = URI.open('https://steven-universe.fandom.com/wiki/Category:Transcripts').read
docTrans = Nokogiri::HTML transcripts
docTrans.css(".category-page__member-link").each do |node|
  puts node['href']
  subpage = URI.open("https://steven-universe.fandom.com" + node['href']).read
  subpagedoc = Nokogiri::HTML subpage
  rows = subpagedoc.css(".bgrevo tr")
  # Drop the first and the last row of the table.
  rows.shift
  rows.pop
  rows.each do |row|
    if row.children.length == 2
      # Only a text cell: record it with an empty speaker.
      result << ["", row.children[1].content.strip.gsub(/\n/," ")]
    elsif row.children.length == 3
      # Speaker cell followed by a text cell.
      result << [row.children[1].content.strip, row.children[2].content.strip.gsub(/\n/," ")]
    end
  end
end

CSV.open("dialogue.csv", "w") do |csv|
  result.each do |line|
    csv << line
  end
end
diff --git a/histogram.cpp b/histogram.cpp new file mode 100644 index 0000000..38fca45 --- /dev/null +++ b/histogram.cpp | |||
@@ -0,0 +1,44 @@ | |||
#include "histogram.h"
#include <cstdlib>
#include <iostream>
#include <stdexcept>
4 | |||
5 | template <class T> | ||
6 | void histogram<T>::add(const T& inst) | ||
7 | { | ||
8 | freqtable[inst]++; | ||
9 | } | ||
10 | |||
11 | template <class T> | ||
12 | void histogram<T>::compile() | ||
13 | { | ||
14 | distribution.clear(); | ||
15 | |||
16 | int max = 0; | ||
17 | for (auto& it : freqtable) | ||
18 | { | ||
19 | max += it.second; | ||
20 | distribution.emplace(max, it.first); | ||
21 | } | ||
22 | |||
23 | freqtable.clear(); | ||
24 | } | ||
25 | |||
26 | template <class T> | ||
27 | const T& histogram<T>::next() const | ||
28 | { | ||
29 | int max = distribution.rbegin()->first; | ||
30 | int r = rand() % max; | ||
31 | |||
32 | return distribution.upper_bound(r)->second; | ||
33 | } | ||
34 | |||
35 | template <class T> | ||
36 | void histogram<T>::print() const | ||
37 | { | ||
38 | for (auto& freqpair : freqtable) | ||
39 | { | ||
40 | std::cout << freqpair.first << ": " << freqpair.second << std::endl; | ||
41 | } | ||
42 | } | ||
43 | |||
44 | template class histogram <unsigned long>; | ||
diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000..76d8f1b --- /dev/null +++ b/histogram.h | |||
@@ -0,0 +1,20 @@ | |||
1 | #ifndef HISTOGRAM_H_24094D97 | ||
2 | #define HISTOGRAM_H_24094D97 | ||
3 | |||
4 | #include <map> | ||
5 | #include <string> | ||
6 | |||
// Weighted random picker over values of type T.
//
// Usage: add() each observation, call compile() once, then call next() to
// sample. The member functions are defined in histogram.cpp, so only the
// instantiations listed there can be linked.
template <typename T>
class histogram
{
public:
  // Records one occurrence of `inst`.
  void add(const T& inst);

  // Turns the recorded counts into the table next() samples from.
  void compile();

  // Returns a randomly chosen value from the compiled table.
  const T& next() const;

  // Prints the recorded counts to stdout.
  void print() const;

private:
  std::map<T, int> freqtable;     // value -> number of occurrences
  std::map<int, T> distribution;  // cumulative count -> value
};
19 | |||
20 | #endif /* end of include guard: HISTOGRAM_H_24094D97 */ | ||
diff --git a/identifier.h b/identifier.h new file mode 100644 index 0000000..74d83ce --- /dev/null +++ b/identifier.h | |||
@@ -0,0 +1,59 @@ | |||
1 | #ifndef IDENTIFIER_H_D7EE5679 | ||
2 | #define IDENTIFIER_H_D7EE5679 | ||
3 | |||
4 | #include <map> | ||
5 | #include <vector> | ||
6 | |||
// Assigns consecutive integer keys (0, 1, 2, ...) to distinct values.
//
// add() hands out the key of a value, creating one on first sight; get()
// maps a key back to its value. compile() drops the value -> key lookup
// table once no more deduplication is needed.
template <typename T>
class identifier {
public:

  using value_type = T;

private:

  using vector_type = std::vector<value_type>;

public:

  using key_type = typename vector_type::size_type;

  // Returns the key of `val`, registering it under a fresh key if it is not
  // in the lookup table.
  //
  // A fresh key is always the index at which the value is stored, so keys
  // stay unique and valid for get() even after compile() has emptied the
  // lookup table. (After compile() a repeated value is no longer recognised
  // and receives a new key.)
  key_type add(const value_type& val)
  {
    const auto it = ids_.find(val);
    if (it != ids_.end())
    {
      return it->second;
    }

    const key_type ret = uniq_.size();
    ids_[val] = ret;
    uniq_.push_back(val);

    return ret;
  }

  // Discards the value -> key lookup table; stored values remain reachable
  // through get().
  void compile()
  {
    ids_.clear();
  }

  // Returns the value stored under key `i`; throws std::out_of_range for a
  // key that was never issued.
  const value_type& get(key_type i) const
  {
    return uniq_.at(i);
  }

  // Number of keys issued so far.
  key_type size() const
  {
    return uniq_.size();
  }

private:

  std::map<value_type, key_type> ids_;  // value -> key, used for deduplication
  vector_type uniq_;                    // key -> value
};
58 | |||
59 | #endif /* end of include guard: IDENTIFIER_H_D7EE5679 */ | ||
diff --git a/vendor/csv.h b/vendor/csv.h new file mode 100644 index 0000000..93e9034 --- /dev/null +++ b/vendor/csv.h | |||
@@ -0,0 +1,1268 @@ | |||
1 | // Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net> | ||
2 | // License: BSD-3 | ||
3 | // | ||
4 | // All rights reserved. | ||
5 | // | ||
6 | // Redistribution and use in source and binary forms, with or without | ||
7 | // modification, are permitted provided that the following conditions are met: | ||
8 | // | ||
9 | // 1. Redistributions of source code must retain the above copyright notice, | ||
10 | // this list of conditions and the following disclaimer. | ||
11 | // | ||
12 | //2. Redistributions in binary form must reproduce the above copyright notice, | ||
13 | // this list of conditions and the following disclaimer in the documentation | ||
14 | // and/or other materials provided with the distribution. | ||
15 | // | ||
16 | //3. Neither the name of the copyright holder nor the names of its contributors | ||
17 | // may be used to endorse or promote products derived from this software | ||
18 | // without specific prior written permission. | ||
19 | // | ||
20 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | ||
21 | // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | ||
22 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | ||
23 | // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE | ||
24 | // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | ||
25 | // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | ||
26 | // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | ||
27 | // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | ||
28 | // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
29 | // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | ||
30 | // POSSIBILITY OF SUCH DAMAGE. | ||
31 | |||
32 | #ifndef CSV_H | ||
33 | #define CSV_H | ||
34 | |||
35 | #include <vector> | ||
36 | #include <string> | ||
37 | #include <cstring> | ||
38 | #include <algorithm> | ||
39 | #include <utility> | ||
40 | #include <cstdio> | ||
41 | #include <exception> | ||
42 | #ifndef CSV_IO_NO_THREAD | ||
43 | #include <mutex> | ||
44 | #include <thread> | ||
45 | #include <condition_variable> | ||
46 | #endif | ||
47 | #include <memory> | ||
48 | #include <cassert> | ||
49 | #include <cerrno> | ||
50 | #include <istream> | ||
51 | |||
52 | namespace io{ | ||
53 | //////////////////////////////////////////////////////////////////////////// | ||
54 | // LineReader // | ||
55 | //////////////////////////////////////////////////////////////////////////// | ||
56 | |||
// Exception types thrown while opening a file and splitting it into lines.
namespace error{
        // Root of all exceptions in this namespace. The message is produced
        // on demand: what() asks the derived class to format it into
        // error_message_buffer and returns that buffer.
        struct base : std::exception{
                virtual void format_error_message()const = 0;

                const char*what()const noexcept override{
                        format_error_message();
                        return error_message_buffer;
                }

                mutable char error_message_buffer[512];
        };

        const int max_file_name_length = 255;

        // Mixin that carries a (possibly truncated) file name.
        struct with_file_name{
                with_file_name(){
                        std::memset(file_name, 0, sizeof(file_name));
                }

                // Stores at most max_file_name_length characters of
                // file_name; a null pointer stores the empty string.
                void set_file_name(const char*file_name){
                        if(file_name != nullptr){
                                std::strncpy(this->file_name, file_name, sizeof(this->file_name));
                                // strncpy does not terminate when the source fills the buffer.
                                this->file_name[sizeof(this->file_name)-1] = '\0';
                        }else{
                                this->file_name[0] = '\0';
                        }
                }

                char file_name[max_file_name_length+1];
        };

        // Mixin that carries a line number; -1 until set.
        struct with_file_line{
                with_file_line(){
                        file_line = -1;
                }

                void set_file_line(int file_line){
                        this->file_line = file_line;
                }

                int file_line;
        };

        // Mixin that carries an errno value; 0 until set.
        struct with_errno{
                with_errno(){
                        errno_value = 0;
                }

                void set_errno(int errno_value){
                        this->errno_value = errno_value;
                }

                int errno_value;
        };

        // A file could not be opened. The message includes the strerror()
        // text when an errno value was recorded.
        struct can_not_open_file :
                base,
                with_file_name,
                with_errno{
                void format_error_message()const override{
                        if(errno_value != 0)
                                std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                                        "Can not open file \"%s\" because \"%s\"."
                                        , file_name, std::strerror(errno_value));
                        else
                                std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                                        "Can not open file \"%s\"."
                                        , file_name);
                }
        };

        // A single line is longer than the reader supports.
        struct line_length_limit_exceeded :
                base,
                with_file_name,
                with_file_line{
                void format_error_message()const override{
                        std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                                "Line number %d in file \"%s\" exceeds the maximum length of 2^24-1."
                                , file_line, file_name);
                }
        };
}
139 | |||
// Abstract source of raw bytes.
//
// read() copies up to `size` bytes into `buffer` and returns how many were
// copied; the implementations below return 0 once no data is left.
class ByteSourceBase{
public:
        virtual int read(char*buffer, int size)=0;
        virtual ~ByteSourceBase(){}
};

namespace detail{

        // Reads from a C FILE* and closes it on destruction.
        class OwningStdIOByteSourceBase : public ByteSourceBase{
        public:
                explicit OwningStdIOByteSourceBase(FILE*file):file(file){
                        // Tell the std library that we want to do the buffering ourself.
                        std::setvbuf(file, nullptr, _IONBF, 0);
                }

                // The destructor closes the handle, so a copy would close it twice.
                OwningStdIOByteSourceBase(const OwningStdIOByteSourceBase&) = delete;
                OwningStdIOByteSourceBase&operator=(const OwningStdIOByteSourceBase&) = delete;

                int read(char*buffer, int size) override{
                        return static_cast<int>(std::fread(buffer, 1, size, file));
                }

                ~OwningStdIOByteSourceBase() override{
                        std::fclose(file);
                }

        private:
                FILE*file;
        };

        // Reads from a std::istream that the caller keeps alive.
        class NonOwningIStreamByteSource : public ByteSourceBase{
        public:
                explicit NonOwningIStreamByteSource(std::istream&in):in(in){}

                int read(char*buffer, int size) override{
                        in.read(buffer, size);
                        return static_cast<int>(in.gcount());
                }

                ~NonOwningIStreamByteSource() override{}

        private:
                std::istream&in;
        };

        // Reads from a memory range [str, str+size) that the caller keeps alive.
        class NonOwningStringByteSource : public ByteSourceBase{
        public:
                NonOwningStringByteSource(const char*str, long long size):str(str), remaining_byte_count(size){}

                int read(char*buffer, int desired_byte_count) override{
                        int to_copy_byte_count = desired_byte_count;
                        if(remaining_byte_count < to_copy_byte_count)
                                to_copy_byte_count = static_cast<int>(remaining_byte_count);
                        std::memcpy(buffer, str, to_copy_byte_count);
                        remaining_byte_count -= to_copy_byte_count;
                        str += to_copy_byte_count;
                        return to_copy_byte_count;
                }

                ~NonOwningStringByteSource() override{}

        private:
                const char*str;
                long long remaining_byte_count;
        };

        #ifndef CSV_IO_NO_THREAD
        // Performs reads on a worker thread. start_read() posts a request and
        // returns immediately; finish_read() blocks until that request is
        // done and returns the byte count, or rethrows what the source threw.
        class AsynchronousReader{
        public:
                // Takes ownership of the source and starts the worker thread.
                void init(std::unique_ptr<ByteSourceBase>arg_byte_source){
                        std::unique_lock<std::mutex>guard(lock);
                        byte_source = std::move(arg_byte_source);
                        desired_byte_count = -1;
                        termination_requested = false;
                        worker = std::thread(
                                [&]{
                                        std::unique_lock<std::mutex>guard(lock);
                                        try{
                                                for(;;){
                                                        // desired_byte_count == -1 means "no request pending".
                                                        read_requested_condition.wait(
                                                                guard,
                                                                [&]{
                                                                        return desired_byte_count != -1 || termination_requested;
                                                                }
                                                        );
                                                        if(termination_requested)
                                                                return;

                                                        read_byte_count = byte_source->read(buffer, desired_byte_count);
                                                        desired_byte_count = -1;
                                                        // A read of 0 bytes ends the worker loop.
                                                        if(read_byte_count == 0)
                                                                break;
                                                        read_finished_condition.notify_one();
                                                }
                                        }catch(...){
                                                read_error = std::current_exception();
                                        }
                                        read_finished_condition.notify_one();
                                }
                        );
                }

                // True once init() has been given a source.
                bool is_valid()const{
                        return byte_source != nullptr;
                }

                // Requests arg_desired_byte_count bytes into arg_buffer.
                void start_read(char*arg_buffer, int arg_desired_byte_count){
                        std::unique_lock<std::mutex>guard(lock);
                        buffer = arg_buffer;
                        desired_byte_count = arg_desired_byte_count;
                        // -1 marks the result as not yet available.
                        read_byte_count = -1;
                        read_requested_condition.notify_one();
                }

                // Waits for the pending request and returns its byte count.
                int finish_read(){
                        std::unique_lock<std::mutex>guard(lock);
                        read_finished_condition.wait(
                                guard,
                                [&]{
                                        return read_byte_count != -1 || read_error;
                                }
                        );
                        if(read_error)
                                std::rethrow_exception(read_error);
                        else
                                return read_byte_count;
                }

                // Asks the worker to stop and joins it, if it was ever started.
                ~AsynchronousReader(){
                        if(byte_source != nullptr){
                                {
                                        std::unique_lock<std::mutex>guard(lock);
                                        termination_requested = true;
                                }
                                read_requested_condition.notify_one();
                                worker.join();
                        }
                }

        private:
                std::unique_ptr<ByteSourceBase>byte_source;

                std::thread worker;

                bool termination_requested;
                std::exception_ptr read_error;
                char*buffer;
                int desired_byte_count;
                int read_byte_count;

                std::mutex lock;
                std::condition_variable read_finished_condition;
                std::condition_variable read_requested_condition;
        };
        #endif

        // Same interface as AsynchronousReader without a thread: start_read()
        // only remembers the request and finish_read() performs it.
        class SynchronousReader{
        public:
                void init(std::unique_ptr<ByteSourceBase>arg_byte_source){
                        byte_source = std::move(arg_byte_source);
                }

                bool is_valid()const{
                        return byte_source != nullptr;
                }

                void start_read(char*arg_buffer, int arg_desired_byte_count){
                        buffer = arg_buffer;
                        desired_byte_count = arg_desired_byte_count;
                }

                int finish_read(){
                        return byte_source->read(buffer, desired_byte_count);
                }
        private:
                std::unique_ptr<ByteSourceBase>byte_source;
                char*buffer;
                int desired_byte_count;
        };
}
317 | |||
318 | class LineReader{ | ||
319 | private: | ||
320 | static const int block_len = 1<<24; | ||
321 | std::unique_ptr<char[]>buffer; // must be constructed before (and thus destructed after) the reader! | ||
322 | #ifdef CSV_IO_NO_THREAD | ||
323 | detail::SynchronousReader reader; | ||
324 | #else | ||
325 | detail::AsynchronousReader reader; | ||
326 | #endif | ||
327 | int data_begin; | ||
328 | int data_end; | ||
329 | |||
330 | char file_name[error::max_file_name_length+1]; | ||
331 | unsigned file_line; | ||
332 | |||
333 | static std::unique_ptr<ByteSourceBase> open_file(const char*file_name){ | ||
334 | // We open the file in binary mode as it makes no difference under *nix | ||
335 | // and under Windows we handle \r\n newlines ourself. | ||
336 | FILE*file = std::fopen(file_name, "rb"); | ||
337 | if(file == 0){ | ||
338 | int x = errno; // store errno as soon as possible, doing it after constructor call can fail. | ||
339 | error::can_not_open_file err; | ||
340 | err.set_errno(x); | ||
341 | err.set_file_name(file_name); | ||
342 | throw err; | ||
343 | } | ||
344 | return std::unique_ptr<ByteSourceBase>(new detail::OwningStdIOByteSourceBase(file)); | ||
345 | } | ||
346 | |||
347 | void init(std::unique_ptr<ByteSourceBase>byte_source){ | ||
348 | file_line = 0; | ||
349 | |||
350 | buffer = std::unique_ptr<char[]>(new char[3*block_len]); | ||
351 | data_begin = 0; | ||
352 | data_end = byte_source->read(buffer.get(), 2*block_len); | ||
353 | |||
354 | // Ignore UTF-8 BOM | ||
355 | if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') | ||
356 | data_begin = 3; | ||
357 | |||
358 | if(data_end == 2*block_len){ | ||
359 | reader.init(std::move(byte_source)); | ||
360 | reader.start_read(buffer.get() + 2*block_len, block_len); | ||
361 | } | ||
362 | } | ||
363 | |||
364 | public: | ||
365 | LineReader() = delete; | ||
366 | LineReader(const LineReader&) = delete; | ||
367 | LineReader&operator=(const LineReader&) = delete; | ||
368 | |||
369 | explicit LineReader(const char*file_name){ | ||
370 | set_file_name(file_name); | ||
371 | init(open_file(file_name)); | ||
372 | } | ||
373 | |||
374 | explicit LineReader(const std::string&file_name){ | ||
375 | set_file_name(file_name.c_str()); | ||
376 | init(open_file(file_name.c_str())); | ||
377 | } | ||
378 | |||
379 | LineReader(const char*file_name, std::unique_ptr<ByteSourceBase>byte_source){ | ||
380 | set_file_name(file_name); | ||
381 | init(std::move(byte_source)); | ||
382 | } | ||
383 | |||
384 | LineReader(const std::string&file_name, std::unique_ptr<ByteSourceBase>byte_source){ | ||
385 | set_file_name(file_name.c_str()); | ||
386 | init(std::move(byte_source)); | ||
387 | } | ||
388 | |||
389 | LineReader(const char*file_name, const char*data_begin, const char*data_end){ | ||
390 | set_file_name(file_name); | ||
391 | init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); | ||
392 | } | ||
393 | |||
394 | LineReader(const std::string&file_name, const char*data_begin, const char*data_end){ | ||
395 | set_file_name(file_name.c_str()); | ||
396 | init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); | ||
397 | } | ||
398 | |||
399 | LineReader(const char*file_name, FILE*file){ | ||
400 | set_file_name(file_name); | ||
401 | init(std::unique_ptr<ByteSourceBase>(new detail::OwningStdIOByteSourceBase(file))); | ||
402 | } | ||
403 | |||
404 | LineReader(const std::string&file_name, FILE*file){ | ||
405 | set_file_name(file_name.c_str()); | ||
406 | init(std::unique_ptr<ByteSourceBase>(new detail::OwningStdIOByteSourceBase(file))); | ||
407 | } | ||
408 | |||
409 | LineReader(const char*file_name, std::istream&in){ | ||
410 | set_file_name(file_name); | ||
411 | init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningIStreamByteSource(in))); | ||
412 | } | ||
413 | |||
414 | LineReader(const std::string&file_name, std::istream&in){ | ||
415 | set_file_name(file_name.c_str()); | ||
416 | init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningIStreamByteSource(in))); | ||
417 | } | ||
418 | |||
419 | void set_file_name(const std::string&file_name){ | ||
420 | set_file_name(file_name.c_str()); | ||
421 | } | ||
422 | |||
423 | void set_file_name(const char*file_name){ | ||
424 | if(file_name != nullptr){ | ||
425 | strncpy(this->file_name, file_name, sizeof(this->file_name)); | ||
426 | this->file_name[sizeof(this->file_name)-1] = '\0'; | ||
427 | }else{ | ||
428 | this->file_name[0] = '\0'; | ||
429 | } | ||
430 | } | ||
431 | |||
432 | const char*get_truncated_file_name()const{ | ||
433 | return file_name; | ||
434 | } | ||
435 | |||
436 | void set_file_line(unsigned file_line){ | ||
437 | this->file_line = file_line; | ||
438 | } | ||
439 | |||
440 | unsigned get_file_line()const{ | ||
441 | return file_line; | ||
442 | } | ||
443 | |||
444 | char*next_line(){ | ||
445 | if(data_begin == data_end) | ||
446 | return 0; | ||
447 | |||
448 | ++file_line; | ||
449 | |||
450 | assert(data_begin < data_end); | ||
451 | assert(data_end <= block_len*2); | ||
452 | |||
453 | if(data_begin >= block_len){ | ||
454 | std::memcpy(buffer.get(), buffer.get()+block_len, block_len); | ||
455 | data_begin -= block_len; | ||
456 | data_end -= block_len; | ||
457 | if(reader.is_valid()) | ||
458 | { | ||
459 | data_end += reader.finish_read(); | ||
460 | std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len); | ||
461 | reader.start_read(buffer.get() + 2*block_len, block_len); | ||
462 | } | ||
463 | } | ||
464 | |||
465 | int line_end = data_begin; | ||
466 | while(buffer[line_end] != '\n' && line_end != data_end){ | ||
467 | ++line_end; | ||
468 | } | ||
469 | |||
470 | if(line_end - data_begin + 1 > block_len){ | ||
471 | error::line_length_limit_exceeded err; | ||
472 | err.set_file_name(file_name); | ||
473 | err.set_file_line(file_line); | ||
474 | throw err; | ||
475 | } | ||
476 | |||
477 | if(buffer[line_end] == '\n' && line_end != data_end){ | ||
478 | buffer[line_end] = '\0'; | ||
479 | }else{ | ||
480 | // some files are missing the newline at the end of the | ||
481 | // last line | ||
482 | ++data_end; | ||
483 | buffer[line_end] = '\0'; | ||
484 | } | ||
485 | |||
486 | // handle windows \r\n-line breaks | ||
487 | if(line_end != data_begin && buffer[line_end-1] == '\r') | ||
488 | buffer[line_end-1] = '\0'; | ||
489 | |||
490 | char*ret = buffer.get() + data_begin; | ||
491 | data_begin = line_end+1; | ||
492 | return ret; | ||
493 | } | ||
494 | }; | ||
495 | |||
496 | |||
497 | //////////////////////////////////////////////////////////////////////////// | ||
498 | // CSV // | ||
499 | //////////////////////////////////////////////////////////////////////////// | ||
500 | |||
501 | namespace error{ | ||
502 | const int max_column_name_length = 63; | ||
503 | struct with_column_name{ | ||
504 | with_column_name(){ | ||
505 | std::memset(column_name, 0, max_column_name_length+1); | ||
506 | } | ||
507 | |||
508 | void set_column_name(const char*column_name){ | ||
509 | if(column_name != nullptr){ | ||
510 | std::strncpy(this->column_name, column_name, max_column_name_length); | ||
511 | this->column_name[max_column_name_length] = '\0'; | ||
512 | }else{ | ||
513 | this->column_name[0] = '\0'; | ||
514 | } | ||
515 | } | ||
516 | |||
517 | char column_name[max_column_name_length+1]; | ||
518 | }; | ||
519 | |||
520 | |||
521 | const int max_column_content_length = 63; | ||
522 | |||
523 | struct with_column_content{ | ||
524 | with_column_content(){ | ||
525 | std::memset(column_content, 0, max_column_content_length+1); | ||
526 | } | ||
527 | |||
528 | void set_column_content(const char*column_content){ | ||
529 | if(column_content != nullptr){ | ||
530 | std::strncpy(this->column_content, column_content, max_column_content_length); | ||
531 | this->column_content[max_column_content_length] = '\0'; | ||
532 | }else{ | ||
533 | this->column_content[0] = '\0'; | ||
534 | } | ||
535 | } | ||
536 | |||
537 | char column_content[max_column_content_length+1]; | ||
538 | }; | ||
539 | |||
540 | |||
541 | struct extra_column_in_header : | ||
542 | base, | ||
543 | with_file_name, | ||
544 | with_column_name{ | ||
545 | void format_error_message()const{ | ||
546 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
547 | "Extra column \"%s\" in header of file \"%s\"." | ||
548 | , column_name, file_name); | ||
549 | } | ||
550 | }; | ||
551 | |||
552 | struct missing_column_in_header : | ||
553 | base, | ||
554 | with_file_name, | ||
555 | with_column_name{ | ||
556 | void format_error_message()const{ | ||
557 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
558 | "Missing column \"%s\" in header of file \"%s\"." | ||
559 | , column_name, file_name); | ||
560 | } | ||
561 | }; | ||
562 | |||
563 | struct duplicated_column_in_header : | ||
564 | base, | ||
565 | with_file_name, | ||
566 | with_column_name{ | ||
567 | void format_error_message()const{ | ||
568 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
569 | "Duplicated column \"%s\" in header of file \"%s\"." | ||
570 | , column_name, file_name); | ||
571 | } | ||
572 | }; | ||
573 | |||
574 | struct header_missing : | ||
575 | base, | ||
576 | with_file_name{ | ||
577 | void format_error_message()const{ | ||
578 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
579 | "Header missing in file \"%s\"." | ||
580 | , file_name); | ||
581 | } | ||
582 | }; | ||
583 | |||
584 | struct too_few_columns : | ||
585 | base, | ||
586 | with_file_name, | ||
587 | with_file_line{ | ||
588 | void format_error_message()const{ | ||
589 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
590 | "Too few columns in line %d in file \"%s\"." | ||
591 | , file_line, file_name); | ||
592 | } | ||
593 | }; | ||
594 | |||
595 | struct too_many_columns : | ||
596 | base, | ||
597 | with_file_name, | ||
598 | with_file_line{ | ||
599 | void format_error_message()const{ | ||
600 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
601 | "Too many columns in line %d in file \"%s\"." | ||
602 | , file_line, file_name); | ||
603 | } | ||
604 | }; | ||
605 | |||
606 | struct escaped_string_not_closed : | ||
607 | base, | ||
608 | with_file_name, | ||
609 | with_file_line{ | ||
610 | void format_error_message()const{ | ||
611 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
612 | "Escaped string was not closed in line %d in file \"%s\"." | ||
613 | , file_line, file_name); | ||
614 | } | ||
615 | }; | ||
616 | |||
617 | struct integer_must_be_positive : | ||
618 | base, | ||
619 | with_file_name, | ||
620 | with_file_line, | ||
621 | with_column_name, | ||
622 | with_column_content{ | ||
623 | void format_error_message()const{ | ||
624 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
625 | "The integer \"%s\" must be positive or 0 in column \"%s\" in file \"%s\" in line \"%d\"." | ||
626 | , column_content, column_name, file_name, file_line); | ||
627 | } | ||
628 | }; | ||
629 | |||
630 | struct no_digit : | ||
631 | base, | ||
632 | with_file_name, | ||
633 | with_file_line, | ||
634 | with_column_name, | ||
635 | with_column_content{ | ||
636 | void format_error_message()const{ | ||
637 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
638 | "The integer \"%s\" contains an invalid digit in column \"%s\" in file \"%s\" in line \"%d\"." | ||
639 | , column_content, column_name, file_name, file_line); | ||
640 | } | ||
641 | }; | ||
642 | |||
643 | struct integer_overflow : | ||
644 | base, | ||
645 | with_file_name, | ||
646 | with_file_line, | ||
647 | with_column_name, | ||
648 | with_column_content{ | ||
649 | void format_error_message()const{ | ||
650 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
651 | "The integer \"%s\" overflows in column \"%s\" in file \"%s\" in line \"%d\"." | ||
652 | , column_content, column_name, file_name, file_line); | ||
653 | } | ||
654 | }; | ||
655 | |||
656 | struct integer_underflow : | ||
657 | base, | ||
658 | with_file_name, | ||
659 | with_file_line, | ||
660 | with_column_name, | ||
661 | with_column_content{ | ||
662 | void format_error_message()const{ | ||
663 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
664 | "The integer \"%s\" underflows in column \"%s\" in file \"%s\" in line \"%d\"." | ||
665 | , column_content, column_name, file_name, file_line); | ||
666 | } | ||
667 | }; | ||
668 | |||
669 | struct invalid_single_character : | ||
670 | base, | ||
671 | with_file_name, | ||
672 | with_file_line, | ||
673 | with_column_name, | ||
674 | with_column_content{ | ||
675 | void format_error_message()const{ | ||
676 | std::snprintf(error_message_buffer, sizeof(error_message_buffer), | ||
677 | "The content \"%s\" of column \"%s\" in file \"%s\" in line \"%d\" is not a single character." | ||
678 | , column_content, column_name, file_name, file_line); | ||
679 | } | ||
680 | }; | ||
681 | } | ||
682 | |||
/// Bit flags telling the header parser which header mismatches to tolerate.
/// Flags may be combined with bitwise OR.
using ignore_column = unsigned;

/// Tolerate nothing: unknown and missing header columns are both errors.
static const ignore_column ignore_no_column = 0u;
/// Header columns that match no requested name are skipped instead of rejected.
static const ignore_column ignore_extra_column = 1u;
/// Requested names that are absent from the header are not reported as errors.
static const ignore_column ignore_missing_column = 2u;
687 | |||
/// Trim policy that strips every character listed in trim_char_list from both
/// ends of a column.
template<char ... trim_char_list>
struct trim_chars{
private:
    // Recursion terminator: no candidates left, so the character is kept.
    constexpr static bool is_trim_char(char){
        return false;
    }

    // Checks c against the first candidate, then recurses over the remaining ones.
    template<class ...Rest>
    constexpr static bool is_trim_char(char c, char candidate, Rest...rest){
        return (c != candidate) ? is_trim_char(c, rest...) : true;
    }

public:
    /// Narrows [str_begin, str_end) in place so that it neither starts nor ends
    /// with a trim character, then writes a terminating '\0' at the new end.
    static void trim(char*&str_begin, char*&str_end){
        // Advance the front past leading trim characters.
        for(; str_begin != str_end; ++str_begin)
            if(!is_trim_char(*str_begin, trim_char_list...))
                break;
        // Pull the back in over trailing trim characters.
        for(; str_begin != str_end; --str_end)
            if(!is_trim_char(str_end[-1], trim_char_list...))
                break;
        *str_end = '\0';
    }
};
709 | |||
710 | |||
/// Comment policy under which no line is ever treated as a comment.
struct no_comment{
    /// Always returns false; the line is not inspected.
    static bool is_comment(const char* /*line*/){
        return false;
    }
};
716 | |||
/// Comment policy: a line is a comment when its first character is one of
/// comment_start_char_list.
template<char ... comment_start_char_list>
struct single_line_comment{
private:
    // Recursion terminator: no markers left, so the character starts no comment.
    constexpr static bool is_comment_start_char(char){
        return false;
    }

    // Checks c against the first marker, then recurses over the remaining ones.
    template<class ...Rest>
    constexpr static bool is_comment_start_char(char c, char marker, Rest...rest){
        return (c != marker) ? is_comment_start_char(c, rest...) : true;
    }

public:
    /// Returns true when the first character of line is a comment marker.
    /// Only line[0] is examined.
    static bool is_comment(const char*line){
        const char first = line[0];
        return is_comment_start_char(first, comment_start_char_list...);
    }
};
735 | |||
/// Comment policy: a line is a comment when it is empty or consists solely of
/// spaces and tabs.
struct empty_line_comment{
    /// Returns true when nothing but spaces and tabs precede the terminating '\0'.
    static bool is_comment(const char*line){
        const char* cursor = line;
        while(*cursor == ' ' || *cursor == '\t')
            ++cursor;
        return *cursor == '\0';
    }
};
748 | |||
749 | template<char ... comment_start_char_list> | ||
750 | struct single_and_empty_line_comment{ | ||
751 | static bool is_comment(const char*line){ | ||
752 | return single_line_comment<comment_start_char_list...>::is_comment(line) || empty_line_comment::is_comment(line); | ||
753 | } | ||
754 | }; | ||
755 | |||
/// Quote policy without any quoting: columns are split on every occurrence of
/// sep and their content is left untouched.
template<char sep>
struct no_quote_escape{
    /// Returns a pointer to the first sep at or after col_begin, or to the
    /// terminating '\0' when no separator follows.
    static const char*find_next_column_end(const char*col_begin){
        const char* cursor = col_begin;
        for(; *cursor != '\0'; ++cursor)
            if(*cursor == sep)
                break;
        return cursor;
    }

    /// Nothing to unescape under this policy; both arguments are left as they are.
    static void unescape(char*&, char*&){}
};
768 | |||
769 | template<char sep, char quote> | ||
770 | struct double_quote_escape{ | ||
771 | static const char*find_next_column_end(const char*col_begin){ | ||
772 | while(*col_begin != sep && *col_begin != '\0') | ||
773 | if(*col_begin != quote) | ||
774 | ++col_begin; | ||
775 | else{ | ||
776 | do{ | ||
777 | ++col_begin; | ||
778 | while(*col_begin != quote){ | ||
779 | if(*col_begin == '\0') | ||
780 | throw error::escaped_string_not_closed(); | ||
781 | ++col_begin; | ||
782 | } | ||
783 | ++col_begin; | ||
784 | }while(*col_begin == quote); | ||
785 | } | ||
786 | return col_begin; | ||
787 | } | ||
788 | |||
789 | static void unescape(char*&col_begin, char*&col_end){ | ||
790 | if(col_end - col_begin >= 2){ | ||
791 | if(*col_begin == quote && *(col_end-1) == quote){ | ||
792 | ++col_begin; | ||
793 | --col_end; | ||
794 | char*out = col_begin; | ||
795 | for(char*in = col_begin; in!=col_end; ++in){ | ||
796 | if(*in == quote && (in+1) != col_end && *(in+1) == quote){ | ||
797 | ++in; | ||
798 | } | ||
799 | *out = *in; | ||
800 | ++out; | ||
801 | } | ||
802 | col_end = out; | ||
803 | *col_end = '\0'; | ||
804 | } | ||
805 | } | ||
806 | |||
807 | } | ||
808 | }; | ||
809 | |||
810 | struct throw_on_overflow{ | ||
811 | template<class T> | ||
812 | static void on_overflow(T&){ | ||
813 | throw error::integer_overflow(); | ||
814 | } | ||
815 | |||
816 | template<class T> | ||
817 | static void on_underflow(T&){ | ||
818 | throw error::integer_underflow(); | ||
819 | } | ||
820 | }; | ||
821 | |||
/// Overflow policy that does nothing: the value is left exactly as the parser
/// had it when the overflow or underflow was detected.
struct ignore_overflow{
    /// No-op.
    template<class T> static void on_overflow(T& /*value*/){
    }

    /// No-op.
    template<class T> static void on_underflow(T& /*value*/){
    }
};
829 | |||
/// Overflow policy that saturates: an overflowing value becomes the largest
/// value of its type, an underflowing value the most negative one.
struct set_to_max_on_overflow{
    /// Sets x to the largest finite value representable in T.
    template<class T>
    static void on_overflow(T&x){
        x = std::numeric_limits<T>::max();
    }

    /// Sets x to the lowest finite value representable in T.
    template<class T>
    static void on_underflow(T&x){
        // lowest() equals min() for integer types, but unlike min() it is also
        // the most negative value for floating-point types (where min() is the
        // smallest positive normalized value).
        x = std::numeric_limits<T>::lowest();
    }
};
841 | |||
842 | |||
843 | namespace detail{ | ||
/// Cuts the next column off the front of line.
///
/// On return col_begin and col_end delimit the column. If a separator followed
/// the column it is overwritten with '\0' and line points just past it; if the
/// column was the last one, line is set to nullptr.
/// line must not be nullptr on entry.
template<class quote_policy>
void chop_next_column(char*&line, char*&col_begin, char*&col_end){
    assert(line != nullptr);

    col_begin = line;
    // The policy works on const char*; re-applying its offset to col_begin
    // yields a mutable pointer without a cast.
    const char* found = quote_policy::find_next_column_end(col_begin);
    col_end = col_begin + (found - col_begin);

    const bool last_column = (*col_end == '\0');
    if(!last_column)
        *col_end = '\0';
    line = last_column ? nullptr : col_end + 1;
}
861 | |||
862 | template<class trim_policy, class quote_policy> | ||
863 | void parse_line( | ||
864 | char*line, | ||
865 | char**sorted_col, | ||
866 | const std::vector<int>&col_order | ||
867 | ){ | ||
868 | for(std::size_t i=0; i<col_order.size(); ++i){ | ||
869 | if(line == nullptr) | ||
870 | throw ::io::error::too_few_columns(); | ||
871 | char*col_begin, *col_end; | ||
872 | chop_next_column<quote_policy>(line, col_begin, col_end); | ||
873 | |||
874 | if(col_order[i] != -1){ | ||
875 | trim_policy::trim(col_begin, col_end); | ||
876 | quote_policy::unescape(col_begin, col_end); | ||
877 | |||
878 | sorted_col[col_order[i]] = col_begin; | ||
879 | } | ||
880 | } | ||
881 | if(line != nullptr) | ||
882 | throw ::io::error::too_many_columns(); | ||
883 | } | ||
884 | |||
885 | template<unsigned column_count, class trim_policy, class quote_policy> | ||
886 | void parse_header_line( | ||
887 | char*line, | ||
888 | std::vector<int>&col_order, | ||
889 | const std::string*col_name, | ||
890 | ignore_column ignore_policy | ||
891 | ){ | ||
892 | col_order.clear(); | ||
893 | |||
894 | bool found[column_count]; | ||
895 | std::fill(found, found + column_count, false); | ||
896 | while(line){ | ||
897 | char*col_begin,*col_end; | ||
898 | chop_next_column<quote_policy>(line, col_begin, col_end); | ||
899 | |||
900 | trim_policy::trim(col_begin, col_end); | ||
901 | quote_policy::unescape(col_begin, col_end); | ||
902 | |||
903 | for(unsigned i=0; i<column_count; ++i) | ||
904 | if(col_begin == col_name[i]){ | ||
905 | if(found[i]){ | ||
906 | error::duplicated_column_in_header err; | ||
907 | err.set_column_name(col_begin); | ||
908 | throw err; | ||
909 | } | ||
910 | found[i] = true; | ||
911 | col_order.push_back(i); | ||
912 | col_begin = 0; | ||
913 | break; | ||
914 | } | ||
915 | if(col_begin){ | ||
916 | if(ignore_policy & ::io::ignore_extra_column) | ||
917 | col_order.push_back(-1); | ||
918 | else{ | ||
919 | error::extra_column_in_header err; | ||
920 | err.set_column_name(col_begin); | ||
921 | throw err; | ||
922 | } | ||
923 | } | ||
924 | } | ||
925 | if(!(ignore_policy & ::io::ignore_missing_column)){ | ||
926 | for(unsigned i=0; i<column_count; ++i){ | ||
927 | if(!found[i]){ | ||
928 | error::missing_column_in_header err; | ||
929 | err.set_column_name(col_name[i].c_str()); | ||
930 | throw err; | ||
931 | } | ||
932 | } | ||
933 | } | ||
934 | } | ||
935 | |||
936 | template<class overflow_policy> | ||
937 | void parse(char*col, char &x){ | ||
938 | if(!*col) | ||
939 | throw error::invalid_single_character(); | ||
940 | x = *col; | ||
941 | ++col; | ||
942 | if(*col) | ||
943 | throw error::invalid_single_character(); | ||
944 | } | ||
945 | |||
/// Copies the column text into x.
template<class overflow_policy>
void parse(char*col, std::string&x){
    x.assign(col);
}

/// Makes x point at the column text; no copy is made.
template<class overflow_policy>
void parse(char*col, const char*&x){
    const char* text = col;
    x = text;
}

/// Makes x point at the (mutable) column text; no copy is made.
template<class overflow_policy>
void parse(char*col, char*&x){
    char* text = col;
    x = text;
}
960 | |||
961 | template<class overflow_policy, class T> | ||
962 | void parse_unsigned_integer(const char*col, T&x){ | ||
963 | x = 0; | ||
964 | while(*col != '\0'){ | ||
965 | if('0' <= *col && *col <= '9'){ | ||
966 | T y = *col - '0'; | ||
967 | if(x > (std::numeric_limits<T>::max()-y)/10){ | ||
968 | overflow_policy::on_overflow(x); | ||
969 | return; | ||
970 | } | ||
971 | x = 10*x+y; | ||
972 | }else | ||
973 | throw error::no_digit(); | ||
974 | ++col; | ||
975 | } | ||
976 | } | ||
977 | |||
978 | template<class overflow_policy>void parse(char*col, unsigned char &x) | ||
979 | {parse_unsigned_integer<overflow_policy>(col, x);} | ||
980 | template<class overflow_policy>void parse(char*col, unsigned short &x) | ||
981 | {parse_unsigned_integer<overflow_policy>(col, x);} | ||
982 | template<class overflow_policy>void parse(char*col, unsigned int &x) | ||
983 | {parse_unsigned_integer<overflow_policy>(col, x);} | ||
984 | template<class overflow_policy>void parse(char*col, unsigned long &x) | ||
985 | {parse_unsigned_integer<overflow_policy>(col, x);} | ||
986 | template<class overflow_policy>void parse(char*col, unsigned long long &x) | ||
987 | {parse_unsigned_integer<overflow_policy>(col, x);} | ||
988 | |||
989 | template<class overflow_policy, class T> | ||
990 | void parse_signed_integer(const char*col, T&x){ | ||
991 | if(*col == '-'){ | ||
992 | ++col; | ||
993 | |||
994 | x = 0; | ||
995 | while(*col != '\0'){ | ||
996 | if('0' <= *col && *col <= '9'){ | ||
997 | T y = *col - '0'; | ||
998 | if(x < (std::numeric_limits<T>::min()+y)/10){ | ||
999 | overflow_policy::on_underflow(x); | ||
1000 | return; | ||
1001 | } | ||
1002 | x = 10*x-y; | ||
1003 | }else | ||
1004 | throw error::no_digit(); | ||
1005 | ++col; | ||
1006 | } | ||
1007 | return; | ||
1008 | }else if(*col == '+') | ||
1009 | ++col; | ||
1010 | parse_unsigned_integer<overflow_policy>(col, x); | ||
1011 | } | ||
1012 | |||
1013 | template<class overflow_policy>void parse(char*col, signed char &x) | ||
1014 | {parse_signed_integer<overflow_policy>(col, x);} | ||
1015 | template<class overflow_policy>void parse(char*col, signed short &x) | ||
1016 | {parse_signed_integer<overflow_policy>(col, x);} | ||
1017 | template<class overflow_policy>void parse(char*col, signed int &x) | ||
1018 | {parse_signed_integer<overflow_policy>(col, x);} | ||
1019 | template<class overflow_policy>void parse(char*col, signed long &x) | ||
1020 | {parse_signed_integer<overflow_policy>(col, x);} | ||
1021 | template<class overflow_policy>void parse(char*col, signed long long &x) | ||
1022 | {parse_signed_integer<overflow_policy>(col, x);} | ||
1023 | |||
1024 | template<class T> | ||
1025 | void parse_float(const char*col, T&x){ | ||
1026 | bool is_neg = false; | ||
1027 | if(*col == '-'){ | ||
1028 | is_neg = true; | ||
1029 | ++col; | ||
1030 | }else if(*col == '+') | ||
1031 | ++col; | ||
1032 | |||
1033 | x = 0; | ||
1034 | while('0' <= *col && *col <= '9'){ | ||
1035 | int y = *col - '0'; | ||
1036 | x *= 10; | ||
1037 | x += y; | ||
1038 | ++col; | ||
1039 | } | ||
1040 | |||
1041 | if(*col == '.'|| *col == ','){ | ||
1042 | ++col; | ||
1043 | T pos = 1; | ||
1044 | while('0' <= *col && *col <= '9'){ | ||
1045 | pos /= 10; | ||
1046 | int y = *col - '0'; | ||
1047 | ++col; | ||
1048 | x += y*pos; | ||
1049 | } | ||
1050 | } | ||
1051 | |||
1052 | if(*col == 'e' || *col == 'E'){ | ||
1053 | ++col; | ||
1054 | int e; | ||
1055 | |||
1056 | parse_signed_integer<set_to_max_on_overflow>(col, e); | ||
1057 | |||
1058 | if(e != 0){ | ||
1059 | T base; | ||
1060 | if(e < 0){ | ||
1061 | base = 0.1; | ||
1062 | e = -e; | ||
1063 | }else{ | ||
1064 | base = 10; | ||
1065 | } | ||
1066 | |||
1067 | while(e != 1){ | ||
1068 | if((e & 1) == 0){ | ||
1069 | base = base*base; | ||
1070 | e >>= 1; | ||
1071 | }else{ | ||
1072 | x *= base; | ||
1073 | --e; | ||
1074 | } | ||
1075 | } | ||
1076 | x *= base; | ||
1077 | } | ||
1078 | }else{ | ||
1079 | if(*col != '\0') | ||
1080 | throw error::no_digit(); | ||
1081 | } | ||
1082 | |||
1083 | if(is_neg) | ||
1084 | x = -x; | ||
1085 | } | ||
1086 | |||
1087 | template<class overflow_policy> void parse(char*col, float&x) { parse_float(col, x); } | ||
1088 | template<class overflow_policy> void parse(char*col, double&x) { parse_float(col, x); } | ||
1089 | template<class overflow_policy> void parse(char*col, long double&x) { parse_float(col, x); } | ||
1090 | |||
/// Catch-all overload: selected only for a type that none of the specific
/// parse() overloads accepts, and fails to compile with an explanatory message.
template<class overflow_policy, class T>
void parse(char*, T&){
    // GCC evaluates a plain "false" as soon as it reads the template, whereas
    // "sizeof(T)!=sizeof(T)" depends on T and is therefore evaluated only on
    // instantiation. That is the reason for this odd-looking condition.
    static_assert(sizeof(T)!=sizeof(T),
        "Can not parse this type. Only buildin integrals, floats, char, char*, const char* and std::string are supported");
}
1102 | |||
1103 | } | ||
1104 | |||
1105 | template<unsigned column_count, | ||
1106 | class trim_policy = trim_chars<' ', '\t'>, | ||
1107 | class quote_policy = no_quote_escape<','>, | ||
1108 | class overflow_policy = throw_on_overflow, | ||
1109 | class comment_policy = no_comment | ||
1110 | > | ||
1111 | class CSVReader{ | ||
1112 | private: | ||
1113 | LineReader in; | ||
1114 | |||
1115 | char*row[column_count]; | ||
1116 | std::string column_names[column_count]; | ||
1117 | |||
1118 | std::vector<int>col_order; | ||
1119 | |||
1120 | template<class ...ColNames> | ||
1121 | void set_column_names(std::string s, ColNames...cols){ | ||
1122 | column_names[column_count-sizeof...(ColNames)-1] = std::move(s); | ||
1123 | set_column_names(std::forward<ColNames>(cols)...); | ||
1124 | } | ||
1125 | |||
1126 | void set_column_names(){} | ||
1127 | |||
1128 | |||
1129 | public: | ||
1130 | CSVReader() = delete; | ||
1131 | CSVReader(const CSVReader&) = delete; | ||
1132 | CSVReader&operator=(const CSVReader&); | ||
1133 | |||
1134 | template<class ...Args> | ||
1135 | explicit CSVReader(Args&&...args):in(std::forward<Args>(args)...){ | ||
1136 | std::fill(row, row+column_count, nullptr); | ||
1137 | col_order.resize(column_count); | ||
1138 | for(unsigned i=0; i<column_count; ++i) | ||
1139 | col_order[i] = i; | ||
1140 | for(unsigned i=1; i<=column_count; ++i) | ||
1141 | column_names[i-1] = "col"+std::to_string(i); | ||
1142 | } | ||
1143 | |||
1144 | char*next_line(){ | ||
1145 | return in.next_line(); | ||
1146 | } | ||
1147 | |||
1148 | template<class ...ColNames> | ||
1149 | void read_header(ignore_column ignore_policy, ColNames...cols){ | ||
1150 | static_assert(sizeof...(ColNames)>=column_count, "not enough column names specified"); | ||
1151 | static_assert(sizeof...(ColNames)<=column_count, "too many column names specified"); | ||
1152 | try{ | ||
1153 | set_column_names(std::forward<ColNames>(cols)...); | ||
1154 | |||
1155 | char*line; | ||
1156 | do{ | ||
1157 | line = in.next_line(); | ||
1158 | if(!line) | ||
1159 | throw error::header_missing(); | ||
1160 | }while(comment_policy::is_comment(line)); | ||
1161 | |||
1162 | detail::parse_header_line | ||
1163 | <column_count, trim_policy, quote_policy> | ||
1164 | (line, col_order, column_names, ignore_policy); | ||
1165 | }catch(error::with_file_name&err){ | ||
1166 | err.set_file_name(in.get_truncated_file_name()); | ||
1167 | throw; | ||
1168 | } | ||
1169 | } | ||
1170 | |||
1171 | template<class ...ColNames> | ||
1172 | void set_header(ColNames...cols){ | ||
1173 | static_assert(sizeof...(ColNames)>=column_count, | ||
1174 | "not enough column names specified"); | ||
1175 | static_assert(sizeof...(ColNames)<=column_count, | ||
1176 | "too many column names specified"); | ||
1177 | set_column_names(std::forward<ColNames>(cols)...); | ||
1178 | std::fill(row, row+column_count, nullptr); | ||
1179 | col_order.resize(column_count); | ||
1180 | for(unsigned i=0; i<column_count; ++i) | ||
1181 | col_order[i] = i; | ||
1182 | } | ||
1183 | |||
1184 | bool has_column(const std::string&name) const { | ||
1185 | return col_order.end() != std::find( | ||
1186 | col_order.begin(), col_order.end(), | ||
1187 | std::find(std::begin(column_names), std::end(column_names), name) | ||
1188 | - std::begin(column_names)); | ||
1189 | } | ||
1190 | |||
1191 | void set_file_name(const std::string&file_name){ | ||
1192 | in.set_file_name(file_name); | ||
1193 | } | ||
1194 | |||
1195 | void set_file_name(const char*file_name){ | ||
1196 | in.set_file_name(file_name); | ||
1197 | } | ||
1198 | |||
1199 | const char*get_truncated_file_name()const{ | ||
1200 | return in.get_truncated_file_name(); | ||
1201 | } | ||
1202 | |||
1203 | void set_file_line(unsigned file_line){ | ||
1204 | in.set_file_line(file_line); | ||
1205 | } | ||
1206 | |||
1207 | unsigned get_file_line()const{ | ||
1208 | return in.get_file_line(); | ||
1209 | } | ||
1210 | |||
1211 | private: | ||
1212 | void parse_helper(std::size_t){} | ||
1213 | |||
1214 | template<class T, class ...ColType> | ||
1215 | void parse_helper(std::size_t r, T&t, ColType&...cols){ | ||
1216 | if(row[r]){ | ||
1217 | try{ | ||
1218 | try{ | ||
1219 | ::io::detail::parse<overflow_policy>(row[r], t); | ||
1220 | }catch(error::with_column_content&err){ | ||
1221 | err.set_column_content(row[r]); | ||
1222 | throw; | ||
1223 | } | ||
1224 | }catch(error::with_column_name&err){ | ||
1225 | err.set_column_name(column_names[r].c_str()); | ||
1226 | throw; | ||
1227 | } | ||
1228 | } | ||
1229 | parse_helper(r+1, cols...); | ||
1230 | } | ||
1231 | |||
1232 | |||
1233 | public: | ||
1234 | template<class ...ColType> | ||
1235 | bool read_row(ColType& ...cols){ | ||
1236 | static_assert(sizeof...(ColType)>=column_count, | ||
1237 | "not enough columns specified"); | ||
1238 | static_assert(sizeof...(ColType)<=column_count, | ||
1239 | "too many columns specified"); | ||
1240 | try{ | ||
1241 | try{ | ||
1242 | |||
1243 | char*line; | ||
1244 | do{ | ||
1245 | line = in.next_line(); | ||
1246 | if(!line) | ||
1247 | return false; | ||
1248 | }while(comment_policy::is_comment(line)); | ||
1249 | |||
1250 | detail::parse_line<trim_policy, quote_policy> | ||
1251 | (line, row, col_order); | ||
1252 | |||
1253 | parse_helper(0, cols...); | ||
1254 | }catch(error::with_file_name&err){ | ||
1255 | err.set_file_name(in.get_truncated_file_name()); | ||
1256 | throw; | ||
1257 | } | ||
1258 | }catch(error::with_file_line&err){ | ||
1259 | err.set_file_line(in.get_file_line()); | ||
1260 | throw; | ||
1261 | } | ||
1262 | |||
1263 | return true; | ||
1264 | } | ||
1265 | }; | ||
1266 | } | ||
1267 | #endif | ||
1268 | |||
diff --git a/vendor/rawr-ebooks b/vendor/rawr-ebooks new file mode 160000 | |||
Subproject d75685e69f9a5d3cfc255aa921005fc40ae6e58 | |||