From 2b152d09881559a0330b3ff923e03e715777c6c3 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Wed, 27 Feb 2019 20:45:17 -0500 Subject: Initial commit (by Pink!) --- .gitignore | 5 + .gitmodules | 3 + CMakeLists.txt | 11 + dialogue.cpp | 122 +++++ get.rb | 28 ++ histogram.cpp | 44 ++ histogram.h | 20 + identifier.h | 59 +++ vendor/csv.h | 1268 ++++++++++++++++++++++++++++++++++++++++++++++++++++ vendor/rawr-ebooks | 1 + 10 files changed, 1561 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 CMakeLists.txt create mode 100644 dialogue.cpp create mode 100644 get.rb create mode 100644 histogram.cpp create mode 100644 histogram.h create mode 100644 identifier.h create mode 100644 vendor/csv.h create mode 160000 vendor/rawr-ebooks diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..934d9ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +CMakeFiles +CMakeCache.txt +build +cmake_install.cmake +Makefile diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..57f0e20 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "vendor/rawr-ebooks"] + path = vendor/rawr-ebooks + url = git@github.com:hatkirby/rawr-ebooks diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..9fe3ba2 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,11 @@ +cmake_minimum_required (VERSION 3.1) +project (rawr-ebooks) + +add_subdirectory(vendor/rawr-ebooks) + +include_directories(vendor/rawr-ebooks) + +add_executable(garnet dialogue.cpp histogram.cpp) +set_property(TARGET garnet PROPERTY CXX_STANDARD 17) +set_property(TARGET garnet PROPERTY CXX_STANDARD_REQUIRED ON) +target_link_libraries(garnet rawr) diff --git a/dialogue.cpp b/dialogue.cpp new file mode 100644 index 0000000..dd34ee5 --- /dev/null +++ b/dialogue.cpp @@ -0,0 +1,122 @@ +#include "vendor/csv.h" +#include "identifier.h" +#include "histogram.h" +#include +#include +#include +#include +#include +#include +#include + + + +using speakerstore = identifier; +using speaker_id = speakerstore::key_type; + + +struct speaker_data { + + std::string name; + histogram nextSpeaker; + rawr chain; + +}; + + + + +int main(int, char**) +{ + srand(time(NULL)); + rand(); rand(); rand(); rand(); + + speakerstore speakers; + std::map speakerData; + histogram allSpeakers; + + + + io::CSVReader<2,io::trim_chars<' ', '\t'>,io::double_quote_escape<',', '"'>> in("../dialogue.csv"); + std::string speaker; + std::string line; + + bool hadPrev = false; + speaker_id prevSpeaker; + + while (in.read_row(speaker, line)) + { + speaker_id spId = speakers.add(speaker); + speaker_data& myData = speakerData[spId]; + myData.name = speaker; + + allSpeakers.add(spId); + + if (hadPrev && prevSpeaker != spId) + { + speaker_data& psd = speakerData[prevSpeaker]; + psd.nextSpeaker.add(spId); + } + + myData.chain.addCorpus(line); + + hadPrev = true; + prevSpeaker = spId; + } + + for (auto& sp : speakerData) + { + sp.second.chain.compile(4); + sp.second.nextSpeaker.compile(); + } + + std::cout << "Speakers:" << std::endl; + for (auto& sp : speakerData) + { + std::cout << " " << sp.second.name << std::endl; + } + std::cout << std::endl; + + allSpeakers.compile(); + + for (;;) + { + speaker_id curSpeaker = allSpeakers.next(); + + std::ostringstream theEnd; + + for (int i = 0; i < 5; i++) + { + speaker_data& curSd = speakerData.at(curSpeaker); + + //std::ostringstream thisLine; + + if (curSd.name != "") + { + theEnd << curSd.name << ": "; + } + + theEnd << curSd.chain.randomSentence(1); + + /*if (i > 0 && theEnd.str().length() + thisLine.str().length() > 280) + { + break; + }*/ + + theEnd << std::endl; + //theEnd << thisLine.str(); + + curSpeaker = curSd.nextSpeaker.next(); + } + + std::string output = theEnd.str(); + output.resize(280); + output = output.substr(0, output.find_last_of('\n')); + std::cout << output; + + std::cout << std::endl; + std::cout << std::endl; + + getc(stdin); + } +} diff --git a/get.rb b/get.rb new file mode 100644 index 0000000..a2a213e --- /dev/null +++ b/get.rb @@ -0,0 +1,28 @@ +require 'open-uri' +require 'nokogiri' +require 'csv' + +result = [] +transcripts = open('https://steven-universe.fandom.com/wiki/Category:Transcripts').read +docTrans = Nokogiri::HTML transcripts +docTrans.css(".category-page__member-link").each do |node| + puts node['href'] + subpage = open("https://steven-universe.fandom.com" + node['href']).read + subpagedoc = Nokogiri::HTML subpage + rows = subpagedoc.css(".bgrevo tr") + rows.shift + rows.pop + rows.each do |row| + if row.children.length == 2 + result << ["", row.children[1].content.strip.gsub(/\n/," ")] + elsif row.children.length == 3 + result << [row.children[1].content.strip, row.children[2].content.strip.gsub(/\n/," ")] + end + end +end + +CSV.open("dialogue.csv", "w") do |csv| + result.each do |line| + csv << line + end +end diff --git a/histogram.cpp b/histogram.cpp new file mode 100644 index 0000000..38fca45 --- /dev/null +++ b/histogram.cpp @@ -0,0 +1,44 @@ +#include "histogram.h" +#include +#include + +template +void histogram::add(const T& inst) +{ + freqtable[inst]++; +} + +template +void histogram::compile() +{ + distribution.clear(); + + int max = 0; + for (auto& it : freqtable) + { + max += it.second; + distribution.emplace(max, it.first); + } + + freqtable.clear(); +} + +template +const T& histogram::next() const +{ + int max = distribution.rbegin()->first; + int r = rand() % max; + + return distribution.upper_bound(r)->second; +} + +template +void histogram::print() const +{ + for (auto& freqpair : freqtable) + { + std::cout << freqpair.first << ": " << freqpair.second << std::endl; + } +} + +template class histogram ; diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000..76d8f1b --- /dev/null +++ b/histogram.h @@ -0,0 +1,20 @@ +#ifndef HISTOGRAM_H_24094D97 +#define HISTOGRAM_H_24094D97 + +#include +#include + +template +class histogram { + public: + void add(const T& inst); + void compile(); + const T& next() const; + void print() const; + + private: + std::map freqtable; + std::map distribution; +}; + +#endif /* end of include guard: HISTOGRAM_H_24094D97 */ diff --git a/identifier.h b/identifier.h new file mode 100644 index 0000000..74d83ce --- /dev/null +++ b/identifier.h @@ -0,0 +1,59 @@ +#ifndef IDENTIFIER_H_D7EE5679 +#define IDENTIFIER_H_D7EE5679 + +#include +#include + +template +class identifier { +public: + + using value_type = T; + +private: + + using vector_type = std::vector; + +public: + + using key_type = typename vector_type::size_type; + + key_type add(const value_type& val) + { + auto it = ids_.find(val); + + if (it == std::end(ids_)) + { + key_type ret = ids_.size(); + ids_[val] = ret; + + uniq_.push_back(val); + + return ret; + } else { + return it->second; + } + } + + void compile() + { + ids_.clear(); + } + + inline const value_type& get(key_type i) const + { + return uniq_.at(i); + } + + inline key_type size() const + { + return uniq_.size(); + } + +private: + + std::map ids_; + vector_type uniq_; +}; + +#endif /* end of include guard: IDENTIFIER_H_D7EE5679 */ diff --git a/vendor/csv.h b/vendor/csv.h new file mode 100644 index 0000000..93e9034 --- /dev/null +++ b/vendor/csv.h @@ -0,0 +1,1268 @@ +// Copyright: (2012-2015) Ben Strasser +// License: BSD-3 +// +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +//2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +//3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. + +#ifndef CSV_H +#define CSV_H + +#include +#include +#include +#include +#include +#include +#include +#ifndef CSV_IO_NO_THREAD +#include +#include +#include +#endif +#include +#include +#include +#include + +namespace io{ + //////////////////////////////////////////////////////////////////////////// + // LineReader // + //////////////////////////////////////////////////////////////////////////// + + namespace error{ + struct base : std::exception{ + virtual void format_error_message()const = 0; + + const char*what()const throw(){ + format_error_message(); + return error_message_buffer; + } + + mutable char error_message_buffer[512]; + }; + + const int max_file_name_length = 255; + + struct with_file_name{ + with_file_name(){ + std::memset(file_name, 0, sizeof(file_name)); + } + + void set_file_name(const char*file_name){ + if(file_name != nullptr){ + strncpy(this->file_name, file_name, sizeof(this->file_name)); + this->file_name[sizeof(this->file_name)-1] = '\0'; + }else{ + this->file_name[0] = '\0'; + } + } + + char file_name[max_file_name_length+1]; + }; + + struct with_file_line{ + with_file_line(){ + file_line = -1; + } + + void set_file_line(int file_line){ + this->file_line = file_line; + } + + int file_line; + }; + + struct with_errno{ + with_errno(){ + errno_value = 0; + } + + void set_errno(int errno_value){ + this->errno_value = errno_value; + } + + int errno_value; + }; + + struct can_not_open_file : + base, + with_file_name, + with_errno{ + void format_error_message()const{ + if(errno_value != 0) + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Can not open file \"%s\" because \"%s\"." + , file_name, std::strerror(errno_value)); + else + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Can not open file \"%s\"." + , file_name); + } + }; + + struct line_length_limit_exceeded : + base, + with_file_name, + with_file_line{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Line number %d in file \"%s\" exceeds the maximum length of 2^24-1." + , file_line, file_name); + } + }; + } + + class ByteSourceBase{ + public: + virtual int read(char*buffer, int size)=0; + virtual ~ByteSourceBase(){} + }; + + namespace detail{ + + class OwningStdIOByteSourceBase : public ByteSourceBase{ + public: + explicit OwningStdIOByteSourceBase(FILE*file):file(file){ + // Tell the std library that we want to do the buffering ourself. + std::setvbuf(file, 0, _IONBF, 0); + } + + int read(char*buffer, int size){ + return std::fread(buffer, 1, size, file); + } + + ~OwningStdIOByteSourceBase(){ + std::fclose(file); + } + + private: + FILE*file; + }; + + class NonOwningIStreamByteSource : public ByteSourceBase{ + public: + explicit NonOwningIStreamByteSource(std::istream&in):in(in){} + + int read(char*buffer, int size){ + in.read(buffer, size); + return in.gcount(); + } + + ~NonOwningIStreamByteSource(){} + + private: + std::istream∈ + }; + + class NonOwningStringByteSource : public ByteSourceBase{ + public: + NonOwningStringByteSource(const char*str, long long size):str(str), remaining_byte_count(size){} + + int read(char*buffer, int desired_byte_count){ + int to_copy_byte_count = desired_byte_count; + if(remaining_byte_count < to_copy_byte_count) + to_copy_byte_count = remaining_byte_count; + std::memcpy(buffer, str, to_copy_byte_count); + remaining_byte_count -= to_copy_byte_count; + str += to_copy_byte_count; + return to_copy_byte_count; + } + + ~NonOwningStringByteSource(){} + + private: + const char*str; + long long remaining_byte_count; + }; + + #ifndef CSV_IO_NO_THREAD + class AsynchronousReader{ + public: + void init(std::unique_ptrarg_byte_source){ + std::unique_lockguard(lock); + byte_source = std::move(arg_byte_source); + desired_byte_count = -1; + termination_requested = false; + worker = std::thread( + [&]{ + std::unique_lockguard(lock); + try{ + for(;;){ + read_requested_condition.wait( + guard, + [&]{ + return desired_byte_count != -1 || termination_requested; + } + ); + if(termination_requested) + return; + + read_byte_count = byte_source->read(buffer, desired_byte_count); + desired_byte_count = -1; + if(read_byte_count == 0) + break; + read_finished_condition.notify_one(); + } + }catch(...){ + read_error = std::current_exception(); + } + read_finished_condition.notify_one(); + } + ); + } + + bool is_valid()const{ + return byte_source != nullptr; + } + + void start_read(char*arg_buffer, int arg_desired_byte_count){ + std::unique_lockguard(lock); + buffer = arg_buffer; + desired_byte_count = arg_desired_byte_count; + read_byte_count = -1; + read_requested_condition.notify_one(); + } + + int finish_read(){ + std::unique_lockguard(lock); + read_finished_condition.wait( + guard, + [&]{ + return read_byte_count != -1 || read_error; + } + ); + if(read_error) + std::rethrow_exception(read_error); + else + return read_byte_count; + } + + ~AsynchronousReader(){ + if(byte_source != nullptr){ + { + std::unique_lockguard(lock); + termination_requested = true; + } + read_requested_condition.notify_one(); + worker.join(); + } + } + + private: + std::unique_ptrbyte_source; + + std::thread worker; + + bool termination_requested; + std::exception_ptr read_error; + char*buffer; + int desired_byte_count; + int read_byte_count; + + std::mutex lock; + std::condition_variable read_finished_condition; + std::condition_variable read_requested_condition; + }; + #endif + + class SynchronousReader{ + public: + void init(std::unique_ptrarg_byte_source){ + byte_source = std::move(arg_byte_source); + } + + bool is_valid()const{ + return byte_source != nullptr; + } + + void start_read(char*arg_buffer, int arg_desired_byte_count){ + buffer = arg_buffer; + desired_byte_count = arg_desired_byte_count; + } + + int finish_read(){ + return byte_source->read(buffer, desired_byte_count); + } + private: + std::unique_ptrbyte_source; + char*buffer; + int desired_byte_count; + }; + } + + class LineReader{ + private: + static const int block_len = 1<<24; + std::unique_ptrbuffer; // must be constructed before (and thus destructed after) the reader! + #ifdef CSV_IO_NO_THREAD + detail::SynchronousReader reader; + #else + detail::AsynchronousReader reader; + #endif + int data_begin; + int data_end; + + char file_name[error::max_file_name_length+1]; + unsigned file_line; + + static std::unique_ptr open_file(const char*file_name){ + // We open the file in binary mode as it makes no difference under *nix + // and under Windows we handle \r\n newlines ourself. + FILE*file = std::fopen(file_name, "rb"); + if(file == 0){ + int x = errno; // store errno as soon as possible, doing it after constructor call can fail. + error::can_not_open_file err; + err.set_errno(x); + err.set_file_name(file_name); + throw err; + } + return std::unique_ptr(new detail::OwningStdIOByteSourceBase(file)); + } + + void init(std::unique_ptrbyte_source){ + file_line = 0; + + buffer = std::unique_ptr(new char[3*block_len]); + data_begin = 0; + data_end = byte_source->read(buffer.get(), 2*block_len); + + // Ignore UTF-8 BOM + if(data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' && buffer[2] == '\xBF') + data_begin = 3; + + if(data_end == 2*block_len){ + reader.init(std::move(byte_source)); + reader.start_read(buffer.get() + 2*block_len, block_len); + } + } + + public: + LineReader() = delete; + LineReader(const LineReader&) = delete; + LineReader&operator=(const LineReader&) = delete; + + explicit LineReader(const char*file_name){ + set_file_name(file_name); + init(open_file(file_name)); + } + + explicit LineReader(const std::string&file_name){ + set_file_name(file_name.c_str()); + init(open_file(file_name.c_str())); + } + + LineReader(const char*file_name, std::unique_ptrbyte_source){ + set_file_name(file_name); + init(std::move(byte_source)); + } + + LineReader(const std::string&file_name, std::unique_ptrbyte_source){ + set_file_name(file_name.c_str()); + init(std::move(byte_source)); + } + + LineReader(const char*file_name, const char*data_begin, const char*data_end){ + set_file_name(file_name); + init(std::unique_ptr(new detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); + } + + LineReader(const std::string&file_name, const char*data_begin, const char*data_end){ + set_file_name(file_name.c_str()); + init(std::unique_ptr(new detail::NonOwningStringByteSource(data_begin, data_end-data_begin))); + } + + LineReader(const char*file_name, FILE*file){ + set_file_name(file_name); + init(std::unique_ptr(new detail::OwningStdIOByteSourceBase(file))); + } + + LineReader(const std::string&file_name, FILE*file){ + set_file_name(file_name.c_str()); + init(std::unique_ptr(new detail::OwningStdIOByteSourceBase(file))); + } + + LineReader(const char*file_name, std::istream&in){ + set_file_name(file_name); + init(std::unique_ptr(new detail::NonOwningIStreamByteSource(in))); + } + + LineReader(const std::string&file_name, std::istream&in){ + set_file_name(file_name.c_str()); + init(std::unique_ptr(new detail::NonOwningIStreamByteSource(in))); + } + + void set_file_name(const std::string&file_name){ + set_file_name(file_name.c_str()); + } + + void set_file_name(const char*file_name){ + if(file_name != nullptr){ + strncpy(this->file_name, file_name, sizeof(this->file_name)); + this->file_name[sizeof(this->file_name)-1] = '\0'; + }else{ + this->file_name[0] = '\0'; + } + } + + const char*get_truncated_file_name()const{ + return file_name; + } + + void set_file_line(unsigned file_line){ + this->file_line = file_line; + } + + unsigned get_file_line()const{ + return file_line; + } + + char*next_line(){ + if(data_begin == data_end) + return 0; + + ++file_line; + + assert(data_begin < data_end); + assert(data_end <= block_len*2); + + if(data_begin >= block_len){ + std::memcpy(buffer.get(), buffer.get()+block_len, block_len); + data_begin -= block_len; + data_end -= block_len; + if(reader.is_valid()) + { + data_end += reader.finish_read(); + std::memcpy(buffer.get()+block_len, buffer.get()+2*block_len, block_len); + reader.start_read(buffer.get() + 2*block_len, block_len); + } + } + + int line_end = data_begin; + while(buffer[line_end] != '\n' && line_end != data_end){ + ++line_end; + } + + if(line_end - data_begin + 1 > block_len){ + error::line_length_limit_exceeded err; + err.set_file_name(file_name); + err.set_file_line(file_line); + throw err; + } + + if(buffer[line_end] == '\n' && line_end != data_end){ + buffer[line_end] = '\0'; + }else{ + // some files are missing the newline at the end of the + // last line + ++data_end; + buffer[line_end] = '\0'; + } + + // handle windows \r\n-line breaks + if(line_end != data_begin && buffer[line_end-1] == '\r') + buffer[line_end-1] = '\0'; + + char*ret = buffer.get() + data_begin; + data_begin = line_end+1; + return ret; + } + }; + + + //////////////////////////////////////////////////////////////////////////// + // CSV // + //////////////////////////////////////////////////////////////////////////// + + namespace error{ + const int max_column_name_length = 63; + struct with_column_name{ + with_column_name(){ + std::memset(column_name, 0, max_column_name_length+1); + } + + void set_column_name(const char*column_name){ + if(column_name != nullptr){ + std::strncpy(this->column_name, column_name, max_column_name_length); + this->column_name[max_column_name_length] = '\0'; + }else{ + this->column_name[0] = '\0'; + } + } + + char column_name[max_column_name_length+1]; + }; + + + const int max_column_content_length = 63; + + struct with_column_content{ + with_column_content(){ + std::memset(column_content, 0, max_column_content_length+1); + } + + void set_column_content(const char*column_content){ + if(column_content != nullptr){ + std::strncpy(this->column_content, column_content, max_column_content_length); + this->column_content[max_column_content_length] = '\0'; + }else{ + this->column_content[0] = '\0'; + } + } + + char column_content[max_column_content_length+1]; + }; + + + struct extra_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Extra column \"%s\" in header of file \"%s\"." + , column_name, file_name); + } + }; + + struct missing_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Missing column \"%s\" in header of file \"%s\"." + , column_name, file_name); + } + }; + + struct duplicated_column_in_header : + base, + with_file_name, + with_column_name{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Duplicated column \"%s\" in header of file \"%s\"." + , column_name, file_name); + } + }; + + struct header_missing : + base, + with_file_name{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Header missing in file \"%s\"." + , file_name); + } + }; + + struct too_few_columns : + base, + with_file_name, + with_file_line{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Too few columns in line %d in file \"%s\"." + , file_line, file_name); + } + }; + + struct too_many_columns : + base, + with_file_name, + with_file_line{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Too many columns in line %d in file \"%s\"." + , file_line, file_name); + } + }; + + struct escaped_string_not_closed : + base, + with_file_name, + with_file_line{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "Escaped string was not closed in line %d in file \"%s\"." + , file_line, file_name); + } + }; + + struct integer_must_be_positive : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The integer \"%s\" must be positive or 0 in column \"%s\" in file \"%s\" in line \"%d\"." + , column_content, column_name, file_name, file_line); + } + }; + + struct no_digit : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The integer \"%s\" contains an invalid digit in column \"%s\" in file \"%s\" in line \"%d\"." + , column_content, column_name, file_name, file_line); + } + }; + + struct integer_overflow : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The integer \"%s\" overflows in column \"%s\" in file \"%s\" in line \"%d\"." + , column_content, column_name, file_name, file_line); + } + }; + + struct integer_underflow : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The integer \"%s\" underflows in column \"%s\" in file \"%s\" in line \"%d\"." + , column_content, column_name, file_name, file_line); + } + }; + + struct invalid_single_character : + base, + with_file_name, + with_file_line, + with_column_name, + with_column_content{ + void format_error_message()const{ + std::snprintf(error_message_buffer, sizeof(error_message_buffer), + "The content \"%s\" of column \"%s\" in file \"%s\" in line \"%d\" is not a single character." + , column_content, column_name, file_name, file_line); + } + }; + } + + typedef unsigned ignore_column; + static const ignore_column ignore_no_column = 0; + static const ignore_column ignore_extra_column = 1; + static const ignore_column ignore_missing_column = 2; + + template + struct trim_chars{ + private: + constexpr static bool is_trim_char(char){ + return false; + } + + template + constexpr static bool is_trim_char(char c, char trim_char, OtherTrimChars...other_trim_chars){ + return c == trim_char || is_trim_char(c, other_trim_chars...); + } + + public: + static void trim(char*&str_begin, char*&str_end){ + while(str_begin != str_end && is_trim_char(*str_begin, trim_char_list...)) + ++str_begin; + while(str_begin != str_end && is_trim_char(*(str_end-1), trim_char_list...)) + --str_end; + *str_end = '\0'; + } + }; + + + struct no_comment{ + static bool is_comment(const char*){ + return false; + } + }; + + template + struct single_line_comment{ + private: + constexpr static bool is_comment_start_char(char){ + return false; + } + + template + constexpr static bool is_comment_start_char(char c, char comment_start_char, OtherCommentStartChars...other_comment_start_chars){ + return c == comment_start_char || is_comment_start_char(c, other_comment_start_chars...); + } + + public: + + static bool is_comment(const char*line){ + return is_comment_start_char(*line, comment_start_char_list...); + } + }; + + struct empty_line_comment{ + static bool is_comment(const char*line){ + if(*line == '\0') + return true; + while(*line == ' ' || *line == '\t'){ + ++line; + if(*line == 0) + return true; + } + return false; + } + }; + + template + struct single_and_empty_line_comment{ + static bool is_comment(const char*line){ + return single_line_comment::is_comment(line) || empty_line_comment::is_comment(line); + } + }; + + template + struct no_quote_escape{ + static const char*find_next_column_end(const char*col_begin){ + while(*col_begin != sep && *col_begin != '\0') + ++col_begin; + return col_begin; + } + + static void unescape(char*&, char*&){ + + } + }; + + template + struct double_quote_escape{ + static const char*find_next_column_end(const char*col_begin){ + while(*col_begin != sep && *col_begin != '\0') + if(*col_begin != quote) + ++col_begin; + else{ + do{ + ++col_begin; + while(*col_begin != quote){ + if(*col_begin == '\0') + throw error::escaped_string_not_closed(); + ++col_begin; + } + ++col_begin; + }while(*col_begin == quote); + } + return col_begin; + } + + static void unescape(char*&col_begin, char*&col_end){ + if(col_end - col_begin >= 2){ + if(*col_begin == quote && *(col_end-1) == quote){ + ++col_begin; + --col_end; + char*out = col_begin; + for(char*in = col_begin; in!=col_end; ++in){ + if(*in == quote && (in+1) != col_end && *(in+1) == quote){ + ++in; + } + *out = *in; + ++out; + } + col_end = out; + *col_end = '\0'; + } + } + + } + }; + + struct throw_on_overflow{ + template + static void on_overflow(T&){ + throw error::integer_overflow(); + } + + template + static void on_underflow(T&){ + throw error::integer_underflow(); + } + }; + + struct ignore_overflow{ + template + static void on_overflow(T&){} + + template + static void on_underflow(T&){} + }; + + struct set_to_max_on_overflow{ + template + static void on_overflow(T&x){ + x = std::numeric_limits::max(); + } + + template + static void on_underflow(T&x){ + x = std::numeric_limits::min(); + } + }; + + + namespace detail{ + template + void chop_next_column( + char*&line, char*&col_begin, char*&col_end + ){ + assert(line != nullptr); + + col_begin = line; + // the col_begin + (... - col_begin) removes the constness + col_end = col_begin + (quote_policy::find_next_column_end(col_begin) - col_begin); + + if(*col_end == '\0'){ + line = nullptr; + }else{ + *col_end = '\0'; + line = col_end + 1; + } + } + + template + void parse_line( + char*line, + char**sorted_col, + const std::vector&col_order + ){ + for(std::size_t i=0; i(line, col_begin, col_end); + + if(col_order[i] != -1){ + trim_policy::trim(col_begin, col_end); + quote_policy::unescape(col_begin, col_end); + + sorted_col[col_order[i]] = col_begin; + } + } + if(line != nullptr) + throw ::io::error::too_many_columns(); + } + + template + void parse_header_line( + char*line, + std::vector&col_order, + const std::string*col_name, + ignore_column ignore_policy + ){ + col_order.clear(); + + bool found[column_count]; + std::fill(found, found + column_count, false); + while(line){ + char*col_begin,*col_end; + chop_next_column(line, col_begin, col_end); + + trim_policy::trim(col_begin, col_end); + quote_policy::unescape(col_begin, col_end); + + for(unsigned i=0; i + void parse(char*col, char &x){ + if(!*col) + throw error::invalid_single_character(); + x = *col; + ++col; + if(*col) + throw error::invalid_single_character(); + } + + template + void parse(char*col, std::string&x){ + x = col; + } + + template + void parse(char*col, const char*&x){ + x = col; + } + + template + void parse(char*col, char*&x){ + x = col; + } + + template + void parse_unsigned_integer(const char*col, T&x){ + x = 0; + while(*col != '\0'){ + if('0' <= *col && *col <= '9'){ + T y = *col - '0'; + if(x > (std::numeric_limits::max()-y)/10){ + overflow_policy::on_overflow(x); + return; + } + x = 10*x+y; + }else + throw error::no_digit(); + ++col; + } + } + + templatevoid parse(char*col, unsigned char &x) + {parse_unsigned_integer(col, x);} + templatevoid parse(char*col, unsigned short &x) + {parse_unsigned_integer(col, x);} + templatevoid parse(char*col, unsigned int &x) + {parse_unsigned_integer(col, x);} + templatevoid parse(char*col, unsigned long &x) + {parse_unsigned_integer(col, x);} + templatevoid parse(char*col, unsigned long long &x) + {parse_unsigned_integer(col, x);} + + template + void parse_signed_integer(const char*col, T&x){ + if(*col == '-'){ + ++col; + + x = 0; + while(*col != '\0'){ + if('0' <= *col && *col <= '9'){ + T y = *col - '0'; + if(x < (std::numeric_limits::min()+y)/10){ + overflow_policy::on_underflow(x); + return; + } + x = 10*x-y; + }else + throw error::no_digit(); + ++col; + } + return; + }else if(*col == '+') + ++col; + parse_unsigned_integer(col, x); + } + + templatevoid parse(char*col, signed char &x) + {parse_signed_integer(col, x);} + templatevoid parse(char*col, signed short &x) + {parse_signed_integer(col, x);} + templatevoid parse(char*col, signed int &x) + {parse_signed_integer(col, x);} + templatevoid parse(char*col, signed long &x) + {parse_signed_integer(col, x);} + templatevoid parse(char*col, signed long long &x) + {parse_signed_integer(col, x);} + + template + void parse_float(const char*col, T&x){ + bool is_neg = false; + if(*col == '-'){ + is_neg = true; + ++col; + }else if(*col == '+') + ++col; + + x = 0; + while('0' <= *col && *col <= '9'){ + int y = *col - '0'; + x *= 10; + x += y; + ++col; + } + + if(*col == '.'|| *col == ','){ + ++col; + T pos = 1; + while('0' <= *col && *col <= '9'){ + pos /= 10; + int y = *col - '0'; + ++col; + x += y*pos; + } + } + + if(*col == 'e' || *col == 'E'){ + ++col; + int e; + + parse_signed_integer(col, e); + + if(e != 0){ + T base; + if(e < 0){ + base = 0.1; + e = -e; + }else{ + base = 10; + } + + while(e != 1){ + if((e & 1) == 0){ + base = base*base; + e >>= 1; + }else{ + x *= base; + --e; + } + } + x *= base; + } + }else{ + if(*col != '\0') + throw error::no_digit(); + } + + if(is_neg) + x = -x; + } + + template void parse(char*col, float&x) { parse_float(col, x); } + template void parse(char*col, double&x) { parse_float(col, x); } + template void parse(char*col, long double&x) { parse_float(col, x); } + + template + void parse(char*col, T&x){ + // Mute unused variable compiler warning + (void)col; + (void)x; + // GCC evalutes "false" when reading the template and + // "sizeof(T)!=sizeof(T)" only when instantiating it. This is why + // this strange construct is used. + static_assert(sizeof(T)!=sizeof(T), + "Can not parse this type. Only buildin integrals, floats, char, char*, const char* and std::string are supported"); + } + + } + + template, + class quote_policy = no_quote_escape<','>, + class overflow_policy = throw_on_overflow, + class comment_policy = no_comment + > + class CSVReader{ + private: + LineReader in; + + char*row[column_count]; + std::string column_names[column_count]; + + std::vectorcol_order; + + template + void set_column_names(std::string s, ColNames...cols){ + column_names[column_count-sizeof...(ColNames)-1] = std::move(s); + set_column_names(std::forward(cols)...); + } + + void set_column_names(){} + + + public: + CSVReader() = delete; + CSVReader(const CSVReader&) = delete; + CSVReader&operator=(const CSVReader&); + + template + explicit CSVReader(Args&&...args):in(std::forward(args)...){ + std::fill(row, row+column_count, nullptr); + col_order.resize(column_count); + for(unsigned i=0; i + void read_header(ignore_column ignore_policy, ColNames...cols){ + static_assert(sizeof...(ColNames)>=column_count, "not enough column names specified"); + static_assert(sizeof...(ColNames)<=column_count, "too many column names specified"); + try{ + set_column_names(std::forward(cols)...); + + char*line; + do{ + line = in.next_line(); + if(!line) + throw error::header_missing(); + }while(comment_policy::is_comment(line)); + + detail::parse_header_line + + (line, col_order, column_names, ignore_policy); + }catch(error::with_file_name&err){ + err.set_file_name(in.get_truncated_file_name()); + throw; + } + } + + template + void set_header(ColNames...cols){ + static_assert(sizeof...(ColNames)>=column_count, + "not enough column names specified"); + static_assert(sizeof...(ColNames)<=column_count, + "too many column names specified"); + set_column_names(std::forward(cols)...); + std::fill(row, row+column_count, nullptr); + col_order.resize(column_count); + for(unsigned i=0; i + void parse_helper(std::size_t r, T&t, ColType&...cols){ + if(row[r]){ + try{ + try{ + ::io::detail::parse(row[r], t); + }catch(error::with_column_content&err){ + err.set_column_content(row[r]); + throw; + } + }catch(error::with_column_name&err){ + err.set_column_name(column_names[r].c_str()); + throw; + } + } + parse_helper(r+1, cols...); + } + + + public: + template + bool read_row(ColType& ...cols){ + static_assert(sizeof...(ColType)>=column_count, + "not enough columns specified"); + static_assert(sizeof...(ColType)<=column_count, + "too many columns specified"); + try{ + try{ + + char*line; + do{ + line = in.next_line(); + if(!line) + return false; + }while(comment_policy::is_comment(line)); + + detail::parse_line + (line, row, col_order); + + parse_helper(0, cols...); + }catch(error::with_file_name&err){ + err.set_file_name(in.get_truncated_file_name()); + throw; + } + }catch(error::with_file_line&err){ + err.set_file_line(in.get_file_line()); + throw; + } + + return true; + } + }; +} +#endif + diff --git a/vendor/rawr-ebooks b/vendor/rawr-ebooks new file mode 160000 index 0000000..d75685e --- /dev/null +++ b/vendor/rawr-ebooks @@ -0,0 +1 @@ +Subproject commit d75685e69f9a5d3cfc255aa921005fc40ae6e585 -- cgit 1.4.1