From 64f26d4a3b80969e08a607f80dde87d49ad5c2e3 Mon Sep 17 00:00:00 2001
From: Star Rauchenberger
Date: Fri, 3 Feb 2023 09:15:28 -0500
Subject: Added word frequency information

---
Review notes (text between "---" and the diffstat is not part of the commit
message):

* lib/form.h: form::getFrequency() was declared `bool`, so every stored
  count was narrowed to true/false. It now returns int, and frequency_ has
  a default initializer.
* generator/generator.cpp: readWordFrequency() compiles its regex once,
  reads lines by const reference, looks each form up once with find(), and
  parses with strtoll (atoll is undefined on overflow, strtoll saturates).
* lib/form.cpp: the row is tested for the int alternative that is then read.
* This copy of the patch had lost its line breaks, its indentation and
  everything written inside <...>. Indentation is reconstructed. Template
  arguments are restored where the surrounding code fixes them, and the
  added include is written as <limits>, which std::numeric_limits needs.
  std::list<hatkirby::column> in generator/form.cpp is an assumption to
  confirm against hkutil. The three context #include lines in
  generator/generator.cpp and the element types of the getPronunciations()
  / pronunciations_ containers could not be recovered and are left as found.
* Hunk sizes and the diffstat include the added comment lines. The blob ids
  on the index lines are the original ones and no longer match.

 generator/form.cpp      | 28 +++++++++++++++++-----------
 generator/form.h        | 13 +++++++++++++
 generator/generator.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 generator/generator.h   |  4 ++++
 generator/main.cpp      |  7 ++++---
 generator/schema.sql    |  3 ++-
 lib/form.cpp            | 10 +++++++++-
 lib/form.h              | 31 +++++++++++++++++++++++++++++++
 8 files changed, 128 insertions(+), 16 deletions(-)

diff --git a/generator/form.cpp b/generator/form.cpp
index a88363b..460469f 100644
--- a/generator/form.cpp
+++ b/generator/form.cpp
@@ -28,17 +28,23 @@ namespace verbly {
     {
       // Serialize the form first.
       {
-        db.insertIntoTable(
-          "forms",
-          {
-            { "form_id", arg.getId() },
-            { "form", arg.getText() },
-            { "complexity", arg.getComplexity() },
-            { "proper", arg.isProper() },
-            { "length", arg.getLength() },
-            { "anagram_set_id", arg.getAnagramSetId() },
-            { "reverse_form_id", arg.getReverseId() }
-          });
+        std::list<hatkirby::column> fields = {
+          { "form_id", arg.getId() },
+          { "form", arg.getText() },
+          { "complexity", arg.getComplexity() },
+          { "proper", arg.isProper() },
+          { "length", arg.getLength() },
+          { "anagram_set_id", arg.getAnagramSetId() },
+          { "reverse_form_id", arg.getReverseId() }
+        };
+
+        // Omit the column unless a positive count was recorded.
+        if (arg.getFrequency() > 0)
+        {
+          fields.emplace_back("frequency", arg.getFrequency());
+        }
+
+        db.insertIntoTable("forms", std::move(fields));
       }
 
       // Then, serialize the form/pronunciation relationship.
diff --git a/generator/form.h b/generator/form.h
index c83bbdc..1686f9b 100644
--- a/generator/form.h
+++ b/generator/form.h
@@ -63,6 +63,18 @@ namespace verbly {
         return reverse_id_;
       }
 
+      // Records the corpus frequency read for this form.
+      void setFrequency(int freq)
+      {
+        frequency_ = freq;
+      }
+
+      // Returns the recorded frequency; 0 when none was set.
+      int getFrequency() const
+      {
+        return frequency_;
+      }
+
       std::set getPronunciations() const
       {
         return pronunciations_;
@@ -79,6 +91,7 @@ namespace verbly {
       const int length_;
       const int anagram_set_id_;
       int reverse_id_ = -1;
+      int frequency_ = 0; // 0 = no frequency recorded
 
       std::set pronunciations_;
 
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 21d0a63..897ccab 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp
@@ -1,4 +1,5 @@
 #include "generator.h"
+#include <limits>
 #include
 #include
 #include
@@ -23,6 +24,7 @@ namespace verbly {
       std::string wordNetPath,
       std::string cmudictPath,
       std::string imageNetPath,
+      std::string wordfreqPath,
       std::string outputPath,
       std::string imageNetOutput) :
         verbNetPath_(verbNetPath),
@@ -30,6 +32,7 @@ namespace verbly {
         wordNetPath_(wordNetPath),
         cmudictPath_(cmudictPath),
         imageNetPath_(imageNetPath),
+        wordfreqPath_(wordfreqPath),
         db_(outputPath, hatkirby::dbmode::create),
         imageNetOutput_(imageNetOutput)
     {
@@ -117,6 +120,10 @@ namespace verbly {
       // and then only generate pronunciations for already-exisiting forms.
       readCmudictPronunciations();
 
+      // Reads word (really form) frequency information from a corpus, formatted
+      // as a CSV file with the form in one column and the frequency in another.
+      readWordFrequency();
+
       // Writes the database schema
       writeSchema();
 
@@ -624,6 +631,47 @@ namespace verbly {
       }
     }
 
+    // Loads per-form corpus frequencies from the CSV at wordfreqPath_ and
+    // stores them on forms that already exist; unknown words are skipped.
+    void generator::readWordFrequency()
+    {
+      std::list<std::string> lines(readFile(wordfreqPath_));
+
+      hatkirby::progress ppgs(
+        "Reading word frequencies...",
+        lines.size());
+
+      // Compiled once, outside the loop. NOTE(review): the pattern is
+      // unanchored, so a row like "don't,5" is credited to "t"; consider
+      // anchoring it once the CSV layout is confirmed.
+      const std::regex freqline("([a-z]+),([0-9]+)");
+      const long long maxFrequency = std::numeric_limits<int>::max();
+
+      for (const std::string& line : lines)
+      {
+        ppgs.update();
+
+        std::smatch freqline_data;
+        if (!std::regex_search(line, freqline_data, freqline))
+        {
+          continue;
+        }
+
+        auto formIt = formByText_.find(freqline_data[1].str());
+        if (formIt == formByText_.end())
+        {
+          continue;
+        }
+
+        // strtoll saturates at LLONG_MAX on overflow (atoll is undefined
+        // there), so oversized counts still clamp to INT_MAX below.
+        const long long parsed =
+          std::strtoll(freqline_data[2].str().c_str(), nullptr, 10);
+        formIt->second->setFrequency(
+          static_cast<int>(parsed > maxFrequency ? maxFrequency : parsed));
+      }
+    }
+
     void generator::writeSchema()
     {
       std::ifstream file("schema.sql");
diff --git a/generator/generator.h b/generator/generator.h
index 3d51c35..82dec66 100644
--- a/generator/generator.h
+++ b/generator/generator.h
@@ -35,6 +35,7 @@ namespace verbly {
         std::string wordNetPath,
         std::string cmudictPath,
         std::string imageNetPath,
+        std::string wordfreqPath,
         std::string outputPath,
         std::string imageNetOutput);
 
@@ -62,6 +63,8 @@ namespace verbly {
 
       void readCmudictPronunciations();
 
+      void readWordFrequency();
+
       void writeSchema();
 
       void writeVersion();
@@ -125,6 +128,7 @@ namespace verbly {
       std::string wordNetPath_;
       std::string cmudictPath_;
       std::string imageNetPath_;
+      std::string wordfreqPath_;
 
       // Output
 
diff --git a/generator/main.cpp b/generator/main.cpp
index 7db7203..7d6e4dc 100644
--- a/generator/main.cpp
+++ b/generator/main.cpp
@@ -4,23 +4,24 @@
 
 void printUsage()
 {
-  std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output ino" << std::endl;
+  std::cout << "usage: generator verbnet agid wordnet cmudict imagenet wordfreq output ino" << std::endl;
   std::cout << "verbnet :: path to a VerbNet data directory" << std::endl;
   std::cout << "agid :: path to an AGID infl.txt file" << std::endl;
   std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl;
   std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl;
   std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl;
+  std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl;
   std::cout << "output :: datafile output path" << std::endl;
   std::cout << "ino :: imagenet directory output path" << std::endl;
 }
 
 int main(int argc, char** argv)
 {
-  if (argc == 8)
+  if (argc == 9)
   {
     try
     {
-      verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]);
+      verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]);
 
       try
       {
diff --git a/generator/schema.sql b/generator/schema.sql
index 8c910f4..34c6907 100644
--- a/generator/schema.sql
+++ b/generator/schema.sql
@@ -162,7 +162,8 @@ CREATE TABLE `forms` (
   `proper` SMALLINT NOT NULL,
   `length` SMALLINT NOT NULL,
   `anagram_set_id` INTEGER NOT NULL,
-  `reverse_form_id` INTEGER NOT NULL
+  `reverse_form_id` INTEGER NOT NULL,
+  `frequency` INTEGER
 );
 
 CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`);
diff --git a/lib/form.cpp b/lib/form.cpp
index 256dc59..fa72c8a 100644
--- a/lib/form.cpp
+++ b/lib/form.cpp
@@ -8,13 +8,14 @@ namespace verbly {
 
   const object form::objectType = object::form;
 
-  const std::list<std::string> form::select = {"form_id", "form", "complexity", "proper", "length"};
+  const std::list<std::string> form::select = {"form_id", "form", "complexity", "proper", "length", "frequency"};
 
   const field form::id = field::integerField(object::form, "form_id");
   const field form::text = field::stringField(object::form, "form");
   const field form::complexity = field::integerField(object::form, "complexity");
   const field form::proper = field::booleanField(object::form, "proper");
   const field form::length = field::integerField(object::form, "length");
+  const field form::frequency = field::integerField(object::form, "frequency");
 
   const field form::pronunciations = field::joinThrough(object::form, "form_id", object::pronunciation, "forms_pronunciations", "pronunciation_id");
 
@@ -36,6 +37,13 @@ namespace verbly {
     proper_ = (std::get<int>(row[3]) == 1);
     length_ = std::get<int>(row[4]);
 
+    // Forms without a stored frequency keep hasFreq_ == false.
+    if (mpark::holds_alternative<int>(row[5]))
+    {
+      hasFreq_ = true;
+      frequency_ = mpark::get<int>(row[5]);
+    }
+
     pronunciations_ = db.pronunciations(*this, pronunciation::id, -1).all();
   }
 
diff --git a/lib/form.h b/lib/form.h
index 39f53aa..fb6b733 100644
--- a/lib/form.h
+++ b/lib/form.h
@@ -82,6 +82,34 @@ namespace verbly {
       return length_;
     }
 
+    // True when the database row carried a frequency for this form.
+    bool hasFrequency() const
+    {
+      if (!valid_)
+      {
+        throw std::domain_error("Bad access to uninitialized form");
+      }
+
+      return hasFreq_;
+    }
+
+    // Returns the stored frequency. Throws std::domain_error when the form
+    // is uninitialized or has no frequency; check hasFrequency() first.
+    int getFrequency() const
+    {
+      if (!valid_)
+      {
+        throw std::domain_error("Bad access to uninitialized form");
+      }
+
+      if (!hasFreq_)
+      {
+        throw std::domain_error("Form does not have a frequency");
+      }
+
+      return frequency_;
+    }
+
     const std::vector& getPronunciations() const
     {
       if (!valid_)
@@ -109,6 +137,7 @@ namespace verbly {
     static const field complexity;
     static const field proper;
     static const field length;
+    static const field frequency;
 
     operator filter() const
     {
@@ -149,6 +178,8 @@ namespace verbly {
     int complexity_;
     bool proper_;
     int length_;
+    bool hasFreq_ = false;
+    int frequency_ = 0; // meaningful only when hasFreq_ is true
 
     std::vector pronunciations_;
   };
-- 
cgit 1.4.1