From 64f26d4a3b80969e08a607f80dde87d49ad5c2e3 Mon Sep 17 00:00:00 2001 From: Star Rauchenberger Date: Fri, 3 Feb 2023 09:15:28 -0500 Subject: Added word frequency information --- generator/form.cpp | 27 ++++++++++++++++----------- generator/form.h | 11 +++++++++++ generator/generator.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ generator/generator.h | 4 ++++ generator/main.cpp | 7 ++++--- generator/schema.sql | 3 ++- 6 files changed, 77 insertions(+), 15 deletions(-) (limited to 'generator') diff --git a/generator/form.cpp b/generator/form.cpp index a88363b..460469f 100644 --- a/generator/form.cpp +++ b/generator/form.cpp @@ -28,17 +28,22 @@ namespace verbly { { // Serialize the form first. { - db.insertIntoTable( - "forms", - { - { "form_id", arg.getId() }, - { "form", arg.getText() }, - { "complexity", arg.getComplexity() }, - { "proper", arg.isProper() }, - { "length", arg.getLength() }, - { "anagram_set_id", arg.getAnagramSetId() }, - { "reverse_form_id", arg.getReverseId() } - }); + std::list fields = { + { "form_id", arg.getId() }, + { "form", arg.getText() }, + { "complexity", arg.getComplexity() }, + { "proper", arg.isProper() }, + { "length", arg.getLength() }, + { "anagram_set_id", arg.getAnagramSetId() }, + { "reverse_form_id", arg.getReverseId() } + }; + + if (arg.getFrequency() > 0) + { + fields.emplace_back("frequency", arg.getFrequency()); + } + + db.insertIntoTable("forms", std::move(fields)); } // Then, serialize the form/pronunciation relationship. 
diff --git a/generator/form.h b/generator/form.h index c83bbdc..1686f9b 100644 --- a/generator/form.h +++ b/generator/form.h @@ -63,6 +63,16 @@ namespace verbly { return reverse_id_; } + void setFrequency(int freq) + { + frequency_ = freq; + } + + int getFrequency() const + { + return frequency_; + } + std::set getPronunciations() const { return pronunciations_; @@ -79,6 +89,7 @@ namespace verbly { const int length_; const int anagram_set_id_; int reverse_id_ = -1; + int frequency_ = 0; std::set pronunciations_; diff --git a/generator/generator.cpp b/generator/generator.cpp index 21d0a63..897ccab 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -1,4 +1,5 @@ #include "generator.h" +#include #include #include #include @@ -23,6 +24,7 @@ namespace verbly { std::string wordNetPath, std::string cmudictPath, std::string imageNetPath, + std::string wordfreqPath, std::string outputPath, std::string imageNetOutput) : verbNetPath_(verbNetPath), @@ -30,6 +32,7 @@ namespace verbly { wordNetPath_(wordNetPath), cmudictPath_(cmudictPath), imageNetPath_(imageNetPath), + wordfreqPath_(wordfreqPath), db_(outputPath, hatkirby::dbmode::create), imageNetOutput_(imageNetOutput) { @@ -117,6 +120,10 @@ namespace verbly { // and then only generate pronunciations for already-exisiting forms. readCmudictPronunciations(); + // Reads word (really form) frequency information from a corpus, formatted + // as a CSV file with the form in one column and the frequency in another. 
+ readWordFrequency(); + // Writes the database schema writeSchema(); @@ -624,6 +631,39 @@ namespace verbly { } } + void generator::readWordFrequency() + { + std::list<std::string> lines(readFile(wordfreqPath_)); + + hatkirby::progress ppgs( + "Reading word frequencies...", + lines.size()); + + for (std::string line : lines) + { + ppgs.update(); + + std::regex freqline("([a-z]+),([0-9]+)"); + std::smatch freqline_data; + if (std::regex_search(line, freqline_data, freqline)) + { + std::string text = freqline_data[1]; + + if (!formByText_.count(text)) + { + continue; + } + + std::string freqnumstr = freqline_data[2]; + long long freqnumnum = std::atoll(freqnumstr.c_str()); + formByText_[text]->setFrequency( + freqnumnum > std::numeric_limits<int>::max() + ? std::numeric_limits<int>::max() + : freqnumnum); + } + } + } + void generator::writeSchema() { std::ifstream file("schema.sql"); diff --git a/generator/generator.h b/generator/generator.h index 3d51c35..82dec66 100644 --- a/generator/generator.h +++ b/generator/generator.h @@ -35,6 +35,7 @@ namespace verbly { std::string wordNetPath, std::string cmudictPath, std::string imageNetPath, + std::string wordfreqPath, std::string outputPath, std::string imageNetOutput); @@ -62,6 +63,8 @@ namespace verbly { void readCmudictPronunciations(); + void readWordFrequency(); + void writeSchema(); void writeVersion(); @@ -125,6 +128,7 @@ namespace verbly { std::string wordNetPath_; std::string cmudictPath_; std::string imageNetPath_; + std::string wordfreqPath_; // Output diff --git a/generator/main.cpp b/generator/main.cpp index 7db7203..7d6e4dc 100644 --- a/generator/main.cpp +++ b/generator/main.cpp @@ -4,23 +4,24 @@ void printUsage() { - std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output ino" << std::endl; + std::cout << "usage: generator verbnet agid wordnet cmudict imagenet wordfreq output ino" << std::endl; std::cout << "verbnet :: path to a VerbNet data directory" << std::endl; std::cout << "agid :: path to an AGID
infl.txt file" << std::endl; std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl; + std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; std::cout << "output :: datafile output path" << std::endl; std::cout << "ino :: imagenet directory output path" << std::endl; } int main(int argc, char** argv) { - if (argc == 8) + if (argc == 9) { try { - verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]); + verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]); try { diff --git a/generator/schema.sql b/generator/schema.sql index 8c910f4..34c6907 100644 --- a/generator/schema.sql +++ b/generator/schema.sql @@ -162,7 +162,8 @@ CREATE TABLE `forms` ( `proper` SMALLINT NOT NULL, `length` SMALLINT NOT NULL, `anagram_set_id` INTEGER NOT NULL, - `reverse_form_id` INTEGER NOT NULL + `reverse_form_id` INTEGER NOT NULL, + `frequency` INTEGER ); CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); -- cgit 1.4.1