From 64f26d4a3b80969e08a607f80dde87d49ad5c2e3 Mon Sep 17 00:00:00 2001 From: Star Rauchenberger Date: Fri, 3 Feb 2023 09:15:28 -0500 Subject: Added word frequency information --- generator/generator.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'generator/generator.cpp') diff --git a/generator/generator.cpp b/generator/generator.cpp index 21d0a63..897ccab 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -1,4 +1,5 @@ #include "generator.h" +#include #include #include #include @@ -23,6 +24,7 @@ namespace verbly { std::string wordNetPath, std::string cmudictPath, std::string imageNetPath, + std::string wordfreqPath, std::string outputPath, std::string imageNetOutput) : verbNetPath_(verbNetPath), @@ -30,6 +32,7 @@ namespace verbly { wordNetPath_(wordNetPath), cmudictPath_(cmudictPath), imageNetPath_(imageNetPath), + wordfreqPath_(wordfreqPath), db_(outputPath, hatkirby::dbmode::create), imageNetOutput_(imageNetOutput) { @@ -117,6 +120,10 @@ namespace verbly { // and then only generate pronunciations for already-exisiting forms. readCmudictPronunciations(); + // Reads word (really form) frequency information from a corpus, formatted + // as a CSV file with the form in one column and the frequency in another. 
+    readWordFrequency();
+
     // Writes the database schema
     writeSchema();
 
@@ -624,6 +631,58 @@ namespace verbly {
     }
   }
 
+  // Reads form frequencies from the file at wordfreqPath_ and stores them on
+  // the matching forms. A line contributes only if it contains a run of
+  // lowercase letters, a comma, and a run of digits; the letters are the form
+  // text and the digits are its frequency. Lines without such a pair, and
+  // forms that are not already in formByText_, are skipped.
+  void generator::readWordFrequency()
+  {
+    const auto lines = readFile(wordfreqPath_);
+
+    hatkirby::progress ppgs(
+      "Reading word frequencies...",
+      lines.size());
+
+    // Compiling a std::regex is costly, so build it once instead of once per
+    // line.
+    // NOTE(review): regex_search is unanchored, so a line like "co-op,12" is
+    // read as form "op" with frequency 12. Confirm the corpus format before
+    // tightening this to a whole-line match.
+    const std::regex freqline("([a-z]+),([0-9]+)");
+
+    // NOTE(review): the template argument of std::numeric_limits is missing
+    // from this copy of the patch; int is assumed. Confirm against the
+    // parameter type of setFrequency.
+    const long long maxFrequency = std::numeric_limits<int>::max();
+
+    for (const std::string& line : lines)
+    {
+      ppgs.update();
+
+      std::smatch freqline_data;
+      if (!std::regex_search(line, freqline_data, freqline))
+      {
+        continue;
+      }
+
+      auto formIt = formByText_.find(freqline_data[1].str());
+      if (formIt == formByText_.end())
+      {
+        continue;
+      }
+
+      // Unlike atoll, strtoll has defined behavior when the digits do not fit
+      // in a long long: it returns LLONG_MAX, which is then clamped below.
+      const std::string freqnumstr = freqline_data[2];
+      const long long freqnumnum =
+        std::strtoll(freqnumstr.c_str(), nullptr, 10);
+
+      formIt->second->setFrequency(
+        freqnumnum > maxFrequency ? maxFrequency : freqnumnum);
+    }
+  }
+
   void generator::writeSchema()
   {
     std::ifstream file("schema.sql");
-- 
cgit 1.4.1