diff options
Diffstat (limited to 'generator/generator.cpp')
| -rw-r--r-- | generator/generator.cpp | 40 |
1 files changed, 40 insertions, 0 deletions
| diff --git a/generator/generator.cpp b/generator/generator.cpp index 21d0a63..897ccab 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | #include "generator.h" | 1 | #include "generator.h" |
| 2 | #include <cstdlib> | ||
| 2 | #include <stdexcept> | 3 | #include <stdexcept> |
| 3 | #include <iostream> | 4 | #include <iostream> |
| 4 | #include <regex> | 5 | #include <regex> |
| @@ -23,6 +24,7 @@ namespace verbly { | |||
| 23 | std::string wordNetPath, | 24 | std::string wordNetPath, |
| 24 | std::string cmudictPath, | 25 | std::string cmudictPath, |
| 25 | std::string imageNetPath, | 26 | std::string imageNetPath, |
| 27 | std::string wordfreqPath, | ||
| 26 | std::string outputPath, | 28 | std::string outputPath, |
| 27 | std::string imageNetOutput) : | 29 | std::string imageNetOutput) : |
| 28 | verbNetPath_(verbNetPath), | 30 | verbNetPath_(verbNetPath), |
| @@ -30,6 +32,7 @@ namespace verbly { | |||
| 30 | wordNetPath_(wordNetPath), | 32 | wordNetPath_(wordNetPath), |
| 31 | cmudictPath_(cmudictPath), | 33 | cmudictPath_(cmudictPath), |
| 32 | imageNetPath_(imageNetPath), | 34 | imageNetPath_(imageNetPath), |
| 35 | wordfreqPath_(wordfreqPath), | ||
| 33 | db_(outputPath, hatkirby::dbmode::create), | 36 | db_(outputPath, hatkirby::dbmode::create), |
| 34 | imageNetOutput_(imageNetOutput) | 37 | imageNetOutput_(imageNetOutput) |
| 35 | { | 38 | { |
| @@ -117,6 +120,10 @@ namespace verbly { | |||
| 117 | // and then only generate pronunciations for already-existing forms. | 120 | // and then only generate pronunciations for already-existing forms. |
| 118 | readCmudictPronunciations(); | 121 | readCmudictPronunciations(); |
| 119 | 122 | ||
| 123 | // Reads word (really form) frequency information from a corpus, formatted | ||
| 124 | // as a CSV file with the form in one column and the frequency in another. | ||
| 125 | readWordFrequency(); | ||
| 126 | |||
| 120 | // Writes the database schema | 127 | // Writes the database schema |
| 121 | writeSchema(); | 128 | writeSchema(); |
| 122 | 129 | ||
| @@ -624,6 +631,39 @@ namespace verbly { | |||
| 624 | } | 631 | } |
| 625 | } | 632 | } |
| 626 | 633 | ||
| 634 | void generator::readWordFrequency() | ||
| 635 | { | ||
| 636 | std::list<std::string> lines(readFile(wordfreqPath_)); | ||
| 637 | |||
| 638 | hatkirby::progress ppgs( | ||
| 639 | "Reading word frequencies...", | ||
| 640 | lines.size()); | ||
| 641 | |||
| 642 | for (std::string line : lines) | ||
| 643 | { | ||
| 644 | ppgs.update(); | ||
| 645 | |||
| 646 | std::regex freqline("([a-z]+),([0-9]+)"); | ||
| 647 | std::smatch freqline_data; | ||
| 648 | if (std::regex_search(line, freqline_data, freqline)) | ||
| 649 | { | ||
| 650 | std::string text = freqline_data[1]; | ||
| 651 | |||
| 652 | if (!formByText_.count(text)) | ||
| 653 | { | ||
| 654 | continue; | ||
| 655 | } | ||
| 656 | |||
| 657 | std::string freqnumstr = freqline_data[2]; | ||
| 658 | long long freqnumnum = std::atoll(freqnumstr.c_str()); | ||
| 659 | formByText_[text]->setFrequency( | ||
| 660 | freqnumnum > std::numeric_limits<int>::max() | ||
| 661 | ? std::numeric_limits<int>::max() | ||
| 662 | : freqnumnum); | ||
| 663 | } | ||
| 664 | } | ||
| 665 | } | ||
| 666 | |||
| 627 | void generator::writeSchema() | 667 | void generator::writeSchema() |
| 628 | { | 668 | { |
| 629 | std::ifstream file("schema.sql"); | 669 | std::ifstream file("schema.sql"); |
