diff options
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r-- | generator/generator.cpp | 40 |
1 files changed, 40 insertions, 0 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 21d0a63..897ccab 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -1,4 +1,5 @@ | |||
1 | #include "generator.h" | 1 | #include "generator.h" |
2 | #include <cstdlib> | ||
2 | #include <stdexcept> | 3 | #include <stdexcept> |
3 | #include <iostream> | 4 | #include <iostream> |
4 | #include <regex> | 5 | #include <regex> |
@@ -23,6 +24,7 @@ namespace verbly { | |||
23 | std::string wordNetPath, | 24 | std::string wordNetPath, |
24 | std::string cmudictPath, | 25 | std::string cmudictPath, |
25 | std::string imageNetPath, | 26 | std::string imageNetPath, |
27 | std::string wordfreqPath, | ||
26 | std::string outputPath, | 28 | std::string outputPath, |
27 | std::string imageNetOutput) : | 29 | std::string imageNetOutput) : |
28 | verbNetPath_(verbNetPath), | 30 | verbNetPath_(verbNetPath), |
@@ -30,6 +32,7 @@ namespace verbly { | |||
30 | wordNetPath_(wordNetPath), | 32 | wordNetPath_(wordNetPath), |
31 | cmudictPath_(cmudictPath), | 33 | cmudictPath_(cmudictPath), |
32 | imageNetPath_(imageNetPath), | 34 | imageNetPath_(imageNetPath), |
35 | wordfreqPath_(wordfreqPath), | ||
33 | db_(outputPath, hatkirby::dbmode::create), | 36 | db_(outputPath, hatkirby::dbmode::create), |
34 | imageNetOutput_(imageNetOutput) | 37 | imageNetOutput_(imageNetOutput) |
35 | { | 38 | { |
@@ -117,6 +120,10 @@ namespace verbly { | |||
117 | // and then only generate pronunciations for already-existing forms. | 120 | // and then only generate pronunciations for already-existing forms. |
118 | readCmudictPronunciations(); | 121 | readCmudictPronunciations(); |
119 | 122 | ||
123 | // Reads word (really form) frequency information from a corpus, formatted | ||
124 | // as a CSV file with the form in one column and the frequency in another. | ||
125 | readWordFrequency(); | ||
126 | |||
120 | // Writes the database schema | 127 | // Writes the database schema |
121 | writeSchema(); | 128 | writeSchema(); |
122 | 129 | ||
@@ -624,6 +631,39 @@ namespace verbly { | |||
624 | } | 631 | } |
625 | } | 632 | } |
626 | 633 | ||
634 | void generator::readWordFrequency() | ||
635 | { | ||
636 | std::list<std::string> lines(readFile(wordfreqPath_)); | ||
637 | |||
638 | hatkirby::progress ppgs( | ||
639 | "Reading word frequencies...", | ||
640 | lines.size()); | ||
641 | |||
642 | for (std::string line : lines) | ||
643 | { | ||
644 | ppgs.update(); | ||
645 | |||
646 | std::regex freqline("([a-z]+),([0-9]+)"); | ||
647 | std::smatch freqline_data; | ||
648 | if (std::regex_search(line, freqline_data, freqline)) | ||
649 | { | ||
650 | std::string text = freqline_data[1]; | ||
651 | |||
652 | if (!formByText_.count(text)) | ||
653 | { | ||
654 | continue; | ||
655 | } | ||
656 | |||
657 | std::string freqnumstr = freqline_data[2]; | ||
658 | long long freqnumnum = std::atoll(freqnumstr.c_str()); | ||
659 | formByText_[text]->setFrequency( | ||
660 | freqnumnum > std::numeric_limits<int>::max() | ||
661 | ? std::numeric_limits<int>::max() | ||
662 | : freqnumnum); | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | |||
627 | void generator::writeSchema() | 667 | void generator::writeSchema() |
628 | { | 668 | { |
629 | std::ifstream file("schema.sql"); | 669 | std::ifstream file("schema.sql"); |