summary refs log tree commit diff stats
path: root/generator/generator.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'generator/generator.cpp')
-rw-r--r--generator/generator.cpp40
1 files changed, 40 insertions, 0 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp index 21d0a63..897ccab 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -1,4 +1,5 @@
1#include "generator.h" 1#include "generator.h"
2#include <cstdlib>
2#include <stdexcept> 3#include <stdexcept>
3#include <iostream> 4#include <iostream>
4#include <regex> 5#include <regex>
@@ -23,6 +24,7 @@ namespace verbly {
23 std::string wordNetPath, 24 std::string wordNetPath,
24 std::string cmudictPath, 25 std::string cmudictPath,
25 std::string imageNetPath, 26 std::string imageNetPath,
27 std::string wordfreqPath,
26 std::string outputPath, 28 std::string outputPath,
27 std::string imageNetOutput) : 29 std::string imageNetOutput) :
28 verbNetPath_(verbNetPath), 30 verbNetPath_(verbNetPath),
@@ -30,6 +32,7 @@ namespace verbly {
30 wordNetPath_(wordNetPath), 32 wordNetPath_(wordNetPath),
31 cmudictPath_(cmudictPath), 33 cmudictPath_(cmudictPath),
32 imageNetPath_(imageNetPath), 34 imageNetPath_(imageNetPath),
35 wordfreqPath_(wordfreqPath),
33 db_(outputPath, hatkirby::dbmode::create), 36 db_(outputPath, hatkirby::dbmode::create),
34 imageNetOutput_(imageNetOutput) 37 imageNetOutput_(imageNetOutput)
35 { 38 {
@@ -117,6 +120,10 @@ namespace verbly {
117 // and then only generate pronunciations for already-existing forms. 120 // and then only generate pronunciations for already-existing forms.
118 readCmudictPronunciations(); 121 readCmudictPronunciations();
119 122
123 // Reads word (really form) frequency information from a corpus, formatted
124 // as a CSV file with the form in one column and the frequency in another.
125 readWordFrequency();
126
120 // Writes the database schema 127 // Writes the database schema
121 writeSchema(); 128 writeSchema();
122 129
@@ -624,6 +631,39 @@ namespace verbly {
624 } 631 }
625 } 632 }
626 633
634 void generator::readWordFrequency()
635 {
636 std::list<std::string> lines(readFile(wordfreqPath_));
637
638 hatkirby::progress ppgs(
639 "Reading word frequencies...",
640 lines.size());
641
642 for (std::string line : lines)
643 {
644 ppgs.update();
645
646 std::regex freqline("([a-z]+),([0-9]+)");
647 std::smatch freqline_data;
648 if (std::regex_search(line, freqline_data, freqline))
649 {
650 std::string text = freqline_data[1];
651
652 if (!formByText_.count(text))
653 {
654 continue;
655 }
656
657 std::string freqnumstr = freqline_data[2];
658 long long freqnumnum = std::atoll(freqnumstr.c_str());
659 formByText_[text]->setFrequency(
660 freqnumnum > std::numeric_limits<int>::max()
661 ? std::numeric_limits<int>::max()
662 : freqnumnum);
663 }
664 }
665 }
666
627 void generator::writeSchema() 667 void generator::writeSchema()
628 { 668 {
629 std::ifstream file("schema.sql"); 669 std::ifstream file("schema.sql");