diff options
Diffstat (limited to 'generator')
| -rw-r--r-- | generator/form.cpp | 27 | ||||
| -rw-r--r-- | generator/form.h | 11 | ||||
| -rw-r--r-- | generator/generator.cpp | 40 | ||||
| -rw-r--r-- | generator/generator.h | 4 | ||||
| -rw-r--r-- | generator/main.cpp | 7 | ||||
| -rw-r--r-- | generator/schema.sql | 3 |
6 files changed, 77 insertions, 15 deletions
| diff --git a/generator/form.cpp b/generator/form.cpp index a88363b..460469f 100644 --- a/generator/form.cpp +++ b/generator/form.cpp | |||
| @@ -28,17 +28,22 @@ namespace verbly { | |||
| 28 | { | 28 | { |
| 29 | // Serialize the form first. | 29 | // Serialize the form first. |
| 30 | { | 30 | { |
| 31 | db.insertIntoTable( | 31 | std::list<hatkirby::column> fields = { |
| 32 | "forms", | 32 | { "form_id", arg.getId() }, |
| 33 | { | 33 | { "form", arg.getText() }, |
| 34 | { "form_id", arg.getId() }, | 34 | { "complexity", arg.getComplexity() }, |
| 35 | { "form", arg.getText() }, | 35 | { "proper", arg.isProper() }, |
| 36 | { "complexity", arg.getComplexity() }, | 36 | { "length", arg.getLength() }, |
| 37 | { "proper", arg.isProper() }, | 37 | { "anagram_set_id", arg.getAnagramSetId() }, |
| 38 | { "length", arg.getLength() }, | 38 | { "reverse_form_id", arg.getReverseId() } |
| 39 | { "anagram_set_id", arg.getAnagramSetId() }, | 39 | }; |
| 40 | { "reverse_form_id", arg.getReverseId() } | 40 | |
| 41 | }); | 41 | if (arg.getFrequency() > 0) |
| 42 | { | ||
| 43 | fields.emplace_back("frequency", arg.getFrequency()); | ||
| 44 | } | ||
| 45 | |||
| 46 | db.insertIntoTable("forms", std::move(fields)); | ||
| 42 | } | 47 | } |
| 43 | 48 | ||
| 44 | // Then, serialize the form/pronunciation relationship. | 49 | // Then, serialize the form/pronunciation relationship. |
| diff --git a/generator/form.h b/generator/form.h index c83bbdc..1686f9b 100644 --- a/generator/form.h +++ b/generator/form.h | |||
| @@ -63,6 +63,16 @@ namespace verbly { | |||
| 63 | return reverse_id_; | 63 | return reverse_id_; |
| 64 | } | 64 | } |
| 65 | 65 | ||
| 66 | void setFrequency(int freq) | ||
| 67 | { | ||
| 68 | frequency_ = freq; | ||
| 69 | } | ||
| 70 | |||
| 71 | int getFrequency() const | ||
| 72 | { | ||
| 73 | return frequency_; | ||
| 74 | } | ||
| 75 | |||
| 66 | std::set<const pronunciation*> getPronunciations() const | 76 | std::set<const pronunciation*> getPronunciations() const |
| 67 | { | 77 | { |
| 68 | return pronunciations_; | 78 | return pronunciations_; |
| @@ -79,6 +89,7 @@ namespace verbly { | |||
| 79 | const int length_; | 89 | const int length_; |
| 80 | const int anagram_set_id_; | 90 | const int anagram_set_id_; |
| 81 | int reverse_id_ = -1; | 91 | int reverse_id_ = -1; |
| 92 | int frequency_ = 0; | ||
| 82 | 93 | ||
| 83 | std::set<const pronunciation*> pronunciations_; | 94 | std::set<const pronunciation*> pronunciations_; |
| 84 | 95 | ||
| diff --git a/generator/generator.cpp b/generator/generator.cpp index 21d0a63..897ccab 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
| @@ -1,4 +1,5 @@ | |||
| 1 | #include "generator.h" | 1 | #include "generator.h" |
| 2 | #include <cstdlib> | ||
| 2 | #include <stdexcept> | 3 | #include <stdexcept> |
| 3 | #include <iostream> | 4 | #include <iostream> |
| 4 | #include <regex> | 5 | #include <regex> |
| @@ -23,6 +24,7 @@ namespace verbly { | |||
| 23 | std::string wordNetPath, | 24 | std::string wordNetPath, |
| 24 | std::string cmudictPath, | 25 | std::string cmudictPath, |
| 25 | std::string imageNetPath, | 26 | std::string imageNetPath, |
| 27 | std::string wordfreqPath, | ||
| 26 | std::string outputPath, | 28 | std::string outputPath, |
| 27 | std::string imageNetOutput) : | 29 | std::string imageNetOutput) : |
| 28 | verbNetPath_(verbNetPath), | 30 | verbNetPath_(verbNetPath), |
| @@ -30,6 +32,7 @@ namespace verbly { | |||
| 30 | wordNetPath_(wordNetPath), | 32 | wordNetPath_(wordNetPath), |
| 31 | cmudictPath_(cmudictPath), | 33 | cmudictPath_(cmudictPath), |
| 32 | imageNetPath_(imageNetPath), | 34 | imageNetPath_(imageNetPath), |
| 35 | wordfreqPath_(wordfreqPath), | ||
| 33 | db_(outputPath, hatkirby::dbmode::create), | 36 | db_(outputPath, hatkirby::dbmode::create), |
| 34 | imageNetOutput_(imageNetOutput) | 37 | imageNetOutput_(imageNetOutput) |
| 35 | { | 38 | { |
| @@ -117,6 +120,10 @@ namespace verbly { | |||
| 117 | // and then only generate pronunciations for already-existing forms. | 120 | // and then only generate pronunciations for already-existing forms. |
| 118 | readCmudictPronunciations(); | 121 | readCmudictPronunciations(); |
| 119 | 122 | ||
| 123 | // Reads word (really form) frequency information from a corpus, formatted | ||
| 124 | // as a CSV file with the form in one column and the frequency in another. | ||
| 125 | readWordFrequency(); | ||
| 126 | |||
| 120 | // Writes the database schema | 127 | // Writes the database schema |
| 121 | writeSchema(); | 128 | writeSchema(); |
| 122 | 129 | ||
| @@ -624,6 +631,39 @@ namespace verbly { | |||
| 624 | } | 631 | } |
| 625 | } | 632 | } |
| 626 | 633 | ||
| 634 | void generator::readWordFrequency() | ||
| 635 | { | ||
| 636 | std::list<std::string> lines(readFile(wordfreqPath_)); | ||
| 637 | |||
| 638 | hatkirby::progress ppgs( | ||
| 639 | "Reading word frequencies...", | ||
| 640 | lines.size()); | ||
| 641 | |||
| 642 | for (std::string line : lines) | ||
| 643 | { | ||
| 644 | ppgs.update(); | ||
| 645 | |||
| 646 | std::regex freqline("([a-z]+),([0-9]+)"); | ||
| 647 | std::smatch freqline_data; | ||
| 648 | if (std::regex_search(line, freqline_data, freqline)) | ||
| 649 | { | ||
| 650 | std::string text = freqline_data[1]; | ||
| 651 | |||
| 652 | if (!formByText_.count(text)) | ||
| 653 | { | ||
| 654 | continue; | ||
| 655 | } | ||
| 656 | |||
| 657 | std::string freqnumstr = freqline_data[2]; | ||
| 658 | long long freqnumnum = std::atoll(freqnumstr.c_str()); | ||
| 659 | formByText_[text]->setFrequency( | ||
| 660 | freqnumnum > std::numeric_limits<int>::max() | ||
| 661 | ? std::numeric_limits<int>::max() | ||
| 662 | : freqnumnum); | ||
| 663 | } | ||
| 664 | } | ||
| 665 | } | ||
| 666 | |||
| 627 | void generator::writeSchema() | 667 | void generator::writeSchema() |
| 628 | { | 668 | { |
| 629 | std::ifstream file("schema.sql"); | 669 | std::ifstream file("schema.sql"); |
| diff --git a/generator/generator.h b/generator/generator.h index 3d51c35..82dec66 100644 --- a/generator/generator.h +++ b/generator/generator.h | |||
| @@ -35,6 +35,7 @@ namespace verbly { | |||
| 35 | std::string wordNetPath, | 35 | std::string wordNetPath, |
| 36 | std::string cmudictPath, | 36 | std::string cmudictPath, |
| 37 | std::string imageNetPath, | 37 | std::string imageNetPath, |
| 38 | std::string wordfreqPath, | ||
| 38 | std::string outputPath, | 39 | std::string outputPath, |
| 39 | std::string imageNetOutput); | 40 | std::string imageNetOutput); |
| 40 | 41 | ||
| @@ -62,6 +63,8 @@ namespace verbly { | |||
| 62 | 63 | ||
| 63 | void readCmudictPronunciations(); | 64 | void readCmudictPronunciations(); |
| 64 | 65 | ||
| 66 | void readWordFrequency(); | ||
| 67 | |||
| 65 | void writeSchema(); | 68 | void writeSchema(); |
| 66 | 69 | ||
| 67 | void writeVersion(); | 70 | void writeVersion(); |
| @@ -125,6 +128,7 @@ namespace verbly { | |||
| 125 | std::string wordNetPath_; | 128 | std::string wordNetPath_; |
| 126 | std::string cmudictPath_; | 129 | std::string cmudictPath_; |
| 127 | std::string imageNetPath_; | 130 | std::string imageNetPath_; |
| 131 | std::string wordfreqPath_; | ||
| 128 | 132 | ||
| 129 | // Output | 133 | // Output |
| 130 | 134 | ||
| diff --git a/generator/main.cpp b/generator/main.cpp index 7db7203..7d6e4dc 100644 --- a/generator/main.cpp +++ b/generator/main.cpp | |||
| @@ -4,23 +4,24 @@ | |||
| 4 | 4 | ||
| 5 | void printUsage() | 5 | void printUsage() |
| 6 | { | 6 | { |
| 7 | std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output ino" << std::endl; | 7 | std::cout << "usage: generator verbnet agid wordnet cmudict imagenet wordfreq output ino" << std::endl; |
| 8 | std::cout << "verbnet :: path to a VerbNet data directory" << std::endl; | 8 | std::cout << "verbnet :: path to a VerbNet data directory" << std::endl; |
| 9 | std::cout << "agid :: path to an AGID infl.txt file" << std::endl; | 9 | std::cout << "agid :: path to an AGID infl.txt file" << std::endl; |
| 10 | std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; | 10 | std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; |
| 11 | std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; | 11 | std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; |
| 12 | std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl; | 12 | std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl; |
| 13 | std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl; | ||
| 13 | std::cout << "output :: datafile output path" << std::endl; | 14 | std::cout << "output :: datafile output path" << std::endl; |
| 14 | std::cout << "ino :: imagenet directory output path" << std::endl; | 15 | std::cout << "ino :: imagenet directory output path" << std::endl; |
| 15 | } | 16 | } |
| 16 | 17 | ||
| 17 | int main(int argc, char** argv) | 18 | int main(int argc, char** argv) |
| 18 | { | 19 | { |
| 19 | if (argc == 8) | 20 | if (argc == 9) |
| 20 | { | 21 | { |
| 21 | try | 22 | try |
| 22 | { | 23 | { |
| 23 | verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]); | 24 | verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]); |
| 24 | 25 | ||
| 25 | try | 26 | try |
| 26 | { | 27 | { |
| diff --git a/generator/schema.sql b/generator/schema.sql index 8c910f4..34c6907 100644 --- a/generator/schema.sql +++ b/generator/schema.sql | |||
| @@ -162,7 +162,8 @@ CREATE TABLE `forms` ( | |||
| 162 | `proper` SMALLINT NOT NULL, | 162 | `proper` SMALLINT NOT NULL, |
| 163 | `length` SMALLINT NOT NULL, | 163 | `length` SMALLINT NOT NULL, |
| 164 | `anagram_set_id` INTEGER NOT NULL, | 164 | `anagram_set_id` INTEGER NOT NULL, |
| 165 | `reverse_form_id` INTEGER NOT NULL | 165 | `reverse_form_id` INTEGER NOT NULL, |
| 166 | `frequency` INTEGER | ||
| 166 | ); | 167 | ); |
| 167 | 168 | ||
| 168 | CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); | 169 | CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); |
