diff options
author | Star Rauchenberger <fefferburbia@gmail.com> | 2023-02-03 09:15:28 -0500 |
---|---|---|
committer | Star Rauchenberger <fefferburbia@gmail.com> | 2023-02-03 09:16:28 -0500 |
commit | 64f26d4a3b80969e08a607f80dde87d49ad5c2e3 (patch) | |
tree | 92ab69c6550d696ac6b15e1366b54222e773ada9 /generator | |
parent | f2aacc40f4c26b3f4d71d81090f05261f4969e29 (diff) | |
download | verbly-64f26d4a3b80969e08a607f80dde87d49ad5c2e3.tar.gz verbly-64f26d4a3b80969e08a607f80dde87d49ad5c2e3.tar.bz2 verbly-64f26d4a3b80969e08a607f80dde87d49ad5c2e3.zip |
Added word frequency information
Diffstat (limited to 'generator')
-rw-r--r-- | generator/form.cpp | 27 | ||||
-rw-r--r-- | generator/form.h | 11 | ||||
-rw-r--r-- | generator/generator.cpp | 40 | ||||
-rw-r--r-- | generator/generator.h | 4 | ||||
-rw-r--r-- | generator/main.cpp | 7 | ||||
-rw-r--r-- | generator/schema.sql | 3 |
6 files changed, 77 insertions, 15 deletions
diff --git a/generator/form.cpp b/generator/form.cpp index a88363b..460469f 100644 --- a/generator/form.cpp +++ b/generator/form.cpp | |||
@@ -28,17 +28,22 @@ namespace verbly { | |||
28 | { | 28 | { |
29 | // Serialize the form first. | 29 | // Serialize the form first. |
30 | { | 30 | { |
31 | db.insertIntoTable( | 31 | std::list<hatkirby::column> fields = { |
32 | "forms", | 32 | { "form_id", arg.getId() }, |
33 | { | 33 | { "form", arg.getText() }, |
34 | { "form_id", arg.getId() }, | 34 | { "complexity", arg.getComplexity() }, |
35 | { "form", arg.getText() }, | 35 | { "proper", arg.isProper() }, |
36 | { "complexity", arg.getComplexity() }, | 36 | { "length", arg.getLength() }, |
37 | { "proper", arg.isProper() }, | 37 | { "anagram_set_id", arg.getAnagramSetId() }, |
38 | { "length", arg.getLength() }, | 38 | { "reverse_form_id", arg.getReverseId() } |
39 | { "anagram_set_id", arg.getAnagramSetId() }, | 39 | }; |
40 | { "reverse_form_id", arg.getReverseId() } | 40 | |
41 | }); | 41 | if (arg.getFrequency() > 0) |
42 | { | ||
43 | fields.emplace_back("frequency", arg.getFrequency()); | ||
44 | } | ||
45 | |||
46 | db.insertIntoTable("forms", std::move(fields)); | ||
42 | } | 47 | } |
43 | 48 | ||
44 | // Then, serialize the form/pronunciation relationship. | 49 | // Then, serialize the form/pronunciation relationship. |
diff --git a/generator/form.h b/generator/form.h index c83bbdc..1686f9b 100644 --- a/generator/form.h +++ b/generator/form.h | |||
@@ -63,6 +63,16 @@ namespace verbly { | |||
63 | return reverse_id_; | 63 | return reverse_id_; |
64 | } | 64 | } |
65 | 65 | ||
66 | void setFrequency(int freq) | ||
67 | { | ||
68 | frequency_ = freq; | ||
69 | } | ||
70 | |||
71 | int getFrequency() const | ||
72 | { | ||
73 | return frequency_; | ||
74 | } | ||
75 | |||
66 | std::set<const pronunciation*> getPronunciations() const | 76 | std::set<const pronunciation*> getPronunciations() const |
67 | { | 77 | { |
68 | return pronunciations_; | 78 | return pronunciations_; |
@@ -79,6 +89,7 @@ namespace verbly { | |||
79 | const int length_; | 89 | const int length_; |
80 | const int anagram_set_id_; | 90 | const int anagram_set_id_; |
81 | int reverse_id_ = -1; | 91 | int reverse_id_ = -1; |
92 | int frequency_ = 0; | ||
82 | 93 | ||
83 | std::set<const pronunciation*> pronunciations_; | 94 | std::set<const pronunciation*> pronunciations_; |
84 | 95 | ||
diff --git a/generator/generator.cpp b/generator/generator.cpp index 21d0a63..897ccab 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp | |||
@@ -1,4 +1,5 @@ | |||
1 | #include "generator.h" | 1 | #include "generator.h" |
2 | #include <cstdlib> | ||
2 | #include <stdexcept> | 3 | #include <stdexcept> |
3 | #include <iostream> | 4 | #include <iostream> |
4 | #include <regex> | 5 | #include <regex> |
@@ -23,6 +24,7 @@ namespace verbly { | |||
23 | std::string wordNetPath, | 24 | std::string wordNetPath, |
24 | std::string cmudictPath, | 25 | std::string cmudictPath, |
25 | std::string imageNetPath, | 26 | std::string imageNetPath, |
27 | std::string wordfreqPath, | ||
26 | std::string outputPath, | 28 | std::string outputPath, |
27 | std::string imageNetOutput) : | 29 | std::string imageNetOutput) : |
28 | verbNetPath_(verbNetPath), | 30 | verbNetPath_(verbNetPath), |
@@ -30,6 +32,7 @@ namespace verbly { | |||
30 | wordNetPath_(wordNetPath), | 32 | wordNetPath_(wordNetPath), |
31 | cmudictPath_(cmudictPath), | 33 | cmudictPath_(cmudictPath), |
32 | imageNetPath_(imageNetPath), | 34 | imageNetPath_(imageNetPath), |
35 | wordfreqPath_(wordfreqPath), | ||
33 | db_(outputPath, hatkirby::dbmode::create), | 36 | db_(outputPath, hatkirby::dbmode::create), |
34 | imageNetOutput_(imageNetOutput) | 37 | imageNetOutput_(imageNetOutput) |
35 | { | 38 | { |
@@ -117,6 +120,10 @@ namespace verbly { | |||
117 | // and then only generate pronunciations for already-existing forms. | 120 | // and then only generate pronunciations for already-existing forms. |
118 | readCmudictPronunciations(); | 121 | readCmudictPronunciations(); |
119 | 122 | ||
123 | // Reads word (really form) frequency information from a corpus, formatted | ||
124 | // as a CSV file with the form in one column and the frequency in another. | ||
125 | readWordFrequency(); | ||
126 | |||
120 | // Writes the database schema | 127 | // Writes the database schema |
121 | writeSchema(); | 128 | writeSchema(); |
122 | 129 | ||
@@ -624,6 +631,39 @@ namespace verbly { | |||
624 | } | 631 | } |
625 | } | 632 | } |
626 | 633 | ||
634 | void generator::readWordFrequency() | ||
635 | { | ||
636 | std::list<std::string> lines(readFile(wordfreqPath_)); | ||
637 | |||
638 | hatkirby::progress ppgs( | ||
639 | "Reading word frequencies...", | ||
640 | lines.size()); | ||
641 | |||
642 | for (std::string line : lines) | ||
643 | { | ||
644 | ppgs.update(); | ||
645 | |||
646 | std::regex freqline("([a-z]+),([0-9]+)"); | ||
647 | std::smatch freqline_data; | ||
648 | if (std::regex_search(line, freqline_data, freqline)) | ||
649 | { | ||
650 | std::string text = freqline_data[1]; | ||
651 | |||
652 | if (!formByText_.count(text)) | ||
653 | { | ||
654 | continue; | ||
655 | } | ||
656 | |||
657 | std::string freqnumstr = freqline_data[2]; | ||
658 | long long freqnumnum = std::atoll(freqnumstr.c_str()); | ||
659 | formByText_[text]->setFrequency( | ||
660 | freqnumnum > std::numeric_limits<int>::max() | ||
661 | ? std::numeric_limits<int>::max() | ||
662 | : freqnumnum); | ||
663 | } | ||
664 | } | ||
665 | } | ||
666 | |||
627 | void generator::writeSchema() | 667 | void generator::writeSchema() |
628 | { | 668 | { |
629 | std::ifstream file("schema.sql"); | 669 | std::ifstream file("schema.sql"); |
diff --git a/generator/generator.h b/generator/generator.h index 3d51c35..82dec66 100644 --- a/generator/generator.h +++ b/generator/generator.h | |||
@@ -35,6 +35,7 @@ namespace verbly { | |||
35 | std::string wordNetPath, | 35 | std::string wordNetPath, |
36 | std::string cmudictPath, | 36 | std::string cmudictPath, |
37 | std::string imageNetPath, | 37 | std::string imageNetPath, |
38 | std::string wordfreqPath, | ||
38 | std::string outputPath, | 39 | std::string outputPath, |
39 | std::string imageNetOutput); | 40 | std::string imageNetOutput); |
40 | 41 | ||
@@ -62,6 +63,8 @@ namespace verbly { | |||
62 | 63 | ||
63 | void readCmudictPronunciations(); | 64 | void readCmudictPronunciations(); |
64 | 65 | ||
66 | void readWordFrequency(); | ||
67 | |||
65 | void writeSchema(); | 68 | void writeSchema(); |
66 | 69 | ||
67 | void writeVersion(); | 70 | void writeVersion(); |
@@ -125,6 +128,7 @@ namespace verbly { | |||
125 | std::string wordNetPath_; | 128 | std::string wordNetPath_; |
126 | std::string cmudictPath_; | 129 | std::string cmudictPath_; |
127 | std::string imageNetPath_; | 130 | std::string imageNetPath_; |
131 | std::string wordfreqPath_; | ||
128 | 132 | ||
129 | // Output | 133 | // Output |
130 | 134 | ||
diff --git a/generator/main.cpp b/generator/main.cpp index 7db7203..7d6e4dc 100644 --- a/generator/main.cpp +++ b/generator/main.cpp | |||
@@ -4,23 +4,24 @@ | |||
4 | 4 | ||
// Writes the command-line synopsis, followed by a one-line description of
// each positional argument, to standard output.
void printUsage()
{
  static const char* const usageLines[] = {
    "usage: generator verbnet agid wordnet cmudict imagenet wordfreq output ino",
    "verbnet :: path to a VerbNet data directory",
    "agid :: path to an AGID infl.txt file",
    "wordnet :: path to a WordNet prolog data directory",
    "cmudict :: path to a CMUDICT pronunciation file",
    "imagenet :: path to an ImageNet urls.txt file",
    "wordfreq :: path to a word frequency CSV file",
    "output :: datafile output path",
    "ino :: imagenet directory output path"
  };

  // Each line is terminated with std::endl, so the stream is flushed after
  // every line exactly as before.
  for (const char* usageLine : usageLines)
  {
    std::cout << usageLine << std::endl;
  }
}
16 | 17 | ||
17 | int main(int argc, char** argv) | 18 | int main(int argc, char** argv) |
18 | { | 19 | { |
19 | if (argc == 8) | 20 | if (argc == 9) |
20 | { | 21 | { |
21 | try | 22 | try |
22 | { | 23 | { |
23 | verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]); | 24 | verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]); |
24 | 25 | ||
25 | try | 26 | try |
26 | { | 27 | { |
diff --git a/generator/schema.sql b/generator/schema.sql index 8c910f4..34c6907 100644 --- a/generator/schema.sql +++ b/generator/schema.sql | |||
@@ -162,7 +162,8 @@ CREATE TABLE `forms` ( | |||
162 | `proper` SMALLINT NOT NULL, | 162 | `proper` SMALLINT NOT NULL, |
163 | `length` SMALLINT NOT NULL, | 163 | `length` SMALLINT NOT NULL, |
164 | `anagram_set_id` INTEGER NOT NULL, | 164 | `anagram_set_id` INTEGER NOT NULL, |
165 | `reverse_form_id` INTEGER NOT NULL | 165 | `reverse_form_id` INTEGER NOT NULL, |
166 | `frequency` INTEGER | ||
166 | ); | 167 | ); |
167 | 168 | ||
168 | CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); | 169 | CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); |