summary refs log tree commit diff stats
path: root/generator
diff options
context:
space:
mode:
author Star Rauchenberger <fefferburbia@gmail.com> 2023-02-03 09:15:28 -0500
committer Star Rauchenberger <fefferburbia@gmail.com> 2023-02-03 09:16:28 -0500
commit 64f26d4a3b80969e08a607f80dde87d49ad5c2e3 (patch)
tree 92ab69c6550d696ac6b15e1366b54222e773ada9 /generator
parent f2aacc40f4c26b3f4d71d81090f05261f4969e29 (diff)
download verbly-64f26d4a3b80969e08a607f80dde87d49ad5c2e3.tar.gz
verbly-64f26d4a3b80969e08a607f80dde87d49ad5c2e3.tar.bz2
verbly-64f26d4a3b80969e08a607f80dde87d49ad5c2e3.zip
Added word frequency information
Diffstat (limited to 'generator')
-rw-r--r-- generator/form.cpp 27
-rw-r--r-- generator/form.h 11
-rw-r--r-- generator/generator.cpp 40
-rw-r--r-- generator/generator.h 4
-rw-r--r-- generator/main.cpp 7
-rw-r--r-- generator/schema.sql 3
6 files changed, 77 insertions, 15 deletions
diff --git a/generator/form.cpp b/generator/form.cpp index a88363b..460469f 100644 --- a/generator/form.cpp +++ b/generator/form.cpp
@@ -28,17 +28,22 @@ namespace verbly {
28 { 28 {
29 // Serialize the form first. 29 // Serialize the form first.
30 { 30 {
31 db.insertIntoTable( 31 std::list<hatkirby::column> fields = {
32 "forms", 32 { "form_id", arg.getId() },
33 { 33 { "form", arg.getText() },
34 { "form_id", arg.getId() }, 34 { "complexity", arg.getComplexity() },
35 { "form", arg.getText() }, 35 { "proper", arg.isProper() },
36 { "complexity", arg.getComplexity() }, 36 { "length", arg.getLength() },
37 { "proper", arg.isProper() }, 37 { "anagram_set_id", arg.getAnagramSetId() },
38 { "length", arg.getLength() }, 38 { "reverse_form_id", arg.getReverseId() }
39 { "anagram_set_id", arg.getAnagramSetId() }, 39 };
40 { "reverse_form_id", arg.getReverseId() } 40
41 }); 41 if (arg.getFrequency() > 0)
42 {
43 fields.emplace_back("frequency", arg.getFrequency());
44 }
45
46 db.insertIntoTable("forms", std::move(fields));
42 } 47 }
43 48
44 // Then, serialize the form/pronunciation relationship. 49 // Then, serialize the form/pronunciation relationship.
diff --git a/generator/form.h b/generator/form.h index c83bbdc..1686f9b 100644 --- a/generator/form.h +++ b/generator/form.h
@@ -63,6 +63,16 @@ namespace verbly {
63 return reverse_id_; 63 return reverse_id_;
64 } 64 }
65 65
66 void setFrequency(int freq)
67 {
68 frequency_ = freq;
69 }
70
71 int getFrequency() const
72 {
73 return frequency_;
74 }
75
66 std::set<const pronunciation*> getPronunciations() const 76 std::set<const pronunciation*> getPronunciations() const
67 { 77 {
68 return pronunciations_; 78 return pronunciations_;
@@ -79,6 +89,7 @@ namespace verbly {
79 const int length_; 89 const int length_;
80 const int anagram_set_id_; 90 const int anagram_set_id_;
81 int reverse_id_ = -1; 91 int reverse_id_ = -1;
92 int frequency_ = 0;
82 93
83 std::set<const pronunciation*> pronunciations_; 94 std::set<const pronunciation*> pronunciations_;
84 95
diff --git a/generator/generator.cpp b/generator/generator.cpp index 21d0a63..897ccab 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp
@@ -1,4 +1,5 @@
1#include "generator.h" 1#include "generator.h"
2#include <cstdlib>
2#include <stdexcept> 3#include <stdexcept>
3#include <iostream> 4#include <iostream>
4#include <regex> 5#include <regex>
@@ -23,6 +24,7 @@ namespace verbly {
23 std::string wordNetPath, 24 std::string wordNetPath,
24 std::string cmudictPath, 25 std::string cmudictPath,
25 std::string imageNetPath, 26 std::string imageNetPath,
27 std::string wordfreqPath,
26 std::string outputPath, 28 std::string outputPath,
27 std::string imageNetOutput) : 29 std::string imageNetOutput) :
28 verbNetPath_(verbNetPath), 30 verbNetPath_(verbNetPath),
@@ -30,6 +32,7 @@ namespace verbly {
30 wordNetPath_(wordNetPath), 32 wordNetPath_(wordNetPath),
31 cmudictPath_(cmudictPath), 33 cmudictPath_(cmudictPath),
32 imageNetPath_(imageNetPath), 34 imageNetPath_(imageNetPath),
35 wordfreqPath_(wordfreqPath),
33 db_(outputPath, hatkirby::dbmode::create), 36 db_(outputPath, hatkirby::dbmode::create),
34 imageNetOutput_(imageNetOutput) 37 imageNetOutput_(imageNetOutput)
35 { 38 {
@@ -117,6 +120,10 @@ namespace verbly {
117 // and then only generate pronunciations for already-exisiting forms. 120 // and then only generate pronunciations for already-exisiting forms.
118 readCmudictPronunciations(); 121 readCmudictPronunciations();
119 122
123 // Reads word (really form) frequency information from a corpus, formatted
124 // as a CSV file with the form in one column and the frequency in another.
125 readWordFrequency();
126
120 // Writes the database schema 127 // Writes the database schema
121 writeSchema(); 128 writeSchema();
122 129
@@ -624,6 +631,39 @@ namespace verbly {
624 } 631 }
625 } 632 }
626 633
634 void generator::readWordFrequency()
635 {
636 std::list<std::string> lines(readFile(wordfreqPath_));
637
638 hatkirby::progress ppgs(
639 "Reading word frequencies...",
640 lines.size());
641
642 for (std::string line : lines)
643 {
644 ppgs.update();
645
646 std::regex freqline("([a-z]+),([0-9]+)");
647 std::smatch freqline_data;
648 if (std::regex_search(line, freqline_data, freqline))
649 {
650 std::string text = freqline_data[1];
651
652 if (!formByText_.count(text))
653 {
654 continue;
655 }
656
657 std::string freqnumstr = freqline_data[2];
658 long long freqnumnum = std::atoll(freqnumstr.c_str());
659 formByText_[text]->setFrequency(
660 freqnumnum > std::numeric_limits<int>::max()
661 ? std::numeric_limits<int>::max()
662 : freqnumnum);
663 }
664 }
665 }
666
627 void generator::writeSchema() 667 void generator::writeSchema()
628 { 668 {
629 std::ifstream file("schema.sql"); 669 std::ifstream file("schema.sql");
diff --git a/generator/generator.h b/generator/generator.h index 3d51c35..82dec66 100644 --- a/generator/generator.h +++ b/generator/generator.h
@@ -35,6 +35,7 @@ namespace verbly {
35 std::string wordNetPath, 35 std::string wordNetPath,
36 std::string cmudictPath, 36 std::string cmudictPath,
37 std::string imageNetPath, 37 std::string imageNetPath,
38 std::string wordfreqPath,
38 std::string outputPath, 39 std::string outputPath,
39 std::string imageNetOutput); 40 std::string imageNetOutput);
40 41
@@ -62,6 +63,8 @@ namespace verbly {
62 63
63 void readCmudictPronunciations(); 64 void readCmudictPronunciations();
64 65
66 void readWordFrequency();
67
65 void writeSchema(); 68 void writeSchema();
66 69
67 void writeVersion(); 70 void writeVersion();
@@ -125,6 +128,7 @@ namespace verbly {
125 std::string wordNetPath_; 128 std::string wordNetPath_;
126 std::string cmudictPath_; 129 std::string cmudictPath_;
127 std::string imageNetPath_; 130 std::string imageNetPath_;
131 std::string wordfreqPath_;
128 132
129 // Output 133 // Output
130 134
diff --git a/generator/main.cpp b/generator/main.cpp index 7db7203..7d6e4dc 100644 --- a/generator/main.cpp +++ b/generator/main.cpp
@@ -4,23 +4,24 @@
4 4
5void printUsage() 5void printUsage()
6{ 6{
7 std::cout << "usage: generator verbnet agid wordnet cmudict imagenet output ino" << std::endl; 7 std::cout << "usage: generator verbnet agid wordnet cmudict imagenet wordfreq output ino" << std::endl;
8 std::cout << "verbnet :: path to a VerbNet data directory" << std::endl; 8 std::cout << "verbnet :: path to a VerbNet data directory" << std::endl;
9 std::cout << "agid :: path to an AGID infl.txt file" << std::endl; 9 std::cout << "agid :: path to an AGID infl.txt file" << std::endl;
10 std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl; 10 std::cout << "wordnet :: path to a WordNet prolog data directory" << std::endl;
11 std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl; 11 std::cout << "cmudict :: path to a CMUDICT pronunciation file" << std::endl;
12 std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl; 12 std::cout << "imagenet :: path to an ImageNet urls.txt file" << std::endl;
13 std::cout << "wordfreq :: path to a word frequency CSV file" << std::endl;
13 std::cout << "output :: datafile output path" << std::endl; 14 std::cout << "output :: datafile output path" << std::endl;
14 std::cout << "ino :: imagenet directory output path" << std::endl; 15 std::cout << "ino :: imagenet directory output path" << std::endl;
15} 16}
16 17
17int main(int argc, char** argv) 18int main(int argc, char** argv)
18{ 19{
19 if (argc == 8) 20 if (argc == 9)
20 { 21 {
21 try 22 try
22 { 23 {
23 verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]); 24 verbly::generator::generator app(argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]);
24 25
25 try 26 try
26 { 27 {
diff --git a/generator/schema.sql b/generator/schema.sql index 8c910f4..34c6907 100644 --- a/generator/schema.sql +++ b/generator/schema.sql
@@ -162,7 +162,8 @@ CREATE TABLE `forms` (
162 `proper` SMALLINT NOT NULL, 162 `proper` SMALLINT NOT NULL,
163 `length` SMALLINT NOT NULL, 163 `length` SMALLINT NOT NULL,
164 `anagram_set_id` INTEGER NOT NULL, 164 `anagram_set_id` INTEGER NOT NULL,
165 `reverse_form_id` INTEGER NOT NULL 165 `reverse_form_id` INTEGER NOT NULL,
166 `frequency` INTEGER
166); 167);
167 168
168CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`); 169CREATE UNIQUE INDEX `form_by_string` ON `forms`(`form`);