From a7645346293ed6a912c26d0c50b6f7943f1f3072 Mon Sep 17 00:00:00 2001 From: Kelly Rauchenberger Date: Sat, 28 Jan 2017 12:59:42 -0500 Subject: Restructured verb frame schema to be more queryable Groups are much less significant now, and they no longer have a database table, nor are they considered top-level objects anymore. Instead of groups containing their own role data, that data is folded into the frames so that it's easier to query; as a result, each group has its own copy of the frames that it contains. Additionally, parts are considered top-level objects now, and you can query for frames based on attributes of their indexed parts. Synrestrs are also contained in their own table now, so that parts can be filtered against their synrestrs; they are, however, not considered top-level objects. Created a new type of field, the "join where" or "condition join" field, which is a normal join field that has a built-in condition on a specified field. This is used to allow creating multiple distinct join fields from one object to another. This is required for the lemma::form and frame::part joins, because filters for forms of separate inflections should not be coalesced; similarly, filters on differently indexed frame parts should not be coalesced. Queries can now be ordered, ascending or descending, by a field, in addition to randomly as before. This is necessary for accessing the parts of a verb frame in the correct order, but may be useful to an end user as well. Fixed a bug in statement generation where condition groups were not being surrounded by parentheses, which made mixing OR groups and AND groups generate inaccurate statements. This has been fixed; additionally, parentheses are not placed around the top-level condition, and nested condition groups with the same logic type are coalesced, to make query strings as easy to read as possible. Also simplified the form::lemma field; it no longer conditions on the inflection of the form like the lemma::form field does. 
Also added a debug flag to statement::getQueryString that makes it return a query string with all of the bindings filled in, for debug use only. --- generator/frame.cpp | 69 +++----------------- generator/frame.h | 20 +++--- generator/generator.cpp | 39 +++++------- generator/generator.h | 3 +- generator/group.cpp | 166 ++++++++++++++++++++++++++++-------------------- generator/group.h | 30 ++++----- generator/part.cpp | 48 ++++++++++++++ generator/part.h | 30 ++++++--- generator/role.h | 60 +++++++++++++++++ generator/schema.sql | 33 +++++++--- generator/word.h | 1 + 11 files changed, 300 insertions(+), 199 deletions(-) create mode 100644 generator/role.h (limited to 'generator') diff --git a/generator/frame.cpp b/generator/frame.cpp index f75e3ba..4e4ac5f 100644 --- a/generator/frame.cpp +++ b/generator/frame.cpp @@ -11,72 +11,21 @@ namespace verbly { { } - void frame::push_back(part fp) - { - parts_.push_back(std::move(fp)); - } - - database& operator<<(database& db, const frame& arg) + frame frame::duplicate(const frame& other) { - std::list fields; - fields.emplace_back("frame_id", arg.getId()); + frame result; - nlohmann::json jsonParts; - for (const part& p : arg) + for (const part& p : other.parts_) { - nlohmann::json jsonPart; - jsonPart["type"] = static_cast(p.getType()); - - switch (p.getType()) - { - case part::type::noun_phrase: - { - jsonPart["role"] = p.getNounRole(); - jsonPart["selrestrs"] = p.getNounSelrestrs().toJson(); - jsonPart["synrestrs"] = p.getNounSynrestrs(); - - break; - } - - case part::type::preposition: - { - jsonPart["choices"] = p.getPrepositionChoices(); - jsonPart["literal"] = p.isPrepositionLiteral(); - - break; - } - - case part::type::literal: - { - jsonPart["value"] = p.getLiteralValue(); - - break; - } - - case part::type::verb: - case part::type::adjective: - case part::type::adverb: - { - break; - } - - case part::type::invalid: - { - // Invalid parts should not be serialized. 
- assert(false); - - break; - } - } - - jsonParts.push_back(std::move(jsonPart)); + result.push_back(part::duplicate(p)); } - fields.emplace_back("data", jsonParts.dump()); - - db.insertIntoTable("frames", std::move(fields)); + return result; + } - return db; + void frame::push_back(part fp) + { + parts_.push_back(std::move(fp)); } }; diff --git a/generator/frame.h b/generator/frame.h index 764564d..ba266f0 100644 --- a/generator/frame.h +++ b/generator/frame.h @@ -19,6 +19,10 @@ namespace verbly { // Constructor frame(); + + // Duplication + + static frame duplicate(const frame& other); // Mutators @@ -30,15 +34,15 @@ namespace verbly { { return id_; } - - const_iterator begin() const + + int getLength() const { - return std::begin(parts_); + return parts_.size(); } - - const_iterator end() const + + const part& operator[](int index) const { - return std::end(parts_); + return parts_.at(index); } private: @@ -47,12 +51,10 @@ namespace verbly { const int id_; - std::list parts_; + std::vector parts_; }; - database& operator<<(database& db, const frame& arg); - }; }; diff --git a/generator/generator.cpp b/generator/generator.cpp index 610a602..4cc9f64 100644 --- a/generator/generator.cpp +++ b/generator/generator.cpp @@ -8,7 +8,7 @@ #include "../lib/enums.h" #include "progress.h" #include "../lib/selrestr.h" -#include "../lib/role.h" +#include "role.h" #include "part.h" #include "field.h" #include "../lib/util.h" @@ -640,7 +640,7 @@ namespace verbly { } { - progress ppgs("Writing verb groups...", groups_.size()); + progress ppgs("Writing verb frames...", groups_.size()); for (group& g : groups_) { @@ -649,17 +649,6 @@ namespace verbly { ppgs.update(); } } - - { - progress ppgs("Writing verb frames...", frames_.size()); - - for (frame& f : frames_) - { - db_ << f; - - ppgs.update(); - } - } } void generator::readWordNetAntonymy() @@ -1212,9 +1201,15 @@ namespace verbly { return w; } - group& generator::createGroup(xmlNodePtr top) + void 
generator::createGroup(xmlNodePtr top, const group* parent) { - groups_.emplace_back(); + if (parent != nullptr) + { + groups_.emplace_back(*parent); + } else { + groups_.emplace_back(); + } + group& grp = groups_.back(); xmlChar* key; @@ -1229,8 +1224,11 @@ namespace verbly { { try { - group& subgrp = createGroup(subclass); - subgrp.setParent(grp); + // Parsing a subgroup starts by making a copy of everything in + // the parent. This is okay to do at this point because in the + // VerbNet data, subgroups are always defined after everything + // else. + createGroup(subclass, &grp); } catch (const std::exception& e) { key = xmlGetProp(subclass, reinterpret_cast("ID")); @@ -1323,8 +1321,7 @@ namespace verbly { { if (!xmlStrcmp(frametopnode->name, reinterpret_cast("FRAME"))) { - frames_.emplace_back(); - frame& fr = frames_.back(); + frame fr; for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next) { @@ -1428,15 +1425,13 @@ namespace verbly { } } - grp.addFrame(fr); + grp.addFrame(std::move(fr)); } } } } } } - - return grp; } selrestr generator::parseSelrestr(xmlNodePtr top) diff --git a/generator/generator.h b/generator/generator.h index 8352693..bc9b3c7 100644 --- a/generator/generator.h +++ b/generator/generator.h @@ -105,7 +105,7 @@ namespace verbly { template word& createWord(Args&&... args); - group& createGroup(xmlNodePtr top); + void createGroup(xmlNodePtr top, const group* parent = nullptr); selrestr parseSelrestr(xmlNodePtr top); @@ -128,7 +128,6 @@ namespace verbly { std::list lemmas_; std::list
forms_; std::list pronunciations_; - std::list frames_; std::list groups_; // Indexes diff --git a/generator/group.cpp b/generator/group.cpp index cebe2b9..aa28d42 100644 --- a/generator/group.cpp +++ b/generator/group.cpp @@ -15,12 +15,15 @@ namespace verbly { { } - void group::setParent(const group& parent) + group::group(const group& parent) : + id_(nextId_++), + roles_(parent.roles_), + roleNames_(parent.roleNames_) { - // Adding a group to itself is nonsensical. - assert(&parent != this); - - parent_ = &parent; + for (const frame& f : parent.frames_) + { + frames_.push_back(frame::duplicate(f)); + } } void group::addRole(role r) @@ -30,87 +33,114 @@ namespace verbly { roleNames_.insert(std::move(name)); } - void group::addFrame(const frame& f) + void group::addFrame(frame f) { - frames_.insert(&f); + frames_.push_back(std::move(f)); } - std::set group::getRoles() const + bool group::hasRole(std::string name) const { - std::set fullRoles = roleNames_; - - if (hasParent()) - { - for (std::string name : getParent().getRoles()) - { - fullRoles.insert(name); - } - } - - return fullRoles; + // Rarely, a noun phrase part may use a role that is not defined in the + // group. See confess-37.10 "NP V NP ADJ". 
+ return (roles_.count(name) == 1); } const role& group::getRole(std::string name) const { - if (roles_.count(name)) - { - return roles_.at(name); - } else if (hasParent()) - { - return getParent().getRole(name); - } else { - throw std::invalid_argument("Specified role not found in verb group"); - } - } - - std::set group::getFrames() const - { - std::set fullFrames = frames_; - - if (hasParent()) - { - for (const frame* f : getParent().getFrames()) - { - fullFrames.insert(f); - } - } - - return fullFrames; + return roles_.at(name); } database& operator<<(database& db, const group& arg) { - // Serialize the group first + // Serialize each frame + for (const frame& f : arg.getFrames()) { - std::list fields; - fields.emplace_back("group_id", arg.getId()); - - nlohmann::json jsonRoles; - for (std::string name : arg.getRoles()) + // First, serialize the group/frame relationship { - const role& r = arg.getRole(name); + std::list fields; - nlohmann::json jsonRole; - jsonRole["type"] = name; - jsonRole["selrestrs"] = r.getSelrestrs().toJson(); + fields.emplace_back("frame_id", f.getId()); + fields.emplace_back("group_id", arg.getId()); + fields.emplace_back("length", f.getLength()); - jsonRoles.emplace_back(std::move(jsonRole)); + db.insertIntoTable("frames", std::move(fields)); } - fields.emplace_back("data", jsonRoles.dump()); - - db.insertIntoTable("groups", std::move(fields)); - } - - // Then, serialize the group/frame relationship - for (const frame* f : arg.getFrames()) - { - std::list fields; - - fields.emplace_back("group_id", arg.getId()); - fields.emplace_back("frame_id", f->getId()); - - db.insertIntoTable("groups_frames", std::move(fields)); + // Then, serialize the frame parts in the context of the group + for (int partIndex = 0; partIndex < f.getLength(); partIndex++) + { + const part& p = f[partIndex]; + + std::list fields; + fields.emplace_back("part_id", p.getId()); + fields.emplace_back("frame_id", f.getId()); + fields.emplace_back("part_index", 
partIndex); + fields.emplace_back("type", static_cast(p.getType())); + + switch (p.getType()) + { + case part::type::noun_phrase: + { + fields.emplace_back("role", p.getNounRole()); + + selrestr partSelrestr; + if (p.getNounSelrestrs().getType() != selrestr::type::empty) + { + partSelrestr = p.getNounSelrestrs(); + } else if (arg.hasRole(p.getNounRole())) + { + partSelrestr = arg.getRole(p.getNounRole()).getSelrestrs(); + } + + fields.emplace_back("selrestrs", partSelrestr.toJson().dump()); + + // Short interlude to serialize the synrestrs + for (const std::string& s : p.getNounSynrestrs()) + { + std::list synrestrFields; + + synrestrFields.emplace_back("part_id", p.getId()); + synrestrFields.emplace_back("synrestr", s); + + db.insertIntoTable("synrestrs", std::move(synrestrFields)); + } + + break; + } + + case part::type::preposition: + { + fields.emplace_back("prepositions", nlohmann::json(p.getPrepositionChoices()).dump()); + fields.emplace_back("preposition_literality", p.isPrepositionLiteral() ? 1 : 0); + + break; + } + + case part::type::literal: + { + fields.emplace_back("literal_value", p.getLiteralValue()); + + break; + } + + case part::type::verb: + case part::type::adjective: + case part::type::adverb: + { + break; + } + + case part::type::invalid: + { + // Invalid parts should not be serialized. 
+ assert(false); + + break; + } + } + + db.insertIntoTable("parts", std::move(fields)); + } } return db; diff --git a/generator/group.h b/generator/group.h index 83f40c2..5486fbe 100644 --- a/generator/group.h +++ b/generator/group.h @@ -5,7 +5,7 @@ #include #include #include -#include "../lib/role.h" +#include "role.h" namespace verbly { namespace generator { @@ -20,13 +20,13 @@ namespace verbly { group(); - // Mutators + explicit group(const group& parent); - void setParent(const group& parent); + // Mutators void addRole(role r); - void addFrame(const frame& f); + void addFrame(frame f); // Accessors @@ -35,24 +35,19 @@ namespace verbly { return id_; } - bool hasParent() const - { - return (parent_ != nullptr); - } - - const group& getParent() const + const std::set& getRoles() const { - // Calling code should always call hasParent first - assert(parent_ != nullptr); - - return *parent_; + return roleNames_; } - std::set getRoles() const; + bool hasRole(std::string name) const; const role& getRole(std::string name) const; - std::set getFrames() const; + const std::list& getFrames() const + { + return frames_; + } private: @@ -60,9 +55,8 @@ namespace verbly { const int id_; - const group* parent_ = nullptr; std::map roles_; - std::set frames_; + std::list frames_; // Caches diff --git a/generator/part.cpp b/generator/part.cpp index 8a75ed4..07618a8 100644 --- a/generator/part.cpp +++ b/generator/part.cpp @@ -4,6 +4,8 @@ namespace verbly { namespace generator { + int part::nextId_ = 0; + part part::createNounPhrase(std::string role, selrestr selrestrs, std::set synrestrs) { part p(type::noun_phrase); @@ -49,9 +51,52 @@ namespace verbly { return p; } + part part::duplicate(const part& other) + { + part result(other.type_); + + switch (result.type_) + { + case type::noun_phrase: + { + new(&result.noun_phrase_.role) std::string(other.noun_phrase_.role); + new(&result.noun_phrase_.selrestrs) selrestr(other.noun_phrase_.selrestrs); + new(&result.noun_phrase_.synrestrs) 
std::set(other.noun_phrase_.synrestrs); + + break; + } + + case type::preposition: + { + new(&result.preposition_.choices) std::set(other.preposition_.choices); + result.preposition_.literal = other.preposition_.literal; + + break; + } + + case type::literal: + { + new(&result.literal_) std::string(other.literal_); + + break; + } + + case type::verb: + case type::adjective: + case type::adverb: + case type::invalid: + { + break; + } + } + + return result; + } + part::part(const part& other) { type_ = other.type_; + id_ = other.id_; switch (type_) { @@ -106,6 +151,7 @@ namespace verbly { using type = part::type; type tempType = first.type_; + int tempId = first.id_; std::string tempRole; selrestr tempSelrestrs; std::set tempSynrestrs; @@ -151,6 +197,7 @@ namespace verbly { first.~part(); first.type_ = second.type_; + first.id_ = second.id_; switch (first.type_) { @@ -190,6 +237,7 @@ namespace verbly { second.~part(); second.type_ = tempType; + second.id_ = tempId; switch (second.type_) { diff --git a/generator/part.h b/generator/part.h index b010f62..39ba1e7 100644 --- a/generator/part.h +++ b/generator/part.h @@ -4,21 +4,16 @@ #include #include #include "../lib/selrestr.h" +#include "../lib/enums.h" namespace verbly { + namespace generator { class part { public: - enum class type { - invalid = -1, - noun_phrase = 0, - verb = 1, - preposition = 2, - adjective = 3, - adverb = 4, - literal = 5 - }; + + using type = part_type; // Static factories @@ -34,6 +29,10 @@ namespace verbly { static part createLiteral(std::string value); + // Duplication + + static part duplicate(const part& other); + // Copy and move constructors part(const part& other); @@ -54,6 +53,11 @@ namespace verbly { // General accessors + int getId() const + { + return id_; + } + type getType() const { return type_; @@ -79,13 +83,19 @@ namespace verbly { private: + static int nextId_; + + int id_; + // Private constructors part() { } - part(type t) : type_(t) + part(type t) : + id_(nextId_++), + 
type_(t) { } diff --git a/generator/role.h b/generator/role.h new file mode 100644 index 0000000..4884ef3 --- /dev/null +++ b/generator/role.h @@ -0,0 +1,60 @@ +#ifndef ROLE_H_249F9A9C +#define ROLE_H_249F9A9C + +#include +#include +#include "../lib/selrestr.h" + +namespace verbly { + + class role { + public: + + // Default constructor + + role() = default; + + // Constructor + + role( + std::string name, + selrestr selrestrs = {}) : + valid_(true), + name_(name), + selrestrs_(selrestrs) + { + } + + // Accessors + + const std::string& getName() const + { + if (!valid_) + { + throw std::domain_error("Bad access to invalid role"); + } + + return name_; + } + + const selrestr& getSelrestrs() const + { + if (!valid_) + { + throw std::domain_error("Bad access to invalid role"); + } + + return selrestrs_; + } + + private: + + bool valid_ = false; + std::string name_; + selrestr selrestrs_; + + }; + +}; + +#endif /* end of include guard: ROLE_H_249F9A9C */ diff --git a/generator/schema.sql b/generator/schema.sql index c3e54d8..33ebc28 100644 --- a/generator/schema.sql +++ b/generator/schema.sql @@ -186,19 +186,32 @@ CREATE TABLE `forms_pronunciations` ( CREATE INDEX `pronunciation_of` ON `forms_pronunciations`(`form_id`); CREATE INDEX `spelling_of` ON `forms_pronunciations`(`pronunciation_id`); -CREATE TABLE `groups` ( - `group_id` INTEGER PRIMARY KEY, - `data` BLOB NOT NULL +CREATE TABLE `frames` ( + `frame_id` INTEGER NOT NULL, + `group_id` INTEGER NOT NULL, + 'length' INTEGER NOT NULL ); -CREATE TABLE `frames` ( - `frame_id` INTEGER PRIMARY KEY, - `data` BLOB NOT NULL +CREATE INDEX `frames_in` ON `frames`(`group_id`); + +CREATE TABLE `parts` ( + `part_id` INTEGER PRIMARY KEY, + `frame_id` INTEGER NOT NULL, + `part_index` INTEGER NOT NULL, + `type` INTEGER NOT NULL, + `role` VARCHAR(16), + `selrestrs` BLOB, + `prepositions` BLOB, + `preposition_literality` SMALLINT, + `literal_value` VARCHAR(64) ); -CREATE TABLE `groups_frames` ( - `group_id` INTEGER NOT NULL, - 
`frame_id` INTEGER NOT NULL +CREATE INDEX `parts_of` ON `parts`(`frame_id`); +CREATE UNIQUE INDEX `part_by_frame_index` ON `parts`(`frame_id`, `part_index`); + +CREATE TABLE `synrestrs` ( + `part_id` INTEGER NOT NULL, + `synrestr` VARCHAR(32) NOT NULL ); -CREATE INDEX `frames_in` ON `groups_frames`(`group_id`); +CREATE INDEX `synrestrs_for` ON `synrestrs`(`part_id`); diff --git a/generator/word.h b/generator/word.h index a994ec3..c6d7b20 100644 --- a/generator/word.h +++ b/generator/word.h @@ -5,6 +5,7 @@ #include "../lib/enums.h" namespace verbly { + namespace generator { class notion; -- cgit 1.4.1