Fixed generator ignoring multiple inflection variants

Previously, the generator would recognize at most one form per inflection per lemma; now, the generator adds all variants in AGID to the database.
author: Kelly Rauchenberger <fefferburbia@gmail.com> 2017-01-22 18:43:25 -0500
committer: Kelly Rauchenberger <fefferburbia@gmail.com> 2017-01-22 18:43:25 -0500
commit: 2c5ed155e8951f7f28b82ed7570295d5629f4770 (patch)
tree: 889b807b6e76ec745f19488be7700cfc574aa9fa /generator/generator.cpp
parent: 8837d3ceb8358476f20d71948a1fc9bc91480d57 (diff)
download: verbly-2c5ed155e8951f7f28b82ed7570295d5629f4770.tar.gz
verbly-2c5ed155e8951f7f28b82ed7570295d5629f4770.tar.bz2
verbly-2c5ed155e8951f7f28b82ed7570295d5629f4770.zip
1 files changed, 268 insertions, 257 deletions
diff --git a/generator/generator.cpp b/generator/generator.cpp
index 3dd2ce7..610a602 100644
--- a/generator/generator.cpp
+++ b/generator/generator.cpp

@@ -15,7 +15,7 @@
 namespace verbly {
  namespace generator {
-    
    generator::generator(
      std::string verbNetPath,
      std::string agidPath,
@@ -36,21 +36,21 @@ namespace verbly {
      {
        throw std::invalid_argument("Invalid VerbNet data directory");
      }
-      
      closedir(dir);
-      
      // Ensure AGID infl.txt exists
      if (!std::ifstream(agidPath_))
      {
        throw std::invalid_argument("AGID infl.txt file not found");
      }
-      
      // Add directory separator to WordNet path
      if ((wordNetPath_.back() != '/') && (wordNetPath_.back() != '\\'))
      {
        wordNetPath_ += '/';
      }
-      
      // Ensure WordNet tables exist
      for (std::string table : {
        "s", "sk", "ant", "at", "cls", "hyp", "ins", "mm", "mp", "ms", "per", "sa", "sim", "syntax"
@@ -61,37 +61,37 @@ namespace verbly {
          throw std::invalid_argument("WordNet " + table + " table not found");
        }
      }
-      
      // Ensure CMUDICT file exists
      if (!std::ifstream(cmudictPath_))
      {
        throw std::invalid_argument("CMUDICT file not found");
      }
-      
      // Ensure ImageNet urls.txt exists
      if (!std::ifstream(imageNetPath_))
      {
        throw std::invalid_argument("ImageNet urls.txt file not found");
      }
    }
-    
    void generator::run()
    {
      // Create notions, words, lemmas, and forms from WordNet synsets
      readWordNetSynsets();
-      
      // Reads adjective positioning WordNet data
      readAdjectivePositioning();
-      
-      // Counts the number of URLs ImageNet has per notion 
+      // Counts the number of URLs ImageNet has per notion
      readImageNetUrls();
-      
      // Creates a word by WordNet sense key lookup table
      readWordNetSenseKeys();
-      
      // Creates groups and frames from VerbNet data
      readVerbNet();
-      
      // Creates forms and inflections from AGID. To reduce the amount of forms
      // created, we do this after most lemmas that need inflecting have been
      // created through other means, and then only generate forms for
@@ -101,86 +101,78 @@ namespace verbly {
      // then a notion and a word is generated and the form generation proceeds
      // as usual.
      readAgidInflections();
-      
      // Reads in prepositions and the is_a relationship
      readPrepositions();
-      
      // Creates pronunciations from CMUDICT. To reduce the amount of
      // pronunciations created, we do this after all forms have been created,
      // and then only generate pronunciations for already-exisiting forms.
      readCmudictPronunciations();
-      
      // Writes the database schema
      writeSchema();
-      
      // Dumps data to the database
      dumpObjects();
-      
      // Populates the antonymy relationship from WordNet
      readWordNetAntonymy();
-      
      // Populates the variation relationship from WordNet
      readWordNetVariation();
-      
      // Populates the usage, topicality, and regionality relationships from
      // WordNet
      readWordNetClasses();
-      
      // Populates the causality relationship from WordNet
      readWordNetCausality();
-      
      // Populates the entailment relationship from WordNet
      readWordNetEntailment();
-      
      // Populates the hypernymy relationship from WordNet
      readWordNetHypernymy();
-      
      // Populates the instantiation relationship from WordNet
      readWordNetInstantiation();
-      
      // Populates the member meronymy relationship from WordNet
      readWordNetMemberMeronymy();
-      
      // Populates the part meronymy relationship from WordNet
      readWordNetPartMeronymy();
-      
      // Populates the substance meronymy relationship from WordNet
      readWordNetSubstanceMeronymy();
-      
      // Populates the pertainymy and mannernymy relationships from WordNet
      readWordNetPertainymy();
-      
      // Populates the specification relationship from WordNet
      readWordNetSpecification();
-      
      // Populates the adjective similarity relationship from WordNet
      readWordNetSimilarity();
-      
-      
-      
-      
-      
-      
-      
-      
    }
-    
    void generator::readWordNetSynsets()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_s.pl"));
      progress ppgs("Reading synsets from WordNet...", lines.size());
-      
      for (std::string line : lines)
      {
        ppgs.update();
-        
        std::regex relation("^s\\(([1234]\\d{8}),(\\d+),'(.+)',\\w,\\d+,(\\d+)\\)\\.$");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
        {
          continue;
        }
-    
        int synset_id = std::stoi(relation_data[1]);
        int wnum = std::stoi(relation_data[2]);
        std::string text = relation_data[3];
@@ -190,7 +182,7 @@ namespace verbly {
        {
          text.erase(word_it, 1);
        }
-        
        // The WordNet data does contain duplicates, so we need to check that we
        // haven't already created this word.
        std::pair<int, int> lookup(synset_id, wnum);
@@ -204,32 +196,32 @@ namespace verbly {
        }
      }
    }
-    
    void generator::readAdjectivePositioning()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_syntax.pl"));
      progress ppgs("Reading adjective positionings from WordNet...", lines.size());
-      
      for (std::string line : lines)
      {
        ppgs.update();
-        
        std::regex relation("^syntax\\((3\\d{8}),(\\d+),([ipa])p?\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
        {
          continue;
        }
-      
        int synset_id = stoi(relation_data[1]);
        int wnum = stoi(relation_data[2]);
        std::string adjpos_str = relation_data[3];
-        
        std::pair<int, int> lookup(synset_id, wnum);
        if (wordByWnidAndWnum_.count(lookup))
        {
          word& adj = *wordByWnidAndWnum_.at(lookup);
-          
          if (adjpos_str == "p")
          {
            adj.setAdjectivePosition(positioning::predicate);
@@ -246,20 +238,20 @@ namespace verbly {
        }
      }
    }
-    
    void generator::readImageNetUrls()
    {
      // The ImageNet datafile is so large that it is unreasonable and
      // unnecessary to read it into memory; instead, we will parse each line as
      // we read it. This has the caveat that we cannot display a progress bar.
      std::cout << "Reading image counts from ImageNet..." << std::endl;
-      
      std::ifstream file(imageNetPath_);
      if (!file)
      {
        throw std::invalid_argument("Could not find file " + imageNetPath_);
      }
-  
      std::string line;
      while (std::getline(file, line))
      {
@@ -267,7 +259,7 @@ namespace verbly {
        {
          line.pop_back();
        }
-      
        std::string wnid_s = line.substr(1, 8);
        int wnid = stoi(wnid_s) + 100000000;
        if (notionByWnid_.count(wnid))
@@ -277,16 +269,16 @@ namespace verbly {
        }
      }
    }
-    
    void generator::readWordNetSenseKeys()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_sk.pl"));
      progress ppgs("Reading sense keys from WordNet...", lines.size());
-      
      for (std::string line : lines)
      {
        ppgs.update();
-        
        // We only actually need to lookup verbs by sense key so we'll just
        // ignore everything that isn't a verb.
        std::regex relation("^sk\\((2\\d{8}),(\\d+),'(.+)'\\)\\.$");
@@ -295,11 +287,11 @@ namespace verbly {
        {
          continue;
        }
-    
        int synset_id = stoi(relation_data[1]);
        int wnum = stoi(relation_data[2]);
        std::string sense_key = relation_data[3];
-        
        // We are treating this mapping as injective, which is not entirely
        // accurate. First, the WordNet table contains duplicate rows, so those
        // need to be ignored. More importantly, a small number of sense keys
@@ -317,17 +309,17 @@ namespace verbly {
        }
      }
    }
-    
    void generator::readVerbNet()
    {
      std::cout << "Reading frames from VerbNet..." << std::endl;
-  
      DIR* dir;
      if ((dir = opendir(verbNetPath_.c_str())) == nullptr)
      {
        throw std::invalid_argument("Invalid VerbNet data directory");
      }
-  
      struct dirent* ent;
      while ((ent = readdir(dir)) != nullptr)
      {
@@ -337,20 +329,20 @@ namespace verbly {
        {
          filename += '/';
        }
-    
        filename += ent->d_name;
-    
        if (filename.rfind(".xml") != filename.size() - 4)
        {
          continue;
        }
-    
        xmlDocPtr doc = xmlParseFile(filename.c_str());
        if (doc == nullptr)
        {
          throw std::logic_error("Error opening " + filename);
        }
-    
        xmlNodePtr top = xmlDocGetRootElement(doc);
        if ((top == nullptr) || (xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("VNCLASS"))))
        {
@@ -365,71 +357,81 @@ namespace verbly {
          std::throw_with_nested(std::logic_error("Error parsing VerbNet file: " + filename));
        }
      }
-  
      closedir(dir);
    }
-    
    void generator::readAgidInflections()
    {
      std::list<std::string> lines(readFile(agidPath_));
      progress ppgs("Reading inflections from AGID...", lines.size());
-      
      for (std::string line : lines)
      {
        ppgs.update();
-        
        int divider = line.find_first_of(" ");
        std::string infinitive = line.substr(0, divider);
        line = line.substr(divider+1);
        char type = line[0];
-    
        if (line[1] == '?')
        {
          line.erase(0, 4);
        } else {
          line.erase(0, 3);
        }
-        
        if (!lemmaByBaseForm_.count(infinitive) && (type != 'V'))
        {
          continue;
        }
-        
        lemma& curLemma = lookupOrCreateLemma(infinitive);
-        auto forms = split<std::vector<std::string>>(line, " | ");
+        std::vector<std::list<std::string>> agidForms;
-        for (std::string& inflForm : forms)
+        for (std::string inflForms : split<std::list<std::string>>(line, " | "))
        {
-          int sympos = inflForm.find_first_of(",?");
+          std::list<std::string> forms;
-          if (sympos != std::string::npos)
+          for (std::string inflForm : split<std::list<std::string>>(std::move(inflForms), ", "))
          {
-            inflForm = inflForm.substr(0, sympos);
+            int sympos = inflForm.find_first_of("~<!? ");
+            if (sympos != std::string::npos)
+            {
+              inflForm = inflForm.substr(0, sympos);
+            }
+            forms.push_back(std::move(inflForm));
          }
+          agidForms.push_back(std::move(forms));
        }
-    
+        std::map<inflection, std::list<std::string>> mappedForms;
        switch (type)
        {
          case 'V':
          {
-            if (forms.size() == 4)
+            if (agidForms.size() == 4)
            {
-              curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
+              mappedForms[inflection::past_tense] = agidForms[0];
-              curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[1]));
+              mappedForms[inflection::past_participle] = agidForms[1];
-              curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[2]));
+              mappedForms[inflection::ing_form] = agidForms[2];
-              curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[3]));
+              mappedForms[inflection::s_form] = agidForms[3];
-            } else if (forms.size() == 3)
+            } else if (agidForms.size() == 3)
            {
-              curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
+              mappedForms[inflection::past_tense] = agidForms[0];
-              curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[0]));
+              mappedForms[inflection::past_participle] = agidForms[0];
-              curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[1]));
+              mappedForms[inflection::ing_form] = agidForms[1];
-              curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[2]));
+              mappedForms[inflection::s_form] = agidForms[2];
-            } else if (forms.size() == 8)
+            } else if (agidForms.size() == 8)
            {
              // As of AGID 2014.08.11, this is only "to be"
-              curLemma.addInflection(inflection::past_tense, lookupOrCreateForm(forms[0]));
+              mappedForms[inflection::past_tense] = agidForms[0];
-              curLemma.addInflection(inflection::past_participle, lookupOrCreateForm(forms[2]));
+              mappedForms[inflection::past_participle] = agidForms[2];
-              curLemma.addInflection(inflection::ing_form, lookupOrCreateForm(forms[3]));
+              mappedForms[inflection::ing_form] = agidForms[3];
-              curLemma.addInflection(inflection::s_form, lookupOrCreateForm(forms[4]));
+              mappedForms[inflection::s_form] = agidForms[4];
            } else {
              // Words that don't fit the cases above as of AGID 2014.08.11:
              // - may and shall do not conjugate the way we want them to
@@ -437,7 +439,7 @@ namespace verbly {
              // - wit has five forms, and is archaic/obscure enough that we can ignore it for now
              std::cout << " Ignoring verb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
            }
-    
            // For verbs in particular, we sometimes create a notion and a word
            // from inflection data. Specifically, if there are not yet any
            // verbs existing that have the same infinitive form. "Yet" means
@@ -451,84 +453,93 @@ namespace verbly {
              notion& n = createNotion(part_of_speech::verb);
              createWord(n, curLemma);
            }
-        
            break;
          }
-      
          case 'A':
          {
-            if (forms.size() == 2)
+            if (agidForms.size() == 2)
            {
-              curLemma.addInflection(inflection::comparative, lookupOrCreateForm(forms[0]));
+              mappedForms[inflection::comparative] = agidForms[0];
-              curLemma.addInflection(inflection::superlative, lookupOrCreateForm(forms[1]));
+              mappedForms[inflection::superlative] = agidForms[1];
            } else {
              // As of AGID 2014.08.11, this is only "only", which has only the form "onliest"
              std::cout << " Ignoring adjective/adverb \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
            }
-        
            break;
          }
-      
          case 'N':
          {
-            if (forms.size() == 1)
+            if (agidForms.size() == 1)
            {
-              curLemma.addInflection(inflection::plural, lookupOrCreateForm(forms[0]));
+              mappedForms[inflection::plural] = agidForms[0];
            } else {
              // As of AGID 2014.08.11, this is non-existent.
              std::cout << " Ignoring noun \"" << infinitive << "\" due to non-standard number of forms." << std::endl;
            }
-        
            break;
          }
        }
+        // Compile the forms we have mapped.
+        for (auto mapping : std::move(mappedForms))
+        {
+          for (std::string infl : std::move(mapping.second))
+          {
+            curLemma.addInflection(mapping.first, lookupOrCreateForm(std::move(infl)));
+          }
+        }
      }
    }
-    
    void generator::readPrepositions()
    {
      std::list<std::string> lines(readFile("prepositions.txt"));
      progress ppgs("Reading prepositions...", lines.size());
-      
      for (std::string line : lines)
      {
        ppgs.update();
-        
        std::regex relation("^([^:]+): (.+)");
        std::smatch relation_data;
        std::regex_search(line, relation_data, relation);
        std::string prep = relation_data[1];
        auto groups = split<std::list<std::string>>(relation_data[2], ", ");
-        
        notion& n = createNotion(part_of_speech::preposition);
        lemma& l = lookupOrCreateLemma(prep);
        word& w = createWord(n, l);
-        
        n.setPrepositionGroups(groups);
      }
    }
-    
    void generator::readCmudictPronunciations()
    {
      std::list<std::string> lines(readFile(cmudictPath_));
      progress ppgs("Reading pronunciations from CMUDICT...", lines.size());
-      
      for (std::string line : lines)
      {
        ppgs.update();
-        
        std::regex phoneme("([A-Z][^ \\(]*)(?:\\(\\d+\\))?  ([A-Z 0-9]+)");
        std::smatch phoneme_data;
        if (std::regex_search(line, phoneme_data, phoneme))
        {
          std::string canonical(phoneme_data[1]);
          std::transform(std::begin(canonical), std::end(canonical), std::begin(canonical), ::tolower);
-          
          if (!formByText_.count(canonical))
          {
            continue;
          }
-          
          std::string phonemes = phoneme_data[2];
          pronunciations_.emplace_back(phonemes);
          pronunciation& p = pronunciations_.back();
@@ -536,7 +547,7 @@ namespace verbly {
        }
      }
    }
-    
    void generator::writeSchema()
    {
      std::ifstream file("schema.sql");
@@ -544,7 +555,7 @@ namespace verbly {
      {
        throw std::invalid_argument("Could not find database schema");
      }
-  
      std::ostringstream schemaBuilder;
      std::string line;
      while (std::getline(file, line))
@@ -553,10 +564,10 @@ namespace verbly {
        {
          line.pop_back();
        }
-      
        schemaBuilder << line;
      }
-      
      std::string schema = schemaBuilder.str();
      auto queries = split<std::list<std::string>>(schema, ";");
      progress ppgs("Writing database schema...", queries.size());
@@ -566,91 +577,91 @@ namespace verbly {
        {
          db_.runQuery(query);
        }
-        
        ppgs.update();
      }
    }
-    
    void generator::dumpObjects()
    {
      {
        progress ppgs("Writing notions...", notions_.size());
-        
        for (notion& n : notions_)
        {
          db_ << n;
-          
          ppgs.update();
        }
      }
-      
      {
        progress ppgs("Writing words...", words_.size());
-        
        for (word& w : words_)
        {
          db_ << w;
-          
          ppgs.update();
        }
      }
-      
      {
        progress ppgs("Writing lemmas...", lemmas_.size());
-        
        for (lemma& l : lemmas_)
        {
          db_ << l;
-          
          ppgs.update();
        }
      }
-      
      {
        progress ppgs("Writing forms...", forms_.size());
-        
        for (form& f : forms_)
        {
          db_ << f;
-          
          ppgs.update();
        }
      }
-      
      {
        progress ppgs("Writing pronunciations...", pronunciations_.size());
-        
        for (pronunciation& p : pronunciations_)
        {
          db_ << p;
-          
          ppgs.update();
        }
      }
-      
      {
        progress ppgs("Writing verb groups...", groups_.size());
-        
        for (group& g : groups_)
        {
          db_ << g;
-          
          ppgs.update();
        }
      }
-      
      {
        progress ppgs("Writing verb frames...", frames_.size());
-        
        for (frame& f : frames_)
        {
          db_ << f;
-          
          ppgs.update();
        }
      }
    }
-    
    void generator::readWordNetAntonymy()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_ant.pl"));
@@ -658,7 +669,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^ant\\(([134]\\d{8}),(\\d+),([134]\\d{8}),(\\d+)\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -668,21 +679,21 @@ namespace verbly {
        std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
        std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
-        
        if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
        {
          word& word1 = *wordByWnidAndWnum_.at(lookup1);
          word& word2 = *wordByWnidAndWnum_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("antonym_1_id", word1.getId());
          fields.emplace_back("antonym_2_id", word2.getId());
-          
          db_.insertIntoTable("antonymy", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetVariation()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_at.pl"));
@@ -690,7 +701,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^at\\((1\\d{8}),(3\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -700,21 +711,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("noun_id", notion1.getId());
          fields.emplace_back("adjective_id", notion2.getId());
-          
          db_.insertIntoTable("variation", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetClasses()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_cls.pl"));
@@ -722,7 +733,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^cls\\(([134]\\d{8}),(\\d+),(1\\d{8}),(\\d+),([tur])\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -733,7 +744,7 @@ namespace verbly {
        std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
        std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
        std::string class_type = relation_data[5];
-        
        std::string table_name;
        if (class_type == "t")
        {
@@ -745,10 +756,10 @@ namespace verbly {
        {
          table_name += "regionality";
        }
-        
        std::list<int> leftJoin;
        std::list<int> rightJoin;
-        
        if ((lookup1.second == 0) && (wordsByWnid_.count(lookup1.first)))
        {
          std::transform(std::begin(wordsByWnid_.at(lookup1.first)), std::end(wordsByWnid_.at(lookup1.first)), std::back_inserter(leftJoin), [] (word* w) {
@@ -757,7 +768,7 @@ namespace verbly {
        } else if (wordByWnidAndWnum_.count(lookup1)) {
          leftJoin.push_back(wordByWnidAndWnum_.at(lookup1)->getId());
        }
-        
        if ((lookup2.second == 0) && (wordsByWnid_.count(lookup2.first)))
        {
          std::transform(std::begin(wordsByWnid_.at(lookup2.first)), std::end(wordsByWnid_.at(lookup2.first)), std::back_inserter(rightJoin), [] (word* w) {
@@ -766,7 +777,7 @@ namespace verbly {
        } else if (wordByWnidAndWnum_.count(lookup2)) {
          rightJoin.push_back(wordByWnidAndWnum_.at(lookup2)->getId());
        }
-        
        for (int word1 : leftJoin)
        {
          for (int word2 : rightJoin)
@@ -774,13 +785,13 @@ namespace verbly {
            std::list<field> fields;
            fields.emplace_back("term_id", word1);
            fields.emplace_back("domain_id", word2);
-          
            db_.insertIntoTable(table_name, std::move(fields));
          }
        }
      }
    }
-    
    void generator::readWordNetCausality()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_cs.pl"));
@@ -788,7 +799,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^cs\\((2\\d{8}),(2\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -798,21 +809,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("effect_id", notion1.getId());
          fields.emplace_back("cause_id", notion2.getId());
-          
          db_.insertIntoTable("causality", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetEntailment()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_ent.pl"));
@@ -820,7 +831,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^ent\\((2\\d{8}),(2\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -830,21 +841,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("given_id", notion1.getId());
          fields.emplace_back("entailment_id", notion2.getId());
-          
          db_.insertIntoTable("entailment", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetHypernymy()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_hyp.pl"));
@@ -852,7 +863,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^hyp\\(([12]\\d{8}),([12]\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -862,21 +873,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("hyponym_id", notion1.getId());
          fields.emplace_back("hypernym_id", notion2.getId());
-          
          db_.insertIntoTable("hypernymy", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetInstantiation()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_ins.pl"));
@@ -884,7 +895,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^ins\\((1\\d{8}),(1\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -894,21 +905,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("instance_id", notion1.getId());
          fields.emplace_back("class_id", notion2.getId());
-          
          db_.insertIntoTable("instantiation", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetMemberMeronymy()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_mm.pl"));
@@ -916,7 +927,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^mm\\((1\\d{8}),(1\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -926,21 +937,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("holonym_id", notion1.getId());
          fields.emplace_back("meronym_id", notion2.getId());
-          
          db_.insertIntoTable("member_meronymy", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetPartMeronymy()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_mp.pl"));
@@ -948,7 +959,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^mp\\((1\\d{8}),(1\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -958,21 +969,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("holonym_id", notion1.getId());
          fields.emplace_back("meronym_id", notion2.getId());
-          
          db_.insertIntoTable("part_meronymy", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetSubstanceMeronymy()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_ms.pl"));
@@ -980,7 +991,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^ms\\((1\\d{8}),(1\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -990,21 +1001,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("holonym_id", notion1.getId());
          fields.emplace_back("meronym_id", notion2.getId());
-          
          db_.insertIntoTable("substance_meronymy", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetPertainymy()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_per.pl"));
@@ -1012,7 +1023,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^per\\(([34]\\d{8}),(\\d+),([13]\\d{8}),(\\d+)\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -1022,31 +1033,31 @@ namespace verbly {
        std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
        std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
-        
        if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
        {
          word& word1 = *wordByWnidAndWnum_.at(lookup1);
          word& word2 = *wordByWnidAndWnum_.at(lookup2);
-          
          if (word1.getNotion().getPartOfSpeech() == part_of_speech::adjective)
          {
            std::list<field> fields;
            fields.emplace_back("pertainym_id", word1.getId());
            fields.emplace_back("noun_id", word2.getId());
-          
            db_.insertIntoTable("pertainymy", std::move(fields));
          } else if (word1.getNotion().getPartOfSpeech() == part_of_speech::adverb)
          {
            std::list<field> fields;
            fields.emplace_back("mannernym_id", word1.getId());
            fields.emplace_back("adjective_id", word2.getId());
-          
            db_.insertIntoTable("mannernymy", std::move(fields));
          }
        }
      }
    }
-    
    void generator::readWordNetSpecification()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_sa.pl"));
@@ -1064,21 +1075,21 @@ namespace verbly {
        std::pair<int, int> lookup1(std::stoi(relation_data[1]), std::stoi(relation_data[2]));
        std::pair<int, int> lookup2(std::stoi(relation_data[3]), std::stoi(relation_data[4]));
-        
        if (wordByWnidAndWnum_.count(lookup1) && wordByWnidAndWnum_.count(lookup2))
        {
          word& word1 = *wordByWnidAndWnum_.at(lookup1);
          word& word2 = *wordByWnidAndWnum_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("general_id", word1.getId());
          fields.emplace_back("specific_id", word2.getId());
-          
          db_.insertIntoTable("specification", std::move(fields));
        }
      }
    }
-    
    void generator::readWordNetSimilarity()
    {
      std::list<std::string> lines(readFile(wordNetPath_ + "wn_sim.pl"));
@@ -1086,7 +1097,7 @@ namespace verbly {
      for (auto line : lines)
      {
        ppgs.update();
-      
        std::regex relation("^sim\\((3\\d{8}),(3\\d{8})\\)\\.");
        std::smatch relation_data;
        if (!std::regex_search(line, relation_data, relation))
@@ -1096,21 +1107,21 @@ namespace verbly {
        int lookup1 = std::stoi(relation_data[1]);
        int lookup2 = std::stoi(relation_data[2]);
-        
        if (notionByWnid_.count(lookup1) && notionByWnid_.count(lookup2))
        {
          notion& notion1 = *notionByWnid_.at(lookup1);
          notion& notion2 = *notionByWnid_.at(lookup2);
-          
          std::list<field> fields;
          fields.emplace_back("adjective_1_id", notion1.getId());
          fields.emplace_back("adjective_2_id", notion2.getId());
-          
          db_.insertIntoTable("similarity", std::move(fields));
        }
      }
    }
-    
    std::list<std::string> generator::readFile(std::string path)
    {
      std::ifstream file(path);
@@ -1118,7 +1129,7 @@ namespace verbly {
      {
        throw std::invalid_argument("Could not find file " + path);
      }
-  
      std::list<std::string> lines;
      std::string line;
      while (std::getline(file, line))
@@ -1127,13 +1138,13 @@ namespace verbly {
        {
          line.pop_back();
        }
-      
        lines.push_back(line);
      }
-      
      return lines;
    }
-    
    part_of_speech generator::partOfSpeechByWnid(int wnid)
    {
      switch (wnid / 100000000)
@@ -1145,14 +1156,14 @@ namespace verbly {
        default: throw std::domain_error("Invalid WordNet synset ID: " + std::to_string(wnid));
      }
    }
-    
    notion& generator::createNotion(part_of_speech partOfSpeech)
    {
      notions_.emplace_back(partOfSpeech);
-      
      return notions_.back();
    }
-    
    notion& generator::lookupOrCreateNotion(int wnid)
    {
      if (!notionByWnid_.count(wnid))
@@ -1160,10 +1171,10 @@ namespace verbly {
        notions_.emplace_back(partOfSpeechByWnid(wnid), wnid);
        notionByWnid_[wnid] = &notions_.back();
      }
-      
      return *notionByWnid_.at(wnid);
    }
-    
    lemma& generator::lookupOrCreateLemma(std::string base_form)
    {
      if (!lemmaByBaseForm_.count(base_form))
@@ -1171,10 +1182,10 @@ namespace verbly {
        lemmas_.emplace_back(lookupOrCreateForm(base_form));
        lemmaByBaseForm_[base_form] = &lemmas_.back();
      }
-      
      return *lemmaByBaseForm_.at(base_form);
    }
-    
    form& generator::lookupOrCreateForm(std::string text)
    {
      if (!formByText_.count(text))
@@ -1182,32 +1193,32 @@ namespace verbly {
        forms_.emplace_back(text);
        formByText_[text] = &forms_.back();
      }
-      
      return *formByText_[text];
    }
-    
    template <typename... Args> word& generator::createWord(Args&&... args)
    {
      words_.emplace_back(std::forward<Args>(args)...);
      word& w = words_.back();
-      
      wordsByBaseForm_[w.getLemma().getBaseForm().getText()].insert(&w);
-      
      if (w.getNotion().hasWnid())
      {
        wordsByWnid_[w.getNotion().getWnid()].insert(&w);
      }
-      
      return w;
    }
-    
    group& generator::createGroup(xmlNodePtr top)
    {
      groups_.emplace_back();
      group& grp = groups_.back();
-      
      xmlChar* key;
-  
      for (xmlNodePtr node = top->xmlChildrenNode; node != nullptr; node = node->next)
      {
        if (!xmlStrcmp(node->name, reinterpret_cast<const xmlChar*>("SUBCLASSES")))
@@ -1223,14 +1234,14 @@ namespace verbly {
              } catch (const std::exception& e)
              {
                key = xmlGetProp(subclass, reinterpret_cast<const xmlChar*>("ID"));
-                
                if (key == nullptr)
                {
                  std::throw_with_nested(std::logic_error("Error parsing IDless subgroup"));
                } else {
                  std::string subgroupId(reinterpret_cast<const char*>(key));
                  xmlFree(key);
-                  
                  std::throw_with_nested(std::logic_error("Error parsing subgroup " + subgroupId));
                }
              }
@@ -1245,25 +1256,25 @@ namespace verbly {
              key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("wn"));
              std::string wnSenses(reinterpret_cast<const char*>(key));
              xmlFree(key);
-              
              auto wnSenseKeys = split<std::list<std::string>>(wnSenses, " ");
              if (!wnSenseKeys.empty())
              {
                std::list<std::string> tempKeys;
-                
                std::transform(std::begin(wnSenseKeys), std::end(wnSenseKeys), std::back_inserter(tempKeys), [] (std::string sense) {
                  return sense + "::";
                });
-                
                std::list<std::string> filteredKeys;
-                
                std::remove_copy_if(std::begin(tempKeys), std::end(tempKeys), std::back_inserter(filteredKeys), [&] (std::string sense) {
                  return !wnSenseKeys_.count(sense);
                });
-                
                wnSenseKeys = std::move(filteredKeys);
              }
-              
              if (!wnSenseKeys.empty())
              {
                for (std::string sense : wnSenseKeys)
@@ -1275,11 +1286,11 @@ namespace verbly {
                key = xmlGetProp(member, reinterpret_cast<const xmlChar*>("name"));
                std::string memberName(reinterpret_cast<const char*>(key));
                xmlFree(key);
-                
                notion& n = createNotion(part_of_speech::verb);
                lemma& l = lookupOrCreateLemma(memberName);
                word& w = createWord(n, l);
-                
                w.setVerbGroup(grp);
              }
            }
@@ -1293,7 +1304,7 @@ namespace verbly {
              key = xmlGetProp(roletopnode, reinterpret_cast<const xmlChar*>("type"));
              std::string roleName = reinterpret_cast<const char*>(key);
              xmlFree(key);
-              
              selrestr roleSelrestrs;
              for (xmlNodePtr rolenode = roletopnode->xmlChildrenNode; rolenode != nullptr; rolenode = rolenode->next)
              {
@@ -1314,19 +1325,19 @@ namespace verbly {
            {
              frames_.emplace_back();
              frame& fr = frames_.back();
-          
              for (xmlNodePtr framenode = frametopnode->xmlChildrenNode; framenode != nullptr; framenode = framenode->next)
              {
                if (!xmlStrcmp(framenode->name, reinterpret_cast<const xmlChar*>("SYNTAX")))
                {
                  for (xmlNodePtr syntaxnode = framenode->xmlChildrenNode; syntaxnode != nullptr; syntaxnode = syntaxnode->next)
-                  {                
+                  {
                    if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("NP")))
                    {
                      key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
                      std::string partRole = reinterpret_cast<const char*>(key);
                      xmlFree(key);
-                  
                      selrestr partSelrestrs;
                      std::set<std::string> partSynrestrs;
@@ -1344,13 +1355,13 @@ namespace verbly {
                            }
                          }
                        }
-                  
                        if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
                        {
                          partSelrestrs = parseSelrestr(npnode);
                        }
                      }
-                      
                      fr.push_back(part::createNounPhrase(std::move(partRole), std::move(partSelrestrs), std::move(partSynrestrs)));
                    } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("VERB")))
                    {
@@ -1359,11 +1370,11 @@ namespace verbly {
                    {
                      std::set<std::string> partChoices;
                      bool partLiteral;
-                      
                      if (xmlHasProp(syntaxnode, reinterpret_cast<const xmlChar*>("value")))
                      {
                        partLiteral = true;
-                        
                        key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
                        std::string choicesStr = reinterpret_cast<const char*>(key);
                        xmlFree(key);
@@ -1380,7 +1391,7 @@ namespace verbly {
                        }
                      } else {
                        partLiteral = false;
-                        
                        for (xmlNodePtr npnode = syntaxnode->xmlChildrenNode; npnode != nullptr; npnode = npnode->next)
                        {
                          if (!xmlStrcmp(npnode->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
@@ -1397,7 +1408,7 @@ namespace verbly {
                          }
                        }
                      }
-                  
                      fr.push_back(part::createPreposition(std::move(partChoices), partLiteral));
                    } else if (!xmlStrcmp(syntaxnode->name, reinterpret_cast<const xmlChar*>("ADJ")))
                    {
@@ -1410,7 +1421,7 @@ namespace verbly {
                      key = xmlGetProp(syntaxnode, reinterpret_cast<const xmlChar*>("value"));
                      std::string literalValue = reinterpret_cast<const char*>(key);
                      xmlFree(key);
-                      
                      fr.push_back(part::createLiteral(literalValue));
                    } else {
                      continue;
@@ -1427,11 +1438,11 @@ namespace verbly {
      return grp;
    }
-    
    selrestr generator::parseSelrestr(xmlNodePtr top)
    {
      xmlChar* key;
-  
      if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTRS")))
      {
        if (xmlChildElementCount(top) == 0)
@@ -1449,10 +1460,10 @@ namespace verbly {
            {
              orlogic = true;
            }
-            
            xmlFree(key);
          }
-  
          std::list<selrestr> children;
          for (xmlNodePtr selrestr = top->xmlChildrenNode; selrestr != nullptr; selrestr = selrestr->next)
          {
@@ -1462,7 +1473,7 @@ namespace verbly {
              children.push_back(parseSelrestr(selrestr));
            }
          }
-          
          return selrestr(children, orlogic);
        }
      } else if (!xmlStrcmp(top->name, reinterpret_cast<const xmlChar*>("SELRESTR")))
@@ -1474,12 +1485,12 @@ namespace verbly {
        key = xmlGetProp(top, reinterpret_cast<const xmlChar*>("type"));
        std::string selRestriction = reinterpret_cast<const char*>(key);
        xmlFree(key);
-        
        return selrestr(selRestriction, selPos);
      } else {
        throw std::logic_error("Badly formatted selrestr");
      }
    }
-    
  };
 };
author	Kelly Rauchenberger <fefferburbia@gmail.com>	2017-01-22 18:43:25 -0500
committer	Kelly Rauchenberger <fefferburbia@gmail.com>	2017-01-22 18:43:25 -0500
commit	2c5ed155e8951f7f28b82ed7570295d5629f4770 (patch)
tree	889b807b6e76ec745f19488be7700cfc574aa9fa /generator/generator.cpp
parent	8837d3ceb8358476f20d71948a1fc9bc91480d57 (diff)
download	verbly-2c5ed155e8951f7f28b82ed7570295d5629f4770.tar.gz verbly-2c5ed155e8951f7f28b82ed7570295d5629f4770.tar.bz2 verbly-2c5ed155e8951f7f28b82ed7570295d5629f4770.zip