about summary refs log tree commit diff stats
path: root/data/maps/the_wondrous/doors.txtpb
blob: 39516046dae48929d6948832d65a04f45246e80b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
#ifndef KGRAMSTATS_H
#define KGRAMSTATS_H

#include <functional>
#include <iosfwd>
#include <list>
#include <map>
#include <set>
#include <string>
#include <tuple>
#include <vector>

#include "histogram.h"

/**
 * K-gram statistics container.
 *
 * Corpora are registered with addCorpus(), processed by compile(), and
 * sentences are then produced by randomSentence(). Only the declarations
 * live here; the behaviour of each member function is defined in the
 * implementation file.
 */
class rawr {
  public:
    /// Callback taking two strings and returning a string. Its exact role is
    /// decided by the implementation (presumably it rewrites a generated
    /// form -- confirm against the .cpp).
    using transform_callback = std::function<std::string(std::string, std::string)>;

    /// Registers a corpus text.
    void addCorpus(std::string corpus);

    /// Processes the registered corpora; maxK is presumably the largest
    /// k-gram length to collect -- confirm against the implementation.
    void compile(int maxK);

    /// Stores the transform callback.
    void setTransformCallback(transform_callback _arg);

    /// Sets the minimum-corpora setting (defaults to 1).
    void setMinCorpora(int _arg);

    /// Produces a sentence; maxL is presumably an upper bound on its
    /// length -- confirm against the implementation.
    std::string randomSentence(int maxL) const;

  private:
    /// A terminator string together with a flag recording a newline.
    struct terminator {
      std::string form;
      bool newline = false;

      terminator(std::string form, bool newline) : form(form), newline(newline) {}

      /// Orders lexicographically by (form, newline).
      bool operator<(const terminator& other) const
      {
        return std::tie(form, newline) < std::tie(other.form, other.newline);
      }
    };

    /// A word identified by its canonical spelling, with histograms of the
    /// forms and terminators recorded for it.
    struct word {
      std::string canon;
      histogram<std::string> forms;
      histogram<terminator> terms;

      word(std::string canon) : canon(canon) {}

      /// Words are ordered by canonical spelling only; the histograms do
      /// not take part in the comparison.
      bool operator<(const word& other) const
      {
        return canon < other.canon;
      }
    };

    enum class suffixtype {
      none,
      terminating,
      comma
    };

    enum class parentype {
      paren,
      square_bracket,
      asterisk,
      quote
    };

    enum class doublestatus {
      opening,
      closing,
      both
    };

    /// A delimiter kind paired with its opening/closing status.
    struct delimiter {
      parentype type;
      doublestatus status;

      delimiter(parentype type, doublestatus status) : type(type), status(status) {}

      /// Orders lexicographically by (type, status).
      bool operator<(const delimiter& other) const
      {
        return std::tie(type, status) < std::tie(other.type, other.status);
      }
    };

    /// One occurrence of a word plus its surrounding decoration.
    ///
    /// Holds a reference to the word, so the referenced word must outlive
    /// the token, and tokens are not copy-assignable.
    struct token {
      const word& w;
      std::map<delimiter, int> delimiters;
      suffixtype suffix;
      std::string raw;
      bool newline = false;

      token(const word& w) : w(w), suffix(suffixtype::none) {}

      /// Orders by (word, delimiters, suffix); `raw` and `newline` are not
      /// part of the comparison.
      bool operator<(const token& other) const
      {
        return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
      }
    };

    enum class querytype {
      literal,
      sentence
    };

    /// An element of a k-gram: either a literal token or a non-literal
    /// marker identified only by its type.
    struct query {
      querytype type;
      token tok;

      // Initializer lists follow declaration order (type, then tok), which
      // is the order the members are actually initialized in.
      query(token tok) : type(querytype::literal), tok(tok) {}

      /// Builds a query of the given type whose token refers to blank_word.
      query(querytype type) : type(type), tok(blank_word) {}

      /// Orders by type first, then by token when the types match.
      bool operator<(const query& other) const
      {
        if (type != other.type)
        {
          return type < other.type;
        }

        return tok < other.tok;
      }
    };

    static const query wildcardQuery;
    static const word blank_word;

    typedef std::list<query> kgram;

    /// Counters recorded for a token, plus the set of corpus indices
    /// associated with it.
    struct token_data
    {
      int all;
      int titlecase;
      int uppercase;
      token tok;
      std::set<int> corpora;

      // Initializer list follows declaration order; all counters start at 0.
      token_data(token tok) : all(0), titlecase(0), uppercase(0), tok(tok) {}
    };

    friend std::ostream& operator<<(std::ostream& os, kgram k);
    friend std::ostream& operator<<(std::ostream& os, query q);
    friend std::ostream& operator<<(std::ostream& os, token t);
    friend std::ostream& operator<<(std::ostream& os, terminator t);

    // Given a default like its sibling members so it never holds an
    // indeterminate value.
    int _maxK = 0;
    bool _compiled = false;
    std::vector<std::string> _corpora;
    std::map<kgram, std::map<int, token_data>> _stats;
    transform_callback _transform;
    int _min_corpora = 1;

    // Words
    std::map<std::string, word> words;
    word hashtags {"#hashtag"};
    word emoticons {"👌"};
};

#endif