about summary refs log tree commit diff stats
path: root/kgramstats.h
blob: ee75ada27970ee4255875d320e5dbb128d56f937 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#ifndef KGRAMSTATS_H
#define KGRAMSTATS_H

#include <functional>
#include <iosfwd>
#include <list>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "histogram.h"

/**
 * K-gram statistics holder.
 *
 * This header only declares the public operations (addCorpus, compile,
 * setTransformCallback, randomSentence); their definitions live outside
 * this file. The nested private types below describe the keys and values
 * stored in the statistics table `_stats`.
 */
class rawr {
  public:
    /// Callable that receives two strings and returns a string.
    /// NOTE(review): the meaning of the two arguments is decided by the
    /// out-of-line implementation — confirm there before relying on it.
    using transform_callback = std::function<std::string(std::string, std::string)>;

    void addCorpus(std::string corpus);
    void compile(int maxK);

    void setTransformCallback(transform_callback _arg);
    std::string randomSentence(int maxL);

  private:
    /// A word identified by its canonical spelling; ordering is by `canon` only.
    struct word {
      std::string canon;
      histogram<std::string> forms;
      histogram<std::string> terms;

      // `canon` is a by-value sink parameter, so move it into the member
      // instead of copying it a second time.
      word(std::string canon) : canon(std::move(canon)) {}

      bool operator<(const word& other) const
      {
        return canon < other.canon;
      }
    };

    enum class suffixtype {
      none,
      terminating,
      comma
    };

    enum class parentype {
      paren,
      square_bracket,
      asterisk,
      quote
    };

    enum class doublestatus {
      opening,
      closing,
      both
    };

    /// A (type, status) pair, ordered lexicographically so it can be a map key.
    struct delimiter {
      parentype type;
      doublestatus status;

      delimiter(parentype type, doublestatus status) : type(type), status(status) {}

      bool operator<(const delimiter& other) const
      {
        return std::tie(type, status) < std::tie(other.type, other.status);
      }
    };

    /// A reference to a word plus its delimiter counts, suffix and raw text.
    struct token {
      // Non-owning reference: the referenced word must outlive this token.
      const word& w;
      std::map<delimiter, int> delimiters;
      suffixtype suffix;
      std::string raw;

      token(const word& w) : w(w), suffix(suffixtype::none) {}

      // `raw` does not take part in the ordering.
      bool operator<(const token& other) const
      {
        return std::tie(w, delimiters, suffix) < std::tie(other.w, other.delimiters, other.suffix);
      }
    };

    enum class querytype {
      literal,
      sentence
    };

    /// One element of a kgram: a query type together with a token.
    struct query {
      querytype type;
      token tok;

      // Initializers are written in declaration order (type, then tok),
      // which is the order the members are actually initialized in.
      query(token tok) : type(querytype::literal), tok(std::move(tok)) {}

      // Non-literal queries carry a placeholder token bound to `blank_word`.
      query(querytype type) : type(type), tok(blank_word) {}

      // Order by type first; tokens are compared only when the types match.
      bool operator<(const query& other) const
      {
        if (type != other.type)
        {
          return type < other.type;
        }

        return tok < other.tok;
      }
    };

    static const query wildcardQuery;
    static const word blank_word;

    typedef std::list<query> kgram;

    /// Counters attached to a token; every counter starts at zero.
    struct token_data
    {
      int all;
      int titlecase;
      int uppercase;
      token tok;

      // Initializers are written in declaration order; `tok` is declared last.
      token_data(token tok) : all(0), titlecase(0), uppercase(0), tok(std::move(tok)) {}
    };

    friend std::ostream& operator<<(std::ostream& os, kgram k);
    friend std::ostream& operator<<(std::ostream& os, query q);
    friend std::ostream& operator<<(std::ostream& os, token t);

    // Zero-initialized so the member never holds an indeterminate value.
    int _maxK = 0;
    bool _compiled = false;
    std::vector<std::string> _corpora;
    std::map<kgram, std::map<int, token_data>> _stats;
    transform_callback _transform;

    // Words
    std::map<std::string, word> words;
    word hashtags {"#hashtag"};
    word emoticons {"👌"};
};

#endif