about summary refs log tree commit diff stats
path: root/kgramstats.h
blob: ff2fc6617ada53c08380ba35dcb0a362d85e1ce1 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#include <string>
#include <map>
#include <list>
#include <vector>
#include "malaprop.h"

#ifndef KGRAMSTATS_H
#define KGRAMSTATS_H

// Discriminator stored in token::type.
// The numeric order matters: token::operator< compares the type values first,
// so literal tokens sort before hashtag tokens.
enum tokentype {
  tokentype_literal = 0, // set by token(std::string); canon holds the text
  tokentype_hashtag = 1  // presumably a hashtag placeholder -- confirm in the .cpp
};

// A token: its kind, its canonical text and a "terminating" flag.
// Provides a strict ordering so it can be used as a std::map key
// (see kgramstats::endings).
struct token {
  tokentype type;
  std::string canon;
  bool terminating;
  
  // Builds a literal token carrying the given text; terminating starts false.
  token(std::string canon)
    : type(tokentype_literal),
      canon(canon),
      terminating(false)
  {
  }
  
  // Builds a token of the given kind with empty text; terminating starts false.
  token(tokentype type)
    : type(type),
      canon(""),
      terminating(false)
  {
  }
  
  // Ordering: first by type; literal tokens are then ordered by canon.
  // Whatever is still tied (same-text literals, or any two non-literal
  // tokens of the same type) is ordered by the terminating flag, with
  // non-terminating before terminating. canon is ignored for non-literals.
  bool operator<(const token& other) const
  {
    if (type != other.type)
    {
      return type < other.type;
    }
    
    if (type == tokentype_literal && canon != other.canon)
    {
      return canon < other.canon;
    }
    
    return !terminating && other.terminating;
  }
};

// Discriminator stored in query::type.
// query::operator< orders by this value when the types differ, so literal
// queries sort before sentence queries.
enum querytype {
  querytype_literal = 0, // set by query(token); the query wraps a concrete token
  querytype_sentence = 1 // presumably marks a sentence boundary -- confirm in the .cpp
};

// One element of a kgram: either a concrete token (querytype_literal) or a
// marker identified only by its querytype.
// Provides a strict ordering so that kgram (std::list<query>) can be used as
// a std::map key.
struct query {
  querytype type;
  token word;
  
  // Wraps a token as a literal query.
  // Mem-initializers are listed in declaration order (type, then word): that
  // is the order the members are actually initialized in, and it avoids the
  // -Wreorder warning.
  query(token word) : type(querytype_literal), word(word) {}
  
  // Builds a query of the given kind; word becomes an empty literal token.
  query(querytype type) : type(type), word("") {}
  
  // Ordering: by type first; queries of the same type are ordered by word.
  bool operator<(const query& other) const
  {
    if (type != other.type)
    {
      return type < other.type;
    }
    
    return word < other.word;
  }
};

// A k-gram: a sequence of query elements. Used as the key type of
// kgramstats::stats.
typedef std::list<query> kgram;

// Pairs a terminator character with an occurrence count.
struct termstats {
  char terminator;
  int occurrences;
  
  // Default entry: '.' with a count of 1.
  termstats()
    : terminator('.'), occurrences(1)
  {
  }
  
  // Entry with an explicit terminator and count.
  termstats(char terminator, int occurrences)
    : terminator(terminator), occurrences(occurrences)
  {
  }
  
  // Ordering: by terminator character first, then by occurrence count.
  bool operator<(const termstats& other) const
  {
    if (terminator != other.terminator)
    {
      return terminator < other.terminator;
    }
    
    return occurrences < other.occurrences;
  }
};

// k-gram statistics container. Only the interface and data layout are
// declared here; the member functions are defined elsewhere.
class kgramstats
{
public:
  // Presumably builds the statistics from the text in `corpus` using k-grams
  // of up to `maxK` elements -- confirm against the definition.
  kgramstats(std::string corpus, int maxK);
  
  // Returns a sequence of strings; the exact meaning of `n` is not visible
  // from this header -- see the definition.
  std::vector<std::string> randomSentence(int n);
  
private:
  // Per-token record stored in `stats`: three integer counters plus the token.
  // Judging by the names, `all` is a total count and `titlecase`/`uppercase`
  // count capitalisation variants -- confirm in the .cpp.
  struct token_data
  {
    int all;
    int titlecase;
    int uppercase;
    token word;
    
    // All counters start at 0 and word is an empty literal token.
    // Mem-initializers follow declaration order (word is declared last), which
    // is the order the members are actually initialized in and avoids -Wreorder.
    token_data() : all(0), titlecase(0), uppercase(0), word("") {}
  };
  
  int maxK;
  
  // For each k-gram, an int-keyed table of token_data entries. The meaning of
  // the int key is not visible from this header.
  std::map<kgram, std::map<int, token_data> > stats;
  
  malaprop mstats;
  
  // For each token, an int-keyed table of termstats entries.
  std::map<token, std::map<int, termstats> > endings;
  
  std::vector<std::string> hashtags;
};

// Declared here, defined elsewhere. Judging by the name it prints the given
// k-gram (presumably for debugging) -- confirm against the definition.
// Note: k is taken by value, so the list is copied on every call.
void printKgram(kgram k);

#endif