diff options
| author | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-01 09:30:04 -0500 |
|---|---|---|
| committer | Kelly Rauchenberger <fefferburbia@gmail.com> | 2016-02-01 09:30:04 -0500 |
| commit | 617155fe562652c859a380d85cc5710783d79448 (patch) | |
| tree | f5eee89b0fa4b3c9dfe7187ca78916a71b59045e /kgramstats.cpp | |
| parent | b316e309559d7176af6cf0bb7dcd6dbaa83c01cd (diff) | |
| download | rawr-ebooks-617155fe562652c859a380d85cc5710783d79448.tar.gz rawr-ebooks-617155fe562652c859a380d85cc5710783d79448.tar.bz2 rawr-ebooks-617155fe562652c859a380d85cc5710783d79448.zip | |
Added emoji freevar
Strings of emojis are tokenized separately from anything else, and added to an emoticon freevar, which is mixed in with regular emoticons like :P. This breaks old-style freevars like $name$ and $noun$, so some legacy support for compatibility is left in, but eventually $name$ should be made into an actual new freevar. Emoji data is from gemoji (https://github.com/github/gemoji).
Diffstat (limited to 'kgramstats.cpp')
| -rw-r--r-- | kgramstats.cpp | 105 |
1 file changed, 102 insertions, 3 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index 0ab0c99..5b571d6 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -39,6 +39,9 @@ | |||
| 39 | #include <algorithm> | 39 | #include <algorithm> |
| 40 | #include <set> | 40 | #include <set> |
| 41 | #include <stack> | 41 | #include <stack> |
| 42 | #include "freevars.h" | ||
| 43 | #include <fstream> | ||
| 44 | #include "prefix_search.h" | ||
| 42 | 45 | ||
| 43 | query wildcardQuery {querytype::sentence}; | 46 | query wildcardQuery {querytype::sentence}; |
| 44 | word blank_word {""}; | 47 | word blank_word {""}; |
| @@ -53,14 +56,55 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 53 | size_t start = 0; | 56 | size_t start = 0; |
| 54 | int end = 0; | 57 | int end = 0; |
| 55 | std::set<std::string> thashtags; | 58 | std::set<std::string> thashtags; |
| 59 | freevar fv_emoticons {emoticons, "emoticons.txt"}; | ||
| 60 | |||
| 61 | std::cout << "Reading emojis..." << std::endl; | ||
| 62 | prefix_search emojis; | ||
| 63 | std::ifstream emoji_file("emojis.txt"); | ||
| 64 | if (emoji_file) | ||
| 65 | { | ||
| 66 | while (!emoji_file.eof()) | ||
| 67 | { | ||
| 68 | std::string rawmojis; | ||
| 69 | getline(emoji_file, rawmojis); | ||
| 70 | emojis.add(rawmojis); | ||
| 71 | } | ||
| 72 | |||
| 73 | emoji_file.close(); | ||
| 74 | } | ||
| 56 | 75 | ||
| 76 | std::cout << "Tokenizing corpus..." << std::endl; | ||
| 57 | while (end != std::string::npos) | 77 | while (end != std::string::npos) |
| 58 | { | 78 | { |
| 59 | end = corpus.find(" ", start); | 79 | end = corpus.find(" ", start); |
| 60 | 80 | ||
| 61 | std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); | 81 | bool emoji = false; |
| 62 | if (t.compare("") && t.compare(".")) | 82 | std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); |
| 83 | std::string t = ""; | ||
| 84 | |||
| 85 | if (te.compare("") && te.compare(".")) | ||
| 63 | { | 86 | { |
| 87 | // Extract strings of emojis into their own tokens even if they're not space delimited | ||
| 88 | int m = emojis.match(te); | ||
| 89 | emoji = m > 0; | ||
| 90 | if (m == 0) m = 1; | ||
| 91 | t = te.substr(0,m); | ||
| 92 | te = te.substr(m); | ||
| 93 | |||
| 94 | while (!te.empty()) | ||
| 95 | { | ||
| 96 | m = emojis.match(te); | ||
| 97 | if (emoji == (m > 0)) | ||
| 98 | { | ||
| 99 | if (m == 0) m = 1; | ||
| 100 | t += te.substr(0,m); | ||
| 101 | te = te.substr(m); | ||
| 102 | } else { | ||
| 103 | end = start + t.length() - 1; | ||
| 104 | break; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | |||
| 64 | std::string tc(t), canonical; | 108 | std::string tc(t), canonical; |
| 65 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); | 109 | std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); |
| 66 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { | 110 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { |
| @@ -72,11 +116,29 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 72 | if (canonical[0] == '#') | 116 | if (canonical[0] == '#') |
| 73 | { | 117 | { |
| 74 | thashtags.insert(canonical); | 118 | thashtags.insert(canonical); |
| 75 | canonical = "#hashtag"; | ||
| 76 | 119 | ||
| 77 | return hashtags; | 120 | return hashtags; |
| 78 | } | 121 | } |
| 79 | 122 | ||
| 123 | // Emoticon freevar | ||
| 124 | if (emoji) | ||
| 125 | { | ||
| 126 | emoticons.forms.add(canonical); | ||
| 127 | |||
| 128 | return emoticons; | ||
| 129 | } | ||
| 130 | |||
| 131 | std::string emoticon_canon; | ||
| 132 | std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) { | ||
| 133 | return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*')); | ||
| 134 | }); | ||
| 135 | if (fv_emoticons.check(emoticon_canon)) | ||
| 136 | { | ||
| 137 | emoticons.forms.add(emoticon_canon); | ||
| 138 | |||
| 139 | return emoticons; | ||
| 140 | } | ||
| 141 | |||
| 80 | // Basically any other word | 142 | // Basically any other word |
| 81 | if (words.count(canonical) == 0) | 143 | if (words.count(canonical) == 0) |
| 82 | { | 144 | { |
| @@ -171,6 +233,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 171 | } | 233 | } |
| 172 | 234 | ||
| 173 | // Time to condense the distribution stuff for the words | 235 | // Time to condense the distribution stuff for the words |
| 236 | std::cout << "Compiling token histograms..." << std::endl; | ||
| 174 | for (auto& it : words) | 237 | for (auto& it : words) |
| 175 | { | 238 | { |
| 176 | it.second.forms.compile(); | 239 | it.second.forms.compile(); |
| @@ -185,8 +248,13 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 185 | 248 | ||
| 186 | hashtags.forms.compile(); | 249 | hashtags.forms.compile(); |
| 187 | hashtags.terms.compile(); | 250 | hashtags.terms.compile(); |
| 251 | |||
| 252 | // Compile other freevars | ||
| 253 | emoticons.forms.compile(); | ||
| 254 | emoticons.terms.compile(); | ||
| 188 | 255 | ||
| 189 | // kgram distribution | 256 | // kgram distribution |
| 257 | std::cout << "Creating markov chain..." << std::endl; | ||
| 190 | std::map<kgram, std::map<token, token_data> > tstats; | 258 | std::map<kgram, std::map<token, token_data> > tstats; |
| 191 | for (int k=1; k<maxK; k++) | 259 | for (int k=1; k<maxK; k++) |
| 192 | { | 260 | { |
| @@ -246,6 +314,7 @@ kgramstats::kgramstats(std::string corpus, int maxK) | |||
| 246 | } | 314 | } |
| 247 | 315 | ||
| 248 | // Condense the kgram distribution | 316 | // Condense the kgram distribution |
| 317 | std::cout << "Compiling kgram distributions..." << std::endl; | ||
| 249 | for (auto& it : tstats) | 318 | for (auto& it : tstats) |
| 250 | { | 319 | { |
| 251 | kgram klist = it.first; | 320 | kgram klist = it.first; |
| @@ -454,6 +523,36 @@ std::string kgramstats::randomSentence(int n) | |||
| 454 | 523 | ||
| 455 | open_delimiters.pop(); | 524 | open_delimiters.pop(); |
| 456 | } | 525 | } |
| 526 | |||
| 527 | // Replace old-style freevars while I can't be bothered to remake the corpus yet | ||
| 528 | std::vector<std::string> fv_names; | ||
| 529 | std::ifstream namefile("names.txt"); | ||
| 530 | while (!namefile.eof()) | ||
| 531 | { | ||
| 532 | std::string l; | ||
| 533 | getline(namefile, l); | ||
| 534 | fv_names.push_back(l); | ||
| 535 | } | ||
| 536 | |||
| 537 | int cpos; | ||
| 538 | while ((cpos = result.find("$name$")) != std::string::npos) | ||
| 539 | { | ||
| 540 | result.replace(cpos, 6, fv_names[rand() % fv_names.size()]); | ||
| 541 | } | ||
| 542 | |||
| 543 | std::vector<std::string> fv_nouns; | ||
| 544 | std::ifstream nounfile("nouns.txt"); | ||
| 545 | while (!nounfile.eof()) | ||
| 546 | { | ||
| 547 | std::string l; | ||
| 548 | getline(nounfile, l); | ||
| 549 | fv_nouns.push_back(l); | ||
| 550 | } | ||
| 551 | |||
| 552 | while ((cpos = result.find("$noun$")) != std::string::npos) | ||
| 553 | { | ||
| 554 | result.replace(cpos, 6, fv_nouns[rand() % fv_nouns.size()]); | ||
| 555 | } | ||
| 457 | 556 | ||
| 458 | return result; | 557 | return result; |
| 459 | } | 558 | } |
