about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r--CMakeLists.txt4
-rw-r--r--ebooks.cpp8
-rw-r--r--emojis.txt845
-rw-r--r--emoticons.txt6
-rw-r--r--freevars.cpp58
-rw-r--r--freevars.h21
-rw-r--r--gen.cpp8
-rw-r--r--histogram.cpp10
-rw-r--r--histogram.h1
-rw-r--r--kgramstats.cpp105
-rw-r--r--kgramstats.h5
-rw-r--r--prefix_search.cpp35
-rw-r--r--prefix_search.h23
13 files changed, 1064 insertions, 65 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 41c4552..a76cc2a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ find_package(curl)
8if (YamlCpp_FOUND AND CURL_FOUND) 8if (YamlCpp_FOUND AND CURL_FOUND)
9 add_subdirectory(vendor/twitcurl/libtwitcurl) 9 add_subdirectory(vendor/twitcurl/libtwitcurl)
10 include_directories(vendor/twitcurl/libtwitcurl) 10 include_directories(vendor/twitcurl/libtwitcurl)
11 add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp) 11 add_executable(rawr-ebooks ebooks.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp)
12 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11) 12 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD 11)
13 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON) 13 set_property(TARGET rawr-ebooks PROPERTY CXX_STANDARD_REQUIRED ON)
14 target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES}) 14 target_link_libraries(rawr-ebooks ${YamlCpp_LIBRARIES} twitcurl ${CURL_LIBRARIES})
@@ -16,6 +16,6 @@ else (YamlCpp_FOUND AND CURL_FOUND)
16 message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen") 16 message(STATUS "rawr-ebooks requires yaml-cpp and twitcurl; without these, we will only make rawr-gen")
17endif (YamlCpp_FOUND AND CURL_FOUND) 17endif (YamlCpp_FOUND AND CURL_FOUND)
18 18
19add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp) 19add_executable(rawr-gen gen.cpp kgramstats.cpp freevars.cpp histogram.cpp prefix_search.cpp)
20set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11) 20set_property(TARGET rawr-gen PROPERTY CXX_STANDARD 11)
21set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON) 21set_property(TARGET rawr-gen PROPERTY CXX_STANDARD_REQUIRED ON)
diff --git a/ebooks.cpp b/ebooks.cpp index ed1e080..fdbeeab 100644 --- a/ebooks.cpp +++ b/ebooks.cpp
@@ -10,7 +10,6 @@
10#include <twitcurl.h> 10#include <twitcurl.h>
11#include <unistd.h> 11#include <unistd.h>
12#include <yaml-cpp/yaml.h> 12#include <yaml-cpp/yaml.h>
13#include "freevars.h"
14 13
15int main(int argc, char** args) 14int main(int argc, char** args)
16{ 15{
@@ -29,11 +28,6 @@ int main(int argc, char** args)
29 28
30 std::cout << "Preprocessing corpus..." << std::endl; 29 std::cout << "Preprocessing corpus..." << std::endl;
31 kgramstats* stats = new kgramstats(corpus, 4); 30 kgramstats* stats = new kgramstats(corpus, 4);
32
33 std::cout << "Preprocessing freevars..." << std::endl;
34 freevars* vars = new freevars();
35 vars->addVar("name", "names.txt");
36 vars->addVar("noun", "nouns.txt");
37 31
38 twitCurl twitter; 32 twitCurl twitter;
39 twitter.getOAuth().setConsumerKey(config["consumer_key"].as<std::string>()); 33 twitter.getOAuth().setConsumerKey(config["consumer_key"].as<std::string>());
@@ -45,7 +39,7 @@ int main(int argc, char** args)
45 for (;;) 39 for (;;)
46 { 40 {
47 std::string doc = stats->randomSentence(rand() % 45 + 5); 41 std::string doc = stats->randomSentence(rand() % 45 + 5);
48 std::string hi = vars->parse(doc); 42 std::string hi = doc;
49 hi.resize(140); 43 hi.resize(140);
50 44
51 std::string replyMsg; 45 std::string replyMsg;
diff --git a/emojis.txt b/emojis.txt new file mode 100644 index 0000000..fc120e0 --- /dev/null +++ b/emojis.txt
@@ -0,0 +1,845 @@
1😄
2😃
3😀
4😊
5☺️
6😉
7😍
8😘
9😚
10😗
11😙
12😜
13😝
14😛
15😳
16😁
17😔
18😌
19😒
20😞
21😣
22😢
23😂
24😭
25😪
26😥
27😰
28😅
29😓
30😩
31😫
32😨
33😱
34😠
35😡
36😤
37😖
38😆
39😋
40😷
41😎
42😴
43😵
44😲
45😟
46😦
47😧
48😈
49👿
50😮
51😬
52😐
53😕
54😯
55😶
56😇
57😏
58😑
59👲
60👳
61👮
62👷
63💂
64👶
65👦
66👧
67👨
68👩
69👴
70👵
71👱
72👼
73👸
74😺
75😸
76😻
77😽
78😼
79🙀
80😿
81😹
82😾
83👹
84👺
85🙈
86🙉
87🙊
88💀
89👽
90💩
91🔥
92
93🌟
94💫
95💥
96💢
97💦
98💧
99💤
100💨
101👂
102👀
103👃
104👅
105👄
106👍
107👎
108👌
109👊
110
111✌️
112👋
113
114👐
115👆
116👇
117👉
118👈
119🙌
120🙏
121☝️
122👏
123💪
124🚶
125🏃
126💃
127👫
128👪
129👬
130👭
131💏
132💑
133👯
134🙆
135🙅
136💁
137🙋
138💆
139💇
140💅
141👰
142🙎
143🙍
144🙇
145🎩
146👑
147👒
148👟
149👞
150👡
151👠
152👢
153👕
154👔
155👚
156👗
157🎽
158👖
159👘
160👙
161💼
162👜
163👝
164👛
165👓
166🎀
167🌂
168💄
169💛
170💙
171💜
172💚
173❤️
174💔
175💗
176💓
177💕
178💖
179💞
180💘
181💌
182💋
183💍
184💎
185👤
186👥
187💬
188👣
189💭
190🐶
191🐺
192🐱
193🐭
194🐹
195🐰
196🐸
197🐯
198🐨
199🐻
200🐷
201🐽
202🐮
203🐗
204🐵
205🐒
206🐴
207🐑
208🐘
209🐼
210🐧
211🐦
212🐤
213🐥
214🐣
215🐔
216🐍
217🐢
218🐛
219🐝
220🐜
221🐞
222🐌
223🐙
224🐚
225🐠
226🐟
227🐬
228🐳
229🐋
230🐄
231🐏
232🐀
233🐃
234🐅
235🐇
236🐉
237🐎
238🐐
239🐓
240🐕
241🐖
242🐁
243🐂
244🐲
245🐡
246🐊
247🐫
248🐪
249🐆
250🐈
251🐩
252🐾
253💐
254🌸
255🌷
256🍀
257🌹
258🌻
259🌺
260🍁
261🍃
262🍂
263🌿
264🌾
265🍄
266🌵
267🌴
268🌲
269🌳
270🌰
271🌱
272🌼
273🌐
274🌞
275🌝
276🌚
277🌑
278🌒
279🌓
280🌔
281🌕
282🌖
283🌗
284🌘
285🌜
286🌛
287🌙
288🌍
289🌎
290🌏
291🌋
292🌌
293🌠
294
295☀️
296
297☁️
298
299
300❄️
301
302🌀
303🌁
304🌈
305🌊
306🎍
307💝
308🎎
309🎒
310🎓
311🎏
312🎆
313🎇
314🎐
315🎑
316🎃
317👻
318🎅
319🎄
320🎁
321🎋
322🎉
323🎊
324🎈
325🎌
326🔮
327🎥
328📷
329📹
330📼
331💿
332📀
333💽
334💾
335💻
336📱
337☎️
338📞
339📟
340📠
341📡
342📺
343📻
344🔊
345🔉
346🔈
347🔇
348🔔
349🔕
350📢
351📣
352
353
354
355
356🔓
357🔒
358🔏
359🔐
360🔑
361🔎
362💡
363🔦
364🔆
365🔅
366🔌
367🔋
368🔍
369🛁
370🛀
371🚿
372🚽
373🔧
374🔩
375🔨
376🚪
377🚬
378💣
379🔫
380🔪
381💊
382💉
383💰
384💴
385💵
386💷
387💶
388💳
389💸
390📲
391📧
392📥
393📤
394✉️
395📩
396📨
397📯
398📫
399📪
400📬
401📭
402📮
403📦
404📝
405📄
406📃
407📑
408📊
409📈
410📉
411📜
412📋
413📅
414📆
415📇
416📁
417📂
418✂️
419📌
420📎
421✒️
422✏️
423📏
424📐
425📕
426📗
427📘
428📙
429📓
430📔
431📒
432📚
433📖
434🔖
435📛
436🔬
437🔭
438📰
439🎨
440🎬
441🎤
442🎧
443🎼
444🎵
445🎶
446🎹
447🎻
448🎺
449🎷
450🎸
451👾
452🎮
453🃏
454🎴
455🀄
456🎲
457🎯
458🏈
459🏀
460
461⚾️
462🎾
463🎱
464🏉
465🎳
466
467🚵
468🚴
469🏁
470🏇
471🏆
472🎿
473🏂
474🏊
475🏄
476🎣
477
478🍵
479🍶
480🍼
481🍺
482🍻
483🍸
484🍹
485🍷
486🍴
487🍕
488🍔
489🍟
490🍗
491🍖
492🍝
493🍛
494🍤
495🍱
496🍣
497🍥
498🍙
499🍘
500🍚
501🍜
502🍲
503🍢
504🍡
505🍳
506🍞
507🍩
508🍮
509🍦
510🍨
511🍧
512🎂
513🍰
514🍪
515🍫
516🍬
517🍭
518🍯
519🍎
520🍏
521🍊
522🍋
523🍒
524🍇
525🍉
526🍓
527🍑
528🍈
529🍌
530🍐
531🍍
532🍠
533🍆
534🍅
535🌽
536🏠
537🏡
538🏫
539🏢
540🏣
541🏥
542🏦
543🏪
544🏩
545🏨
546💒
547
548🏬
549🏤
550🌇
551🌆
552🏯
553🏰
554
555🏭
556🗼
557🗾
558🗻
559🌄
560🌅
561🌃
562🗽
563🌉
564🎠
565🎡
566
567🎢
568🚢
569
570🚤
571🚣
572
573🚀
574✈️
575💺
576🚁
577🚂
578🚊
579🚉
580🚞
581🚆
582🚄
583🚅
584🚈
585🚇
586🚝
587🚋
588🚃
589🚎
590🚌
591🚍
592🚙
593🚘
594🚗
595🚕
596🚖
597🚛
598🚚
599🚨
600🚓
601🚔
602🚒
603🚑
604🚐
605🚲
606🚡
607🚟
608🚠
609🚜
610💈
611🚏
612🎫
613🚦
614🚥
615⚠️
616🚧
617🔰
618
619🏮
620🎰
621♨️
622🗿
623🎪
624🎭
625📍
626🚩
627🇯🇵
628🇰🇷
629🇩🇪
630🇨🇳
631🇺🇸
632🇫🇷
633🇪🇸
634🇮🇹
635🇷🇺
636🇬🇧
6371️⃣
6382️⃣
6393️⃣
6404️⃣
6415️⃣
6426️⃣
6437️⃣
6448️⃣
6459️⃣
6460️⃣
647🔟
648🔢
649#️⃣
650🔣
651⬆️
652⬇️
653⬅️
654➡️
655🔠
656🔡
657🔤
658↗️
659↖️
660↘️
661↙️
662↔️
663↕️
664🔄
665◀️
666▶️
667🔼
668🔽
669↩️
670↪️
671ℹ️
672
673
674
675
676⤵️
677⤴️
678🆗
679🔀
680🔁
681🔂
682🆕
683🆙
684🆒
685🆓
686🆖
687📶
688🎦
689🈁
690🈯
691🈳
692🈵
693🈴
694🈲
695🉐
696🈹
697🈺
698🈶
699🈚
700🚻
701🚹
702🚺
703🚼
704🚾
705🚰
706🚮
707🅿️
708
709🚭
710🈷️
711🈸
712🈂️
713Ⓜ️
714🛂
715🛄
716🛅
717🛃
718🉑
719㊙️
720㊗️
721🆑
722🆘
723🆔
724🚫
725🔞
726📵
727🚯
728🚱
729🚳
730🚷
731🚸
732
733✳️
734❇️
735
736
737✴️
738💟
739🆚
740📳
741📴
742🅰️
743🅱️
744🆎
745🅾️
746💠
747
748♻️
749
750
751
752
753
754
755
756
757
758
759
760
761
762🔯
763🏧
764💹
765💲
766💱
767©️
768®️
769™️
770
771‼️
772⁉️
773
774
775
776
777
778🔝
779🔚
780🔙
781🔛
782🔜
783🔃
784🕛
785🕧
786🕐
787🕜
788🕑
789🕝
790🕒
791🕞
792🕓
793🕟
794🕔
795🕠
796🕕
797🕖
798🕗
799🕘
800🕙
801🕚
802🕡
803🕢
804🕣
805🕤
806🕥
807🕦
808✖️
809
810
811
812♠️
813♥️
814♣️
815♦️
816💮
817💯
818✔️
819☑️
820🔘
821🔗
822
823〰️
824〽️
825🔱
826◼️
827◻️
828
829
830▪️
831▫️
832🔺
833🔲
834🔳
835
836
837🔴
838🔵
839🔻
840
841
842🔶
843🔷
844🔸
845🔹 \ No newline at end of file
diff --git a/emoticons.txt b/emoticons.txt new file mode 100644 index 0000000..21b8990 --- /dev/null +++ b/emoticons.txt
@@ -0,0 +1,6 @@
1:)
2:p
3o_o
4:d
5;)
6:o \ No newline at end of file
diff --git a/freevars.cpp b/freevars.cpp index 54c5aab..4429d00 100644 --- a/freevars.cpp +++ b/freevars.cpp
@@ -1,46 +1,32 @@
1#include "freevars.h" 1#include "freevars.h"
2#include <fstream> 2#include <fstream>
3#include <cstdlib> 3#include "kgramstats.h"
4 4
5freevars::freevars() 5freevar::freevar(word& w, std::string file) : w(w)
6{ 6{
7 vars = new std::map<std::string, std::vector<std::string>* >(); 7 std::ifstream infile(file);
8 if (infile)
9 {
10 std::string line;
11 while (getline(infile, line))
12 {
13 instances.insert(line);
14 w.forms.add(line);
15 }
16 }
8} 17}
9 18
10void freevars::addVar(std::string name, std::string filename) 19bool freevar::check(std::string f) const
11{ 20{
12 std::vector<std::string>* eltlist = new std::vector<std::string>(); 21 return (instances.count(f) == 1);
13
14 std::ifstream infile(filename.c_str());
15 if (infile)
16 {
17 std::string line;
18
19 while (getline(infile, line))
20 {
21 eltlist->push_back(line);
22 }
23 } else {
24 eltlist->push_back("");
25 }
26
27 (*vars)[name] = eltlist;
28} 22}
29 23
30std::string freevars::parse(std::string in) 24void freevar::add(std::string f)
31{ 25{
32 std::string res(in); 26 instances.insert(f);
33 27}
34 for (std::map<std::string, std::vector<std::string>* >::iterator it = vars->begin(); it != vars->end(); it++) 28
35 { 29word& freevar::getWord()
36 std::string tofind = "$" + it->first + "$"; 30{
37 size_t fpos; 31 return w;
38 while ((fpos = res.find(tofind)) != std::string::npos) 32}
39 {
40 int r = rand() % it->second->size();
41 res.replace(fpos, tofind.length(), (*it->second)[r], 0, std::string::npos);
42 }
43 }
44
45 return res;
46} \ No newline at end of file
diff --git a/freevars.h b/freevars.h index c92b9f5..f800220 100644 --- a/freevars.h +++ b/freevars.h
@@ -1,19 +1,22 @@
1#include <map>
2#include <string> 1#include <string>
3#include <vector> 2#include <set>
4 3
5#ifndef FREEVARS_H 4#ifndef FREEVARS_H
6#define FREEVARS_H 5#define FREEVARS_H
7 6
8class freevars 7class word;
8
9class freevar
9{ 10{
10public: 11 public:
11 freevars(); 12 freevar(word& w, std::string file);
12 void addVar(std::string name, std::string filename); 13 bool check(std::string f) const;
13 std::string parse(std::string in); 14 void add(std::string f);
15 word& getWord();
14 16
15private: 17 private:
16 std::map<std::string, std::vector<std::string>* >* vars; 18 word& w;
19 std::set<std::string> instances;
17}; 20};
18 21
19#endif \ No newline at end of file 22#endif \ No newline at end of file
diff --git a/gen.cpp b/gen.cpp index a0ef8e3..26edd21 100644 --- a/gen.cpp +++ b/gen.cpp
@@ -7,7 +7,6 @@
7#include <cstdlib> 7#include <cstdlib>
8#include <fstream> 8#include <fstream>
9#include <iostream> 9#include <iostream>
10#include "freevars.h"
11 10
12int main(int argc, char** args) 11int main(int argc, char** args)
13{ 12{
@@ -44,16 +43,11 @@ int main(int argc, char** args)
44 std::cout << "Preprocessing corpus..." << std::endl; 43 std::cout << "Preprocessing corpus..." << std::endl;
45 kgramstats* stats = new kgramstats(corpus, 4); 44 kgramstats* stats = new kgramstats(corpus, 4);
46 45
47 std::cout << "Preprocessing freevars..." << std::endl;
48 freevars* vars = new freevars();
49 vars->addVar("name", "names.txt");
50 vars->addVar("noun", "nouns.txt");
51
52 std::cout << "Generating..." << std::endl; 46 std::cout << "Generating..." << std::endl;
53 for (;;) 47 for (;;)
54 { 48 {
55 std::string doc = stats->randomSentence(rand() % 35 + 15); 49 std::string doc = stats->randomSentence(rand() % 35 + 15);
56 std::string hi = vars->parse(doc); 50 std::string hi = doc;
57 hi.resize(140); 51 hi.resize(140);
58 52
59 std::cout << hi << std::endl; 53 std::cout << hi << std::endl;
diff --git a/histogram.cpp b/histogram.cpp index 6896146..6d31cf4 100644 --- a/histogram.cpp +++ b/histogram.cpp
@@ -1,5 +1,6 @@
1#include "histogram.h" 1#include "histogram.h"
2#include <cstdlib> 2#include <cstdlib>
3#include <iostream>
3 4
4template <class T> 5template <class T>
5void histogram<T>::add(const T& inst) 6void histogram<T>::add(const T& inst)
@@ -31,4 +32,13 @@ const T& histogram<T>::next() const
31 return distribution.upper_bound(r)->second; 32 return distribution.upper_bound(r)->second;
32} 33}
33 34
35template <class T>
36void histogram<T>::print() const
37{
38 for (auto& freqpair : freqtable)
39 {
40 std::cout << freqpair.first << ": " << freqpair.second << std::endl;
41 }
42}
43
34template class histogram <std::string>; 44template class histogram <std::string>;
diff --git a/histogram.h b/histogram.h index 5aa2560..76d8f1b 100644 --- a/histogram.h +++ b/histogram.h
@@ -10,6 +10,7 @@ class histogram {
10 void add(const T& inst); 10 void add(const T& inst);
11 void compile(); 11 void compile();
12 const T& next() const; 12 const T& next() const;
13 void print() const;
13 14
14 private: 15 private:
15 std::map<T, int> freqtable; 16 std::map<T, int> freqtable;
diff --git a/kgramstats.cpp b/kgramstats.cpp index 0ab0c99..5b571d6 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp
@@ -39,6 +39,9 @@
39#include <algorithm> 39#include <algorithm>
40#include <set> 40#include <set>
41#include <stack> 41#include <stack>
42#include "freevars.h"
43#include <fstream>
44#include "prefix_search.h"
42 45
43query wildcardQuery {querytype::sentence}; 46query wildcardQuery {querytype::sentence};
44word blank_word {""}; 47word blank_word {""};
@@ -53,14 +56,55 @@ kgramstats::kgramstats(std::string corpus, int maxK)
53 size_t start = 0; 56 size_t start = 0;
54 int end = 0; 57 int end = 0;
55 std::set<std::string> thashtags; 58 std::set<std::string> thashtags;
59 freevar fv_emoticons {emoticons, "emoticons.txt"};
60
61 std::cout << "Reading emojis..." << std::endl;
62 prefix_search emojis;
63 std::ifstream emoji_file("emojis.txt");
64 if (emoji_file)
65 {
66 while (!emoji_file.eof())
67 {
68 std::string rawmojis;
69 getline(emoji_file, rawmojis);
70 emojis.add(rawmojis);
71 }
72
73 emoji_file.close();
74 }
56 75
76 std::cout << "Tokenizing corpus..." << std::endl;
57 while (end != std::string::npos) 77 while (end != std::string::npos)
58 { 78 {
59 end = corpus.find(" ", start); 79 end = corpus.find(" ", start);
60 80
61 std::string t = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start); 81 bool emoji = false;
62 if (t.compare("") && t.compare(".")) 82 std::string te = corpus.substr(start, (end == std::string::npos) ? std::string::npos : end - start);
83 std::string t = "";
84
85 if (te.compare("") && te.compare("."))
63 { 86 {
87 // Extract strings of emojis into their own tokens even if they're not space delimited
88 int m = emojis.match(te);
89 emoji = m > 0;
90 if (m == 0) m = 1;
91 t = te.substr(0,m);
92 te = te.substr(m);
93
94 while (!te.empty())
95 {
96 m = emojis.match(te);
97 if (emoji == (m > 0))
98 {
99 if (m == 0) m = 1;
100 t += te.substr(0,m);
101 te = te.substr(m);
102 } else {
103 end = start + t.length() - 1;
104 break;
105 }
106 }
107
64 std::string tc(t), canonical; 108 std::string tc(t), canonical;
65 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower); 109 std::transform(tc.begin(), tc.end(), tc.begin(), ::tolower);
66 std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) { 110 std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(canonical), [=] (char c) {
@@ -72,11 +116,29 @@ kgramstats::kgramstats(std::string corpus, int maxK)
72 if (canonical[0] == '#') 116 if (canonical[0] == '#')
73 { 117 {
74 thashtags.insert(canonical); 118 thashtags.insert(canonical);
75 canonical = "#hashtag";
76 119
77 return hashtags; 120 return hashtags;
78 } 121 }
79 122
123 // Emoticon freevar
124 if (emoji)
125 {
126 emoticons.forms.add(canonical);
127
128 return emoticons;
129 }
130
131 std::string emoticon_canon;
132 std::remove_copy_if(tc.begin(), tc.end(), std::back_inserter(emoticon_canon), [=] (char c) {
133 return !((c != '.') && (c != '?') && (c != '!') && (c != ',') && (c != '"') && (c != '\n') && (c != '[') && (c != ']') && (c != '*'));
134 });
135 if (fv_emoticons.check(emoticon_canon))
136 {
137 emoticons.forms.add(emoticon_canon);
138
139 return emoticons;
140 }
141
80 // Basically any other word 142 // Basically any other word
81 if (words.count(canonical) == 0) 143 if (words.count(canonical) == 0)
82 { 144 {
@@ -171,6 +233,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
171 } 233 }
172 234
173 // Time to condense the distribution stuff for the words 235 // Time to condense the distribution stuff for the words
236 std::cout << "Compiling token histograms..." << std::endl;
174 for (auto& it : words) 237 for (auto& it : words)
175 { 238 {
176 it.second.forms.compile(); 239 it.second.forms.compile();
@@ -185,8 +248,13 @@ kgramstats::kgramstats(std::string corpus, int maxK)
185 248
186 hashtags.forms.compile(); 249 hashtags.forms.compile();
187 hashtags.terms.compile(); 250 hashtags.terms.compile();
251
252 // Compile other freevars
253 emoticons.forms.compile();
254 emoticons.terms.compile();
188 255
189 // kgram distribution 256 // kgram distribution
257 std::cout << "Creating markov chain..." << std::endl;
190 std::map<kgram, std::map<token, token_data> > tstats; 258 std::map<kgram, std::map<token, token_data> > tstats;
191 for (int k=1; k<maxK; k++) 259 for (int k=1; k<maxK; k++)
192 { 260 {
@@ -246,6 +314,7 @@ kgramstats::kgramstats(std::string corpus, int maxK)
246 } 314 }
247 315
248 // Condense the kgram distribution 316 // Condense the kgram distribution
317 std::cout << "Compiling kgram distributions..." << std::endl;
249 for (auto& it : tstats) 318 for (auto& it : tstats)
250 { 319 {
251 kgram klist = it.first; 320 kgram klist = it.first;
@@ -454,6 +523,36 @@ std::string kgramstats::randomSentence(int n)
454 523
455 open_delimiters.pop(); 524 open_delimiters.pop();
456 } 525 }
526
527 // Replace old-style freevars while I can't be bothered to remake the corpus yet
528 std::vector<std::string> fv_names;
529 std::ifstream namefile("names.txt");
530 while (!namefile.eof())
531 {
532 std::string l;
533 getline(namefile, l);
534 fv_names.push_back(l);
535 }
536
537 int cpos;
538 while ((cpos = result.find("$name$")) != std::string::npos)
539 {
540 result.replace(cpos, 6, fv_names[rand() % fv_names.size()]);
541 }
542
543 std::vector<std::string> fv_nouns;
544 std::ifstream nounfile("nouns.txt");
545 while (!nounfile.eof())
546 {
547 std::string l;
548 getline(nounfile, l);
549 fv_nouns.push_back(l);
550 }
551
552 while ((cpos = result.find("$noun$")) != std::string::npos)
553 {
554 result.replace(cpos, 6, fv_nouns[rand() % fv_nouns.size()]);
555 }
457 556
458 return result; 557 return result;
459} 558}
diff --git a/kgramstats.h b/kgramstats.h index a97d7bf..4acde65 100644 --- a/kgramstats.h +++ b/kgramstats.h
@@ -112,8 +112,11 @@ private:
112 112
113 int maxK; 113 int maxK;
114 std::map<kgram, std::map<int, token_data> > stats; 114 std::map<kgram, std::map<int, token_data> > stats;
115 word hashtags {"#hashtag"}; 115
116 // Words
116 std::map<std::string, word> words; 117 std::map<std::string, word> words;
118 word hashtags {"#hashtag"};
119 word emoticons {"👌"};
117}; 120};
118 121
119void printKgram(kgram k); 122void printKgram(kgram k);
diff --git a/prefix_search.cpp b/prefix_search.cpp new file mode 100644 index 0000000..2603061 --- /dev/null +++ b/prefix_search.cpp
@@ -0,0 +1,35 @@
1#include "prefix_search.h"
2
3void prefix_search::add(std::string prefix)
4{
5 node* cur = &top;
6 for (int c : prefix)
7 {
8 cur = &cur->children[c];
9 }
10
11 cur->match = true;
12}
13
14int prefix_search::match(std::string in) const
15{
16 int ret = 0;
17 const node* cur = &top;
18 for (int c : in)
19 {
20 if (cur->children.count(c) == 0)
21 {
22 return 0;
23 }
24
25 cur = &cur->children.at(c);
26 ret++;
27
28 if (cur->match)
29 {
30 return ret;
31 }
32 }
33
34 return 0;
35}
diff --git a/prefix_search.h b/prefix_search.h new file mode 100644 index 0000000..dd2f535 --- /dev/null +++ b/prefix_search.h
@@ -0,0 +1,23 @@
1#ifndef PREFIX_SEARCH_H_5CFCF783
2#define PREFIX_SEARCH_H_5CFCF783
3
4#include <map>
5#include <string>
6
7class prefix_search {
8 public:
9 void add(std::string prefix);
10 int match(std::string in) const;
11
12 private:
13 struct node {
14 std::map<int, struct node> children;
15 bool match;
16
17 node() : match(false) {}
18 };
19
20 node top;
21};
22
23#endif /* end of include guard: PREFIX_SEARCH_H_5CFCF783 */