diff options
| -rw-r--r-- | kgramstats.cpp | 134 | ||||
| -rw-r--r-- | kgramstats.h | 4 |
2 files changed, 91 insertions, 47 deletions
| diff --git a/kgramstats.cpp b/kgramstats.cpp index e0c2eac..cb63db6 100644 --- a/kgramstats.cpp +++ b/kgramstats.cpp | |||
| @@ -56,8 +56,7 @@ void rawr::compile(int maxK) | |||
| 56 | { | 56 | { |
| 57 | _maxK = maxK; | 57 | _maxK = maxK; |
| 58 | 58 | ||
| 59 | std::vector<token> tokens; | 59 | std::vector<std::vector<token>> tokens; |
| 60 | size_t start = 0; | ||
| 61 | std::set<std::string> thashtags; | 60 | std::set<std::string> thashtags; |
| 62 | std::set<std::string> fv_emoticons; | 61 | std::set<std::string> fv_emoticons; |
| 63 | 62 | ||
| @@ -119,7 +118,9 @@ void rawr::compile(int maxK) | |||
| 119 | std::cout.fill(' '); | 118 | std::cout.fill(' '); |
| 120 | for (int i = 0; i < _corpora.size(); i++) | 119 | for (int i = 0; i < _corpora.size(); i++) |
| 121 | { | 120 | { |
| 121 | size_t start = 0; | ||
| 122 | int end = 0; | 122 | int end = 0; |
| 123 | std::vector<token> tkcor; | ||
| 123 | 124 | ||
| 124 | while (end != std::string::npos) | 125 | while (end != std::string::npos) |
| 125 | { | 126 | { |
| @@ -331,12 +332,14 @@ void rawr::compile(int maxK) | |||
| 331 | } | 332 | } |
| 332 | } | 333 | } |
| 333 | 334 | ||
| 334 | tokens.push_back(tk); | 335 | tkcor.push_back(tk); |
| 335 | } | 336 | } |
| 336 | 337 | ||
| 337 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); | 338 | start = ((end > (std::string::npos - 1) ) ? std::string::npos : end + 1); |
| 338 | } | 339 | } |
| 339 | 340 | ||
| 341 | tokens.push_back(tkcor); | ||
| 342 | |||
| 340 | startper += _corpora[i].length(); | 343 | startper += _corpora[i].length(); |
| 341 | } | 344 | } |
| 342 | 345 | ||
| @@ -372,65 +375,82 @@ void rawr::compile(int maxK) | |||
| 372 | // kgram distribution | 375 | // kgram distribution |
| 373 | std::cout << "Creating markov chain... 0%" << std::flush; | 376 | std::cout << "Creating markov chain... 0%" << std::flush; |
| 374 | std::map<kgram, std::map<token, token_data> > tstats; | 377 | std::map<kgram, std::map<token, token_data> > tstats; |
| 375 | len = (maxK-1) * tokens.size(); | 378 | |
| 379 | len = 0; | ||
| 380 | for (auto c : tokens) | ||
| 381 | { | ||
| 382 | len += (maxK-1) * c.size(); | ||
| 383 | } | ||
| 384 | |||
| 385 | startper = 0; | ||
| 376 | per = 0; | 386 | per = 0; |
| 377 | perprime = 0; | 387 | perprime = 0; |
| 378 | for (int k=1; k<maxK; k++) | 388 | int corpid = 0; |
| 389 | for (auto corpus : tokens) | ||
| 379 | { | 390 | { |
| 380 | for (int i=0; i<(tokens.size() - k); i++) | 391 | for (int k=1; k<maxK; k++) |
| 381 | { | 392 | { |
| 382 | perprime = (((k-1)*tokens.size())+i) * 100 / len; | 393 | for (int i=0; i<(corpus.size() - k); i++) |
| 383 | if (perprime != per) | ||
| 384 | { | 394 | { |
| 385 | per = perprime; | 395 | perprime = (startper+i) * 100 / len; |
| 396 | if (perprime != per) | ||
| 397 | { | ||
| 398 | per = perprime; | ||
| 386 | 399 | ||
| 387 | std::cout << "\b\b\b\b" << std::right; | 400 | std::cout << "\b\b\b\b" << std::right; |
| 388 | std::cout.width(3); | 401 | std::cout.width(3); |
| 389 | std::cout << per << "%" << std::flush; | 402 | std::cout << per << "%" << std::flush; |
| 390 | } | 403 | } |
| 391 | 404 | ||
| 392 | kgram prefix(tokens.begin()+i, tokens.begin()+i+k); | 405 | kgram prefix(corpus.begin()+i, corpus.begin()+i+k); |
| 393 | token f = tokens[i+k]; | 406 | token f = corpus[i+k]; |
| 394 | 407 | ||
| 395 | if (tstats[prefix].count(f) == 0) | 408 | if (tstats[prefix].count(f) == 0) |
| 396 | { | ||
| 397 | tstats[prefix].emplace(f, f); | ||
| 398 | } | ||
| 399 | |||
| 400 | token_data& td = tstats[prefix].at(f); | ||
| 401 | td.all++; | ||
| 402 | |||
| 403 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | ||
| 404 | { | ||
| 405 | td.uppercase++; | ||
| 406 | } else if (isupper(f.raw[0])) | ||
| 407 | { | ||
| 408 | td.titlecase++; | ||
| 409 | } | ||
| 410 | |||
| 411 | if (std::begin(prefix)->tok.suffix == suffixtype::terminating) | ||
| 412 | { | ||
| 413 | kgram term_prefix(prefix); | ||
| 414 | term_prefix.pop_front(); | ||
| 415 | term_prefix.push_front(wildcardQuery); | ||
| 416 | |||
| 417 | if (tstats[term_prefix].count(f) == 0) | ||
| 418 | { | 409 | { |
| 419 | tstats[term_prefix].emplace(f, f); | 410 | tstats[prefix].emplace(f, f); |
| 420 | } | 411 | } |
| 421 | 412 | ||
| 422 | token_data& td2 = tstats[term_prefix].at(f); | 413 | token_data& td = tstats[prefix].at(f); |
| 423 | td2.all++; | 414 | td.all++; |
| 415 | td.corpora.insert(corpid); | ||
| 424 | 416 | ||
| 425 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | 417 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) |
| 426 | { | 418 | { |
| 427 | td2.uppercase++; | 419 | td.uppercase++; |
| 428 | } else if (isupper(f.raw[0])) | 420 | } else if (isupper(f.raw[0])) |
| 429 | { | 421 | { |
| 430 | td2.titlecase++; | 422 | td.titlecase++; |
| 423 | } | ||
| 424 | |||
| 425 | if (std::begin(prefix)->tok.suffix == suffixtype::terminating) | ||
| 426 | { | ||
| 427 | kgram term_prefix(prefix); | ||
| 428 | term_prefix.pop_front(); | ||
| 429 | term_prefix.push_front(wildcardQuery); | ||
| 430 | |||
| 431 | if (tstats[term_prefix].count(f) == 0) | ||
| 432 | { | ||
| 433 | tstats[term_prefix].emplace(f, f); | ||
| 434 | } | ||
| 435 | |||
| 436 | token_data& td2 = tstats[term_prefix].at(f); | ||
| 437 | td2.all++; | ||
| 438 | td2.corpora.insert(corpid); | ||
| 439 | |||
| 440 | if (std::find_if(f.raw.begin(), f.raw.end(), ::islower) == f.raw.end()) | ||
| 441 | { | ||
| 442 | td2.uppercase++; | ||
| 443 | } else if (isupper(f.raw[0])) | ||
| 444 | { | ||
| 445 | td2.titlecase++; | ||
| 446 | } | ||
| 431 | } | 447 | } |
| 432 | } | 448 | } |
| 449 | |||
| 450 | startper += corpus.size(); | ||
| 433 | } | 451 | } |
| 452 | |||
| 453 | corpid++; | ||
| 434 | } | 454 | } |
| 435 | 455 | ||
| 436 | std::cout << "\b\b\b\b100%" << std::endl; | 456 | std::cout << "\b\b\b\b100%" << std::endl; |
| @@ -527,6 +547,11 @@ void rawr::setTransformCallback(transform_callback _arg) | |||
| 527 | _transform = _arg; | 547 | _transform = _arg; |
| 528 | } | 548 | } |
| 529 | 549 | ||
| 550 | void rawr::setMinCorpora(int _arg) | ||
| 551 | { | ||
| 552 | _min_corpora = _arg; | ||
| 553 | } | ||
| 554 | |||
| 530 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus | 555 | // runs in O(n log t) time where n is the input number of sentences and t is the number of tokens in the input corpus |
| 531 | std::string rawr::randomSentence(int maxL) | 556 | std::string rawr::randomSentence(int maxL) |
| 532 | { | 557 | { |
| @@ -539,6 +564,7 @@ std::string rawr::randomSentence(int maxL) | |||
| 539 | kgram cur(1, wildcardQuery); | 564 | kgram cur(1, wildcardQuery); |
| 540 | int cuts = 0; | 565 | int cuts = 0; |
| 541 | std::stack<parentype> open_delimiters; | 566 | std::stack<parentype> open_delimiters; |
| 567 | std::set<int> used_corpora; | ||
| 542 | 568 | ||
| 543 | for (;;) | 569 | for (;;) |
| 544 | { | 570 | { |
| @@ -690,9 +716,19 @@ std::string rawr::randomSentence(int maxL) | |||
| 690 | { | 716 | { |
| 691 | cuts++; | 717 | cuts++; |
| 692 | } | 718 | } |
| 719 | |||
| 720 | if (next.corpora.size() == 1) | ||
| 721 | { | ||
| 722 | used_corpora.insert(*next.corpora.begin()); | ||
| 723 | } | ||
| 693 | 724 | ||
| 694 | /* DEBUG */ | 725 | /* DEBUG */ |
| 695 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << std::endl; | 726 | std::cout << cur << "-> \"" << nextToken << "\" (" << next.all << "/" << max << ")" << " in corp"; |
| 727 | for (auto cor : next.corpora) | ||
| 728 | { | ||
| 729 | std::cout << " " << cor; | ||
| 730 | } | ||
| 731 | std::cout << std::endl; | ||
| 696 | 732 | ||
| 697 | cur.push_back(next.tok); | 733 | cur.push_back(next.tok); |
| 698 | result.append(nextToken); | 734 | result.append(nextToken); |
| @@ -703,6 +739,12 @@ std::string rawr::randomSentence(int maxL) | |||
| 703 | } | 739 | } |
| 704 | } | 740 | } |
| 705 | 741 | ||
| 742 | // Ensure that enough corpora are used | ||
| 743 | if (used_corpora.size() < _min_corpora) | ||
| 744 | { | ||
| 745 | return randomSentence(maxL); | ||
| 746 | } | ||
| 747 | |||
| 706 | // Remove the trailing space | 748 | // Remove the trailing space |
| 707 | if (result.back() == ' ') | 749 | if (result.back() == ' ') |
| 708 | { | 750 | { |
| @@ -722,8 +764,6 @@ std::string rawr::randomSentence(int maxL) | |||
| 722 | 764 | ||
| 723 | open_delimiters.pop(); | 765 | open_delimiters.pop(); |
| 724 | } | 766 | } |
| 725 | |||
| 726 | result.resize(maxL); | ||
| 727 | 767 | ||
| 728 | return result; | 768 | return result; |
| 729 | } | 769 | } |
| diff --git a/kgramstats.h b/kgramstats.h index fc01101..d939ade 100644 --- a/kgramstats.h +++ b/kgramstats.h | |||
| @@ -7,6 +7,7 @@ | |||
| 7 | #include <vector> | 7 | #include <vector> |
| 8 | #include "histogram.h" | 8 | #include "histogram.h" |
| 9 | #include <functional> | 9 | #include <functional> |
| 10 | #include <set> | ||
| 10 | 11 | ||
| 11 | class rawr { | 12 | class rawr { |
| 12 | public: | 13 | public: |
| @@ -16,6 +17,7 @@ class rawr { | |||
| 16 | void compile(int maxK); | 17 | void compile(int maxK); |
| 17 | 18 | ||
| 18 | void setTransformCallback(transform_callback _arg); | 19 | void setTransformCallback(transform_callback _arg); |
| 20 | void setMinCorpora(int _arg); | ||
| 19 | std::string randomSentence(int maxL); | 21 | std::string randomSentence(int maxL); |
| 20 | 22 | ||
| 21 | private: | 23 | private: |
| @@ -125,6 +127,7 @@ class rawr { | |||
| 125 | int titlecase; | 127 | int titlecase; |
| 126 | int uppercase; | 128 | int uppercase; |
| 127 | token tok; | 129 | token tok; |
| 130 | std::set<int> corpora; | ||
| 128 | 131 | ||
| 129 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} | 132 | token_data(token tok) : tok(tok), all(0), titlecase(0), uppercase(0) {} |
| 130 | }; | 133 | }; |
| @@ -139,6 +142,7 @@ class rawr { | |||
| 139 | std::vector<std::string> _corpora; | 142 | std::vector<std::string> _corpora; |
| 140 | std::map<kgram, std::map<int, token_data>> _stats; | 143 | std::map<kgram, std::map<int, token_data>> _stats; |
| 141 | transform_callback _transform; | 144 | transform_callback _transform; |
| 145 | int _min_corpora = 1; | ||
| 142 | 146 | ||
| 143 | // Words | 147 | // Words |
| 144 | std::map<std::string, word> words; | 148 | std::map<std::string, word> words; |
