diff options
Diffstat (limited to 'imagenet.cpp')
| -rw-r--r-- | imagenet.cpp | 89 |
1 files changed, 89 insertions, 0 deletions
| diff --git a/imagenet.cpp b/imagenet.cpp new file mode 100644 index 0000000..3a107bd --- /dev/null +++ b/imagenet.cpp | |||
| @@ -0,0 +1,89 @@ | |||
| 1 | #include "imagenet.h" | ||
| 2 | #include <fstream> | ||
| 3 | #include <stdexcept> | ||
| 4 | #include <sstream> | ||
| 5 | #include <vector> | ||
| 6 | #include <curl_easy.h> | ||
| 7 | #include <curl_header.h> | ||
| 8 | |||
| 9 | imagenet::imagenet(std::string path) : path_(path) {} | ||
| 10 | |||
| 11 | std::tuple<std::string, std::string> imagenet::getImageForNotion(int notion_id, std::mt19937& rng) const | ||
| 12 | { | ||
| 13 | std::filesystem::path filename = path_ / std::to_string(notion_id); | ||
| 14 | if (!std::filesystem::exists(filename)) | ||
| 15 | { | ||
| 16 | throw std::invalid_argument(std::string("File does not exist: ") + std::string(filename)); | ||
| 17 | } | ||
| 18 | |||
| 19 | std::ifstream file(filename); | ||
| 20 | std::string line; | ||
| 21 | std::vector<std::string> urls; | ||
| 22 | while (std::getline(file, line)) | ||
| 23 | { | ||
| 24 | if (!line.empty()) | ||
| 25 | { | ||
| 26 | urls.push_back(line); | ||
| 27 | } | ||
| 28 | } | ||
| 29 | |||
| 30 | std::string output; | ||
| 31 | std::string extension; | ||
| 32 | while (!urls.empty()) | ||
| 33 | { | ||
| 34 | int index = std::uniform_int_distribution<int>(0, urls.size()-1)(rng); | ||
| 35 | std::string url = urls.at(index); | ||
| 36 | urls.erase(std::begin(urls) + index); | ||
| 37 | |||
| 38 | // willyfogg.com is a thumbnail generator known to return 200 even if the target image no longer exists | ||
| 39 | if (url.find("willyfogg.com/thumb.php") != std::string::npos) | ||
| 40 | { | ||
| 41 | continue; | ||
| 42 | } | ||
| 43 | |||
| 44 | // Accept string from Google Chrome | ||
| 45 | std::string accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"; | ||
| 46 | curl::curl_header headers; | ||
| 47 | headers.add(accept); | ||
| 48 | |||
| 49 | std::ostringstream imgbuf; | ||
| 50 | curl::curl_ios<std::ostringstream> imgios(imgbuf); | ||
| 51 | curl::curl_easy imghandle(imgios); | ||
| 52 | |||
| 53 | imghandle.add<CURLOPT_HTTPHEADER>(headers.get()); | ||
| 54 | imghandle.add<CURLOPT_URL>(url.c_str()); | ||
| 55 | imghandle.add<CURLOPT_CONNECTTIMEOUT>(30); | ||
| 56 | imghandle.add<CURLOPT_TIMEOUT>(300); | ||
| 57 | |||
| 58 | try | ||
| 59 | { | ||
| 60 | imghandle.perform(); | ||
| 61 | } catch (const curl::curl_easy_exception& error) { | ||
| 62 | error.print_traceback(); | ||
| 63 | |||
| 64 | continue; | ||
| 65 | } | ||
| 66 | |||
| 67 | if (imghandle.get_info<CURLINFO_RESPONSE_CODE>().get() != 200) | ||
| 68 | { | ||
| 69 | continue; | ||
| 70 | } | ||
| 71 | |||
| 72 | std::string content_type = imghandle.get_info<CURLINFO_CONTENT_TYPE>().get(); | ||
| 73 | if (content_type.substr(0, 6) != "image/") | ||
| 74 | { | ||
| 75 | continue; | ||
| 76 | } | ||
| 77 | |||
| 78 | output = imgbuf.str(); | ||
| 79 | extension = url.substr(url.rfind(".") + 1); | ||
| 80 | break; | ||
| 81 | } | ||
| 82 | |||
| 83 | if (output.empty()) | ||
| 84 | { | ||
| 85 | throw std::invalid_argument(std::string("No valid urls found for ") + std::string(filename)); | ||
| 86 | } | ||
| 87 | |||
| 88 | return {output, extension}; | ||
| 89 | } | ||
