diff options
Diffstat (limited to 'imagenet.cpp')
-rw-r--r-- | imagenet.cpp | 89 |
1 files changed, 89 insertions, 0 deletions
diff --git a/imagenet.cpp b/imagenet.cpp new file mode 100644 index 0000000..3a107bd --- /dev/null +++ b/imagenet.cpp | |||
@@ -0,0 +1,89 @@ | |||
#include "imagenet.h"
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <utility>
#include <vector>
#include <curl_easy.h>
#include <curl_header.h>
8 | |||
9 | imagenet::imagenet(std::string path) : path_(path) {} | ||
10 | |||
11 | std::tuple<std::string, std::string> imagenet::getImageForNotion(int notion_id, std::mt19937& rng) const | ||
12 | { | ||
13 | std::filesystem::path filename = path_ / std::to_string(notion_id); | ||
14 | if (!std::filesystem::exists(filename)) | ||
15 | { | ||
16 | throw std::invalid_argument(std::string("File does not exist: ") + std::string(filename)); | ||
17 | } | ||
18 | |||
19 | std::ifstream file(filename); | ||
20 | std::string line; | ||
21 | std::vector<std::string> urls; | ||
22 | while (std::getline(file, line)) | ||
23 | { | ||
24 | if (!line.empty()) | ||
25 | { | ||
26 | urls.push_back(line); | ||
27 | } | ||
28 | } | ||
29 | |||
30 | std::string output; | ||
31 | std::string extension; | ||
32 | while (!urls.empty()) | ||
33 | { | ||
34 | int index = std::uniform_int_distribution<int>(0, urls.size()-1)(rng); | ||
35 | std::string url = urls.at(index); | ||
36 | urls.erase(std::begin(urls) + index); | ||
37 | |||
38 | // willyfogg.com is a thumbnail generator known to return 200 even if the target image no longer exists | ||
39 | if (url.find("willyfogg.com/thumb.php") != std::string::npos) | ||
40 | { | ||
41 | continue; | ||
42 | } | ||
43 | |||
44 | // Accept string from Google Chrome | ||
45 | std::string accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"; | ||
46 | curl::curl_header headers; | ||
47 | headers.add(accept); | ||
48 | |||
49 | std::ostringstream imgbuf; | ||
50 | curl::curl_ios<std::ostringstream> imgios(imgbuf); | ||
51 | curl::curl_easy imghandle(imgios); | ||
52 | |||
53 | imghandle.add<CURLOPT_HTTPHEADER>(headers.get()); | ||
54 | imghandle.add<CURLOPT_URL>(url.c_str()); | ||
55 | imghandle.add<CURLOPT_CONNECTTIMEOUT>(30); | ||
56 | imghandle.add<CURLOPT_TIMEOUT>(300); | ||
57 | |||
58 | try | ||
59 | { | ||
60 | imghandle.perform(); | ||
61 | } catch (const curl::curl_easy_exception& error) { | ||
62 | error.print_traceback(); | ||
63 | |||
64 | continue; | ||
65 | } | ||
66 | |||
67 | if (imghandle.get_info<CURLINFO_RESPONSE_CODE>().get() != 200) | ||
68 | { | ||
69 | continue; | ||
70 | } | ||
71 | |||
72 | std::string content_type = imghandle.get_info<CURLINFO_CONTENT_TYPE>().get(); | ||
73 | if (content_type.substr(0, 6) != "image/") | ||
74 | { | ||
75 | continue; | ||
76 | } | ||
77 | |||
78 | output = imgbuf.str(); | ||
79 | extension = url.substr(url.rfind(".") + 1); | ||
80 | break; | ||
81 | } | ||
82 | |||
83 | if (output.empty()) | ||
84 | { | ||
85 | throw std::invalid_argument(std::string("No valid urls found for ") + std::string(filename)); | ||
86 | } | ||
87 | |||
88 | return {output, extension}; | ||
89 | } | ||