diff options
Diffstat (limited to 'imagenet.cpp')
-rw-r--r-- | imagenet.cpp | 93 |
1 files changed, 93 insertions, 0 deletions
diff --git a/imagenet.cpp b/imagenet.cpp new file mode 100644 index 0000000..d5aeada --- /dev/null +++ b/imagenet.cpp | |||
@@ -0,0 +1,93 @@ | |||
#include "imagenet.h"
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <utility>
#include <vector>
#include <curl_easy.h>
#include <curl_header.h>
8 | |||
9 | imagenet::imagenet(std::string path) : path_(path) {} | ||
10 | |||
11 | std::tuple<std::string, std::string> imagenet::getImageForNotion(int notion_id, std::mt19937& rng) const | ||
12 | { | ||
13 | auto result = getImagesForNotion(notion_id, rng, 1); | ||
14 | return result[0]; | ||
15 | } | ||
16 | |||
17 | std::vector<std::tuple<std::string, std::string>> imagenet::getImagesForNotion(int notion_id, std::mt19937& rng, int num) const | ||
18 | { | ||
19 | std::filesystem::path filename = path_ / std::to_string(notion_id); | ||
20 | if (!std::filesystem::exists(filename)) | ||
21 | { | ||
22 | throw std::invalid_argument(std::string("File does not exist: ") + std::string(filename)); | ||
23 | } | ||
24 | |||
25 | std::ifstream file(filename); | ||
26 | std::string line; | ||
27 | std::vector<std::string> urls; | ||
28 | while (std::getline(file, line)) | ||
29 | { | ||
30 | if (!line.empty()) | ||
31 | { | ||
32 | urls.push_back(line); | ||
33 | } | ||
34 | } | ||
35 | |||
36 | // output, extension | ||
37 | std::vector<std::tuple<std::string, std::string>> results; | ||
38 | while (!urls.empty() && results.size() < num) | ||
39 | { | ||
40 | int index = std::uniform_int_distribution<int>(0, urls.size()-1)(rng); | ||
41 | std::string url = urls.at(index); | ||
42 | urls.erase(std::begin(urls) + index); | ||
43 | |||
44 | // willyfogg.com is a thumbnail generator known to return 200 even if the target image no longer exists | ||
45 | if (url.find("willyfogg.com/thumb.php") != std::string::npos) | ||
46 | { | ||
47 | continue; | ||
48 | } | ||
49 | |||
50 | // Accept string from Google Chrome | ||
51 | std::string accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"; | ||
52 | curl::curl_header headers; | ||
53 | headers.add(accept); | ||
54 | |||
55 | std::ostringstream imgbuf; | ||
56 | curl::curl_ios<std::ostringstream> imgios(imgbuf); | ||
57 | curl::curl_easy imghandle(imgios); | ||
58 | |||
59 | imghandle.add<CURLOPT_HTTPHEADER>(headers.get()); | ||
60 | imghandle.add<CURLOPT_URL>(url.c_str()); | ||
61 | imghandle.add<CURLOPT_CONNECTTIMEOUT>(30); | ||
62 | imghandle.add<CURLOPT_TIMEOUT>(300); | ||
63 | |||
64 | try | ||
65 | { | ||
66 | imghandle.perform(); | ||
67 | } catch (const curl::curl_easy_exception& error) { | ||
68 | error.print_traceback(); | ||
69 | |||
70 | continue; | ||
71 | } | ||
72 | |||
73 | if (imghandle.get_info<CURLINFO_RESPONSE_CODE>().get() != 200) | ||
74 | { | ||
75 | continue; | ||
76 | } | ||
77 | |||
78 | std::string content_type = imghandle.get_info<CURLINFO_CONTENT_TYPE>().get(); | ||
79 | if (content_type.substr(0, 6) != "image/") | ||
80 | { | ||
81 | continue; | ||
82 | } | ||
83 | |||
84 | results.emplace_back(imgbuf.str(), url.substr(url.rfind(".") + 1)); | ||
85 | } | ||
86 | |||
87 | if (results.size() < num) | ||
88 | { | ||
89 | throw std::invalid_argument(std::string("Not enough valid urls found for ") + std::string(filename)); | ||
90 | } | ||
91 | |||
92 | return results; | ||
93 | } | ||