summary refs log tree commit diff stats
path: root/imagenet.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'imagenet.cpp')
-rw-r--r--imagenet.cpp89
1 files changed, 89 insertions, 0 deletions
diff --git a/imagenet.cpp b/imagenet.cpp new file mode 100644 index 0000000..3a107bd --- /dev/null +++ b/imagenet.cpp
@@ -0,0 +1,89 @@
#include "imagenet.h"

#include <fstream>
#include <sstream>
#include <stdexcept>
#include <utility>
#include <vector>

#include <curl_easy.h>
#include <curl_header.h>
8
9imagenet::imagenet(std::string path) : path_(path) {}
10
11std::tuple<std::string, std::string> imagenet::getImageForNotion(int notion_id, std::mt19937& rng) const
12{
13 std::filesystem::path filename = path_ / std::to_string(notion_id);
14 if (!std::filesystem::exists(filename))
15 {
16 throw std::invalid_argument(std::string("File does not exist: ") + std::string(filename));
17 }
18
19 std::ifstream file(filename);
20 std::string line;
21 std::vector<std::string> urls;
22 while (std::getline(file, line))
23 {
24 if (!line.empty())
25 {
26 urls.push_back(line);
27 }
28 }
29
30 std::string output;
31 std::string extension;
32 while (!urls.empty())
33 {
34 int index = std::uniform_int_distribution<int>(0, urls.size()-1)(rng);
35 std::string url = urls.at(index);
36 urls.erase(std::begin(urls) + index);
37
38 // willyfogg.com is a thumbnail generator known to return 200 even if the target image no longer exists
39 if (url.find("willyfogg.com/thumb.php") != std::string::npos)
40 {
41 continue;
42 }
43
44 // Accept string from Google Chrome
45 std::string accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
46 curl::curl_header headers;
47 headers.add(accept);
48
49 std::ostringstream imgbuf;
50 curl::curl_ios<std::ostringstream> imgios(imgbuf);
51 curl::curl_easy imghandle(imgios);
52
53 imghandle.add<CURLOPT_HTTPHEADER>(headers.get());
54 imghandle.add<CURLOPT_URL>(url.c_str());
55 imghandle.add<CURLOPT_CONNECTTIMEOUT>(30);
56 imghandle.add<CURLOPT_TIMEOUT>(300);
57
58 try
59 {
60 imghandle.perform();
61 } catch (const curl::curl_easy_exception& error) {
62 error.print_traceback();
63
64 continue;
65 }
66
67 if (imghandle.get_info<CURLINFO_RESPONSE_CODE>().get() != 200)
68 {
69 continue;
70 }
71
72 std::string content_type = imghandle.get_info<CURLINFO_CONTENT_TYPE>().get();
73 if (content_type.substr(0, 6) != "image/")
74 {
75 continue;
76 }
77
78 output = imgbuf.str();
79 extension = url.substr(url.rfind(".") + 1);
80 break;
81 }
82
83 if (output.empty())
84 {
85 throw std::invalid_argument(std::string("No valid urls found for ") + std::string(filename));
86 }
87
88 return {output, extension};
89}