about summary refs log tree commit diff stats
path: root/imagenet.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'imagenet.cpp')
-rw-r--r--imagenet.cpp93
1 files changed, 93 insertions, 0 deletions
diff --git a/imagenet.cpp b/imagenet.cpp new file mode 100644 index 0000000..d5aeada --- /dev/null +++ b/imagenet.cpp
@@ -0,0 +1,93 @@
1#include "imagenet.h"
2#include <fstream>
3#include <stdexcept>
4#include <sstream>
5#include <vector>
6#include <curl_easy.h>
7#include <curl_header.h>
8
9imagenet::imagenet(std::string path) : path_(path) {}
10
11std::tuple<std::string, std::string> imagenet::getImageForNotion(int notion_id, std::mt19937& rng) const
12{
13 auto result = getImagesForNotion(notion_id, rng, 1);
14 return result[0];
15}
16
17std::vector<std::tuple<std::string, std::string>> imagenet::getImagesForNotion(int notion_id, std::mt19937& rng, int num) const
18{
19 std::filesystem::path filename = path_ / std::to_string(notion_id);
20 if (!std::filesystem::exists(filename))
21 {
22 throw std::invalid_argument(std::string("File does not exist: ") + std::string(filename));
23 }
24
25 std::ifstream file(filename);
26 std::string line;
27 std::vector<std::string> urls;
28 while (std::getline(file, line))
29 {
30 if (!line.empty())
31 {
32 urls.push_back(line);
33 }
34 }
35
36 // output, extension
37 std::vector<std::tuple<std::string, std::string>> results;
38 while (!urls.empty() && results.size() < num)
39 {
40 int index = std::uniform_int_distribution<int>(0, urls.size()-1)(rng);
41 std::string url = urls.at(index);
42 urls.erase(std::begin(urls) + index);
43
44 // willyfogg.com is a thumbnail generator known to return 200 even if the target image no longer exists
45 if (url.find("willyfogg.com/thumb.php") != std::string::npos)
46 {
47 continue;
48 }
49
50 // Accept string from Google Chrome
51 std::string accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
52 curl::curl_header headers;
53 headers.add(accept);
54
55 std::ostringstream imgbuf;
56 curl::curl_ios<std::ostringstream> imgios(imgbuf);
57 curl::curl_easy imghandle(imgios);
58
59 imghandle.add<CURLOPT_HTTPHEADER>(headers.get());
60 imghandle.add<CURLOPT_URL>(url.c_str());
61 imghandle.add<CURLOPT_CONNECTTIMEOUT>(30);
62 imghandle.add<CURLOPT_TIMEOUT>(300);
63
64 try
65 {
66 imghandle.perform();
67 } catch (const curl::curl_easy_exception& error) {
68 error.print_traceback();
69
70 continue;
71 }
72
73 if (imghandle.get_info<CURLINFO_RESPONSE_CODE>().get() != 200)
74 {
75 continue;
76 }
77
78 std::string content_type = imghandle.get_info<CURLINFO_CONTENT_TYPE>().get();
79 if (content_type.substr(0, 6) != "image/")
80 {
81 continue;
82 }
83
84 results.emplace_back(imgbuf.str(), url.substr(url.rfind(".") + 1));
85 }
86
87 if (results.size() < num)
88 {
89 throw std::invalid_argument(std::string("Not enough valid urls found for ") + std::string(filename));
90 }
91
92 return results;
93}