1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
#include "imagenet.h"
#include <fstream>
#include <stdexcept>
#include <sstream>
#include <vector>
#include <curl_easy.h>
#include <curl_header.h>
imagenet::imagenet(std::string path) : path_(path) {}
std::tuple<std::string, std::string> imagenet::getImageForNotion(int notion_id, std::mt19937& rng) const
{
auto result = getImagesForNotion(notion_id, rng, 1);
return result[0];
}
std::vector<std::tuple<std::string, std::string>> imagenet::getImagesForNotion(int notion_id, std::mt19937& rng, int num) const
{
std::filesystem::path filename = path_ / std::to_string(notion_id);
if (!std::filesystem::exists(filename))
{
throw std::invalid_argument(std::string("File does not exist: ") + std::string(filename));
}
std::ifstream file(filename);
std::string line;
std::vector<std::string> urls;
while (std::getline(file, line))
{
if (!line.empty())
{
urls.push_back(line);
}
}
// output, extension
std::vector<std::tuple<std::string, std::string>> results;
while (!urls.empty() && results.size() < num)
{
int index = std::uniform_int_distribution<int>(0, urls.size()-1)(rng);
std::string url = urls.at(index);
urls.erase(std::begin(urls) + index);
// willyfogg.com is a thumbnail generator known to return 200 even if the target image no longer exists
if (url.find("willyfogg.com/thumb.php") != std::string::npos)
{
continue;
}
// Accept string from Google Chrome
std::string accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
curl::curl_header headers;
headers.add(accept);
std::ostringstream imgbuf;
curl::curl_ios<std::ostringstream> imgios(imgbuf);
curl::curl_easy imghandle(imgios);
imghandle.add<CURLOPT_HTTPHEADER>(headers.get());
imghandle.add<CURLOPT_URL>(url.c_str());
imghandle.add<CURLOPT_CONNECTTIMEOUT>(30);
imghandle.add<CURLOPT_TIMEOUT>(300);
try
{
imghandle.perform();
} catch (const curl::curl_easy_exception& error) {
error.print_traceback();
continue;
}
if (imghandle.get_info<CURLINFO_RESPONSE_CODE>().get() != 200)
{
continue;
}
std::string content_type = imghandle.get_info<CURLINFO_CONTENT_TYPE>().get();
if (content_type.substr(0, 6) != "image/")
{
continue;
}
results.emplace_back(imgbuf.str(), url.substr(url.rfind(".") + 1));
}
if (results.size() < num)
{
throw std::invalid_argument(std::string("Not enough valid urls found for ") + std::string(filename));
}
return results;
}
|