1 files changed, 89 insertions, 0 deletions
diff --git a/imagenet.cpp b/imagenet.cpp
new file mode 100644
index 0000000..3a107bd
--- /dev/null
+++ b/imagenet.cpp

@@ -0,0 +1,89 @@
+#include "imagenet.h"
+#include <fstream>
+#include <stdexcept>
+#include <sstream>
+#include <vector>
+#include <curl_easy.h>
+#include <curl_header.h>
+// Stores the given path in path_; getImageForNotion() looks up per-notion
+// URL list files beneath it.
+imagenet::imagenet(std::string path)
+  : path_(path)
+{
+}
+// Downloads a randomly chosen image for the given notion.
+//
+// The URL list is read from path_ / <notion_id>, one URL per line (empty
+// lines are skipped). URLs are then tried in a random order drawn from rng
+// until one answers with HTTP 200, a Content-Type starting with "image/" and
+// a non-empty body.
+//
+// Returns {response body, extension}, where the extension is the text after
+// the last '.' of the chosen URL (the whole URL if it contains no '.').
+//
+// Throws std::invalid_argument if the URL list file does not exist, or if no
+// URL in it yields an image.
+std::tuple<std::string, std::string> imagenet::getImageForNotion(int notion_id, std::mt19937& rng) const
+{
+  std::filesystem::path filename = path_ / std::to_string(notion_id);
+  if (!std::filesystem::exists(filename))
+  {
+    throw std::invalid_argument(std::string("File does not exist: ") + filename.string());
+  }
+
+  // Collect every non-empty line of the file as a candidate URL.
+  std::ifstream file(filename);
+  std::string line;
+  std::vector<std::string> urls;
+  while (std::getline(file, line))
+  {
+    if (!line.empty())
+    {
+      urls.push_back(line);
+    }
+  }
+
+  std::string output;
+  std::string extension;
+  while (!urls.empty())
+  {
+    // Pick one of the remaining URLs at random and drop it from the list so
+    // that each URL is tried at most once.
+    int index = std::uniform_int_distribution<int>(0, static_cast<int>(urls.size()) - 1)(rng);
+    std::string url = urls.at(index);
+    urls.erase(std::begin(urls) + index);
+
+    // willyfogg.com is a thumbnail generator known to return 200 even if the target image no longer exists
+    if (url.find("willyfogg.com/thumb.php") != std::string::npos)
+    {
+      continue;
+    }
+
+    // Accept string from Google Chrome
+    std::string accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
+    curl::curl_header headers;
+    headers.add(accept);
+
+    std::ostringstream imgbuf;
+    curl::curl_ios<std::ostringstream> imgios(imgbuf);
+    curl::curl_easy imghandle(imgios);
+
+    imghandle.add<CURLOPT_HTTPHEADER>(headers.get());
+    imghandle.add<CURLOPT_URL>(url.c_str());
+    imghandle.add<CURLOPT_CONNECTTIMEOUT>(30);
+    imghandle.add<CURLOPT_TIMEOUT>(300);
+
+    try
+    {
+      imghandle.perform();
+    } catch (const curl::curl_easy_exception& error) {
+      // A failed transfer only disqualifies this URL; keep trying the others.
+      error.print_traceback();
+      continue;
+    }
+
+    if (imghandle.get_info<CURLINFO_RESPONSE_CODE>().get() != 200)
+    {
+      continue;
+    }
+
+    // libcurl reports a null Content-Type when the server did not send one;
+    // constructing a std::string from a null pointer is undefined behaviour,
+    // so such a response is treated as "not an image".
+    const char* content_type = imghandle.get_info<CURLINFO_CONTENT_TYPE>().get();
+    if (content_type == nullptr || std::string(content_type).substr(0, 6) != "image/")
+    {
+      continue;
+    }
+
+    // A 200 response with an empty body is not a usable image: move on to the
+    // next URL instead of leaving the loop with nothing to return.
+    output = imgbuf.str();
+    if (output.empty())
+    {
+      continue;
+    }
+
+    extension = url.substr(url.rfind('.') + 1);
+    break;
+  }
+
+  if (output.empty())
+  {
+    throw std::invalid_argument(std::string("No valid urls found for ") + filename.string());
+  }
+
+  return {output, extension};
+}

diff --git a/imagenet.cpp b/imagenet.cpp
new file mode 100644
index 0000000..3a107bd
--- /dev/null
+++ b/imagenet.cpp
@@ -0,0 +1,89 @@
	1	#include "imagenet.h"
	2	#include <fstream>
	3	#include <stdexcept>
	4	#include <sstream>
	5	#include <vector>
	6	#include <curl_easy.h>
	7	#include <curl_header.h>
	8	// Stores the given path in path_; getImageForNotion() looks up per-notion URL list files beneath it.
	9	imagenet::imagenet(std::string path) : path_(path) {}
	10
	11	// Downloads a randomly chosen image for the given notion.
	12	//
	13	// The URL list is read from path_ / <notion_id>, one URL per line (empty
	14	// lines are skipped). URLs are then tried in a random order drawn from rng
	15	// until one answers with HTTP 200, a Content-Type starting with "image/" and
	16	// a non-empty body.
	17	//
	18	// Returns {response body, extension}, where the extension is the text after
	19	// the last '.' of the chosen URL (the whole URL if it contains no '.').
	20	//
	21	// Throws std::invalid_argument if the URL list file does not exist, or if no
	22	// URL in it yields an image.
	23	std::tuple<std::string, std::string> imagenet::getImageForNotion(int notion_id, std::mt19937& rng) const
	24	{
	25	  std::filesystem::path filename = path_ / std::to_string(notion_id);
	26	  if (!std::filesystem::exists(filename))
	27	  {
	28	    throw std::invalid_argument(std::string("File does not exist: ") + filename.string());
	29	  }
	30
	31	  // Collect every non-empty line of the file as a candidate URL.
	32	  std::ifstream file(filename);
	33	  std::string line;
	34	  std::vector<std::string> urls;
	35	  while (std::getline(file, line))
	36	  {
	37	    if (!line.empty())
	38	    {
	39	      urls.push_back(line);
	40	    }
	41	  }
	42
	43	  std::string output;
	44	  std::string extension;
	45	  while (!urls.empty())
	46	  {
	47	    // Pick one of the remaining URLs at random and drop it from the list so
	48	    // that each URL is tried at most once.
	49	    int index = std::uniform_int_distribution<int>(0, static_cast<int>(urls.size()) - 1)(rng);
	50	    std::string url = urls.at(index);
	51	    urls.erase(std::begin(urls) + index);
	52
	53	    // willyfogg.com is a thumbnail generator known to return 200 even if the target image no longer exists
	54	    if (url.find("willyfogg.com/thumb.php") != std::string::npos)
	55	    {
	56	      continue;
	57	    }
	58
	59	    // Accept string from Google Chrome
	60	    std::string accept = "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
	61	    curl::curl_header headers;
	62	    headers.add(accept);
	63
	64	    std::ostringstream imgbuf;
	65	    curl::curl_ios<std::ostringstream> imgios(imgbuf);
	66	    curl::curl_easy imghandle(imgios);
	67
	68	    imghandle.add<CURLOPT_HTTPHEADER>(headers.get());
	69	    imghandle.add<CURLOPT_URL>(url.c_str());
	70	    imghandle.add<CURLOPT_CONNECTTIMEOUT>(30);
	71	    imghandle.add<CURLOPT_TIMEOUT>(300);
	72
	73	    try
	74	    {
	75	      imghandle.perform();
	76	    } catch (const curl::curl_easy_exception& error) {
	77	      // A failed transfer only disqualifies this URL; keep trying the others.
	78	      error.print_traceback();
	79	      continue;
	80	    }
	81
	82	    if (imghandle.get_info<CURLINFO_RESPONSE_CODE>().get() != 200)
	83	    {
	84	      continue;
	85	    }
	86
	87	    // libcurl reports a null Content-Type when the server did not send one;
	88	    // constructing a std::string from a null pointer is undefined behaviour,
	89	    // so such a response is treated as "not an image".
	90	    const char* content_type = imghandle.get_info<CURLINFO_CONTENT_TYPE>().get();
	91	    if (content_type == nullptr || std::string(content_type).substr(0, 6) != "image/")
	92	    {
	93	      continue;
	94	    }
	95
	96	    // A 200 response with an empty body is not a usable image: move on to the
	97	    // next URL instead of leaving the loop with nothing to return.
	98	    output = imgbuf.str();
	99	    if (output.empty())
	100	    {
	101	      continue;
	102	    }
	103
	104	    extension = url.substr(url.rfind('.') + 1);
	105	    break;
	106	  }
	107
	108	  if (output.empty())
	109	  {
	110	    throw std::invalid_argument(std::string("No valid urls found for ") + filename.string());
	111	  }
	112
	113	  return {output, extension};
	114	}