Minoru/urlextract.cpp

## urlextract.cpp
#include <curl/curl.h>
#include <cstddef>
#include <iostream>
#include <vector>
#include <cstring>

#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>

// Write callback handed to libcurl: appends the delivered chunk to a
// std::string accumulator.
//
// buffer - start of the received bytes
// size   - size of a single element
// nmemb  - number of elements available at buffer
// userp  - user pointer; cast to std::string* and used as the accumulator
//
// Returns size * nmemb, i.e. the number of bytes that were appended.
static size_t my_write_data(void *buffer, size_t size, size_t nmemb, void *userp) {
	const size_t chunk_len = size * nmemb;
	const char *bytes = static_cast<const char *>(buffer);
	auto *sink = static_cast<std::string *>(userp);

	sink->append(bytes, chunk_len);
	return chunk_len;
}

std::string retrieve_url(
		const std::string& url,
		void * cfgcont,
		const std::string& authinfo,
		const std::string* postdata)
{
	std::string buf;

	CURL * easyhandle = curl_easy_init();
	curl_easy_setopt(easyhandle, CURLOPT_URL, url.c_str());
	curl_easy_setopt(easyhandle, CURLOPT_WRITEFUNCTION, my_write_data);
	curl_easy_setopt(easyhandle, CURLOPT_WRITEDATA, &buf);

	if (postdata != nullptr) {
		curl_easy_setopt(easyhandle, CURLOPT_POST, 1);
		curl_easy_setopt(easyhandle, CURLOPT_POSTFIELDS, postdata->c_str());
	}

	curl_easy_perform(easyhandle);
	curl_easy_cleanup(easyhandle);

	return buf;
}

// match for: <link rel="alternate" type="application/rss+xml" title="..." href="https://..." />
std::vector<std::string> extract_rss_urls(std::string& url, std::string& html) {
	// The options will make the extractor quite permissive
	htmlDocPtr doc = htmlReadMemory(html.c_str(), html.size(), url.c_str(), NULL, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);

	std::vector<std::string> feeds;

	if (!doc) {
		// TODO: Better error handling, use LOGGER, ...
		std::cerr << "Failed to parse" << std::endl;
		return feeds;
	}

    /* Create xpath evaluation context */
    xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
    if(xpathCtx == NULL) {
		std::cerr << "Error: unable to create new XPath context" << std::endl;
        xmlFreeDoc(doc);
		return feeds;
    }

	const char *xpathExpr = "/html/head/link[@rel='alternate' and (@type='application/atom+xml' or @type='application/rss+xml')]";
	xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression((const xmlChar *) xpathExpr, xpathCtx);
    if(xpathObj == NULL) {
		std::cerr << "Error: unable to evaluate xpath expression \"" << xpathExpr << "\"" << std::endl;
        xmlXPathFreeContext(xpathCtx);
        xmlFreeDoc(doc);
		return feeds;
    }

	xmlNodeSetPtr nodes = xpathObj->nodesetval;
	for(unsigned int i = 0; i < nodes->nodeNr; i++) {
		xmlNode *cur = nodes->nodeTab[i];
		xmlChar *href = xmlGetProp(cur, (const xmlChar *) "href");
		if (href) {
			feeds.push_back(std::string((char *) href));
			xmlFree(href);
		}
	}

	xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
	xmlFreeDoc(doc);       // free document
    xmlCleanupParser();    // Free globals
    return feeds;

}

// Command-line entry point: downloads the page named by the first argument
// and prints one "Rel: <href>" line per feed link found in it.
//
// Returns 1 after printing a usage message to stderr when no URL argument is
// given, 0 otherwise.
int main(int argc, char const* argv[])
{
	if(argc < 2) {
		std::cerr << "usage: extract-url <url>" << std::endl;
		return 1;
	}
	std::string url  = argv[1];
	// No config container, no auth info, no POST body.
	std::string html = retrieve_url(url, nullptr, "", nullptr);
	// Bind by const reference so each URL is printed without being copied.
	for (const auto& rel : extract_rss_urls(url, html)) {
		std::cout << "Rel: " << rel << std::endl;
	}
	return 0;
}
	#include <curl/curl.h>
	#include <cstddef>
	#include <iostream>
	#include <vector>
	#include <cstring>

	#include <libxml/HTMLparser.h>
	#include <libxml/tree.h>
	#include <libxml/parser.h>
	#include <libxml/xpath.h>
	#include <libxml/xpathInternals.h>

	// Write callback handed to libcurl: appends the delivered chunk to a
	// std::string accumulator.
	//
	// buffer - start of the received bytes
	// size   - size of a single element
	// nmemb  - number of elements available at buffer
	// userp  - user pointer; cast to std::string* and used as the accumulator
	//
	// Returns size * nmemb, i.e. the number of bytes that were appended.
	static size_t my_write_data(void *buffer, size_t size, size_t nmemb, void *userp) {
		std::string * pbuf = static_cast<std::string *>(userp);
		pbuf->append(static_cast<const char *>(buffer), size * nmemb);
		return size * nmemb;
	}

	std::string retrieve_url(
	const std::string& url,
	void * cfgcont,
	const std::string& authinfo,
	const std::string* postdata)
	{
	std::string buf;

	CURL * easyhandle = curl_easy_init();
	curl_easy_setopt(easyhandle, CURLOPT_URL, url.c_str());
	curl_easy_setopt(easyhandle, CURLOPT_WRITEFUNCTION, my_write_data);
	curl_easy_setopt(easyhandle, CURLOPT_WRITEDATA, &buf);

	if (postdata != nullptr) {
	curl_easy_setopt(easyhandle, CURLOPT_POST, 1);
	curl_easy_setopt(easyhandle, CURLOPT_POSTFIELDS, postdata->c_str());
	}

	curl_easy_perform(easyhandle);
	curl_easy_cleanup(easyhandle);

	return buf;
	}

	// match for: <link rel="alternate" type="application/rss+xml" title="..." href="https://..." />
	std::vector<std::string> extract_rss_urls(std::string& url, std::string& html) {
	// The options will make the extractor quite permissive
	htmlDocPtr doc = htmlReadMemory(html.c_str(), html.size(), url.c_str(), NULL, HTML_PARSE_NOBLANKS \| HTML_PARSE_NOERROR \| HTML_PARSE_NOWARNING \| HTML_PARSE_NONET);

	std::vector<std::string> feeds;

	if (!doc) {
	// TODO: Better error handling, use LOGGER, ...
	std::cerr << "Failed to parse" << std::endl;
	return feeds;
	}

	/* Create xpath evaluation context */
	xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
	if(xpathCtx == NULL) {
	std::cerr << "Error: unable to create new XPath context" << std::endl;
	xmlFreeDoc(doc);
	return feeds;
	}

	const char *xpathExpr = "/html/head/link[@rel='alternate' and (@type='application/atom+xml' or @type='application/rss+xml')]";
	xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression((const xmlChar *) xpathExpr, xpathCtx);
	if(xpathObj == NULL) {
	std::cerr << "Error: unable to evaluate xpath expression \"" << xpathExpr << "\"" << std::endl;
	xmlXPathFreeContext(xpathCtx);
	xmlFreeDoc(doc);
	return feeds;
	}

	xmlNodeSetPtr nodes = xpathObj->nodesetval;
	for(unsigned int i = 0; i < nodes->nodeNr; i++) {
	xmlNode *cur = nodes->nodeTab[i];
	xmlChar href = xmlGetProp(cur, (const xmlChar ) "href");
	if (href) {
	feeds.push_back(std::string((char *) href));
	xmlFree(href);
	}
	}

	xmlXPathFreeObject(xpathObj);
	xmlXPathFreeContext(xpathCtx);
	xmlFreeDoc(doc); // free document
	xmlCleanupParser(); // Free globals
	return feeds;

	}

	// Command-line entry point: downloads the page named by the first argument
	// and prints one "Rel: <href>" line per feed link found in it.
	//
	// Returns 1 after printing a usage message to stderr when no URL argument is
	// given, 0 otherwise.
	int main(int argc, char const* argv[])
	{
		if(argc < 2) {
			std::cerr << "usage: extract-url <url>" << std::endl;
			return 1;
		}
		std::string url = argv[1];
		// No config container, no auth info, no POST body.
		std::string html = retrieve_url(url, nullptr, "", nullptr);
		// Bind by const reference so each URL is printed without being copied.
		for (const auto& rel : extract_rss_urls(url, html)) {
			std::cout << "Rel: " << rel << std::endl;
		}
		return 0;
	}