Skip to content

Instantly share code, notes, and snippets.

@Minoru
Created November 11, 2017 11:00
Show Gist options
  • Save Minoru/3e6c56d73f61b140af3d63546de0b2e7 to your computer and use it in GitHub Desktop.
Save Minoru/3e6c56d73f61b140af3d63546de0b2e7 to your computer and use it in GitHub Desktop.
Extract <link rel="alternate"> feeds from a webpage (by @noctux; https://paste.xinu.at/RRug/)
#include <curl/curl.h>
#include <cstddef>
#include <iostream>
#include <vector>
#include <cstring>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
// libcurl write callback.
//
// `userp` must point to the std::string that accumulates the response; the
// `size * nmemb` bytes starting at `buffer` are appended to it. Returns the
// number of bytes consumed, which is always the full chunk.
static size_t my_write_data(void *buffer, size_t size, size_t nmemb, void *userp) {
    const size_t chunk_bytes = size * nmemb;
    const char *chunk = static_cast<const char *>(buffer);
    auto *sink = static_cast<std::string *>(userp);
    sink->append(chunk, chunk_bytes);
    return chunk_bytes;
}
std::string retrieve_url(
const std::string& url,
void * cfgcont,
const std::string& authinfo,
const std::string* postdata)
{
std::string buf;
CURL * easyhandle = curl_easy_init();
curl_easy_setopt(easyhandle, CURLOPT_URL, url.c_str());
curl_easy_setopt(easyhandle, CURLOPT_WRITEFUNCTION, my_write_data);
curl_easy_setopt(easyhandle, CURLOPT_WRITEDATA, &buf);
if (postdata != nullptr) {
curl_easy_setopt(easyhandle, CURLOPT_POST, 1);
curl_easy_setopt(easyhandle, CURLOPT_POSTFIELDS, postdata->c_str());
}
curl_easy_perform(easyhandle);
curl_easy_cleanup(easyhandle);
return buf;
}
// match for: <link rel="alternate" type="application/rss+xml" title="..." href="https://..." />
std::vector<std::string> extract_rss_urls(std::string& url, std::string& html) {
// The options will make the extractor quite permissive
htmlDocPtr doc = htmlReadMemory(html.c_str(), html.size(), url.c_str(), NULL, HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET);
std::vector<std::string> feeds;
if (!doc) {
// TODO: Better error handling, use LOGGER, ...
std::cerr << "Failed to parse" << std::endl;
return feeds;
}
/* Create xpath evaluation context */
xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
if(xpathCtx == NULL) {
std::cerr << "Error: unable to create new XPath context" << std::endl;
xmlFreeDoc(doc);
return feeds;
}
const char *xpathExpr = "/html/head/link[@rel='alternate' and (@type='application/atom+xml' or @type='application/rss+xml')]";
xmlXPathObjectPtr xpathObj = xmlXPathEvalExpression((const xmlChar *) xpathExpr, xpathCtx);
if(xpathObj == NULL) {
std::cerr << "Error: unable to evaluate xpath expression \"" << xpathExpr << "\"" << std::endl;
xmlXPathFreeContext(xpathCtx);
xmlFreeDoc(doc);
return feeds;
}
xmlNodeSetPtr nodes = xpathObj->nodesetval;
for(unsigned int i = 0; i < nodes->nodeNr; i++) {
xmlNode *cur = nodes->nodeTab[i];
xmlChar *href = xmlGetProp(cur, (const xmlChar *) "href");
if (href) {
feeds.push_back(std::string((char *) href));
xmlFree(href);
}
}
xmlXPathFreeObject(xpathObj);
xmlXPathFreeContext(xpathCtx);
xmlFreeDoc(doc); // free document
xmlCleanupParser(); // Free globals
return feeds;
}
// Entry point: downloads the page named by the first command-line argument and
// prints one "Rel: <href>" line to stdout for every feed link found in it.
//
// Returns 1 (after printing a usage message to stderr) when no URL is given,
// 0 otherwise.
int main(int argc, char const* argv[])
{
    if (argc < 2) {
        std::cerr << "usage: extract-url <url>" << '\n';
        return 1;
    }

    std::string url = argv[1];
    std::string html = retrieve_url(url, nullptr, "", nullptr);

    // Bind by const reference: iterating with plain `auto` copied every URL
    // just to print it. The temporary vector lives for the whole loop.
    for (const auto& rel : extract_rss_urls(url, html)) {
        std::cout << "Rel: " << rel << '\n';
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment