@oktal
Created August 25, 2013 21:13
A simple XML sitemap crawler using C++ Poco Libraries.
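// Fetches /sitemap.xml from the configured domain, extracts every <loc>
// entry, splits the resulting URL list into fixed-size chunks, and fetches
// each chunk from a dedicated thread, reporting the HTTP status of every URL.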
#include "Poco/DOM/DOMParser.h"
#include "Poco/DOM/Document.h"
#include "Poco/DOM/AutoPtr.h"
#include "Poco/SAX/InputSource.h"
#include "Poco/Net/HTTPClientSession.h"
#include "Poco/Net/HTTPRequest.h"
#include "Poco/Net/HTTPResponse.h"
#include "Poco/Net/DNS.h"
#include "Poco/DOM/ElementsByTagNameList.h"
#include <set>
#include <string>
#include <iostream>
#include <algorithm>
#include <tuple>
#include <cassert>
#include <thread>
#include <chrono>
#include <unistd.h>
using namespace std;
using namespace Poco::Net;
using namespace Poco::XML;
typedef vector<string> ChunkUrl;
const string DomainName = "www.domain.com";
const string SitemapPath = "/sitemap.xml";
const int Threads = 48;
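// Downloads the sitemap and returns the set of URLs found in its <loc> elements.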
set<string> parseSiteMap() {
    set<string> urls;

    HTTPClientSession session { DomainName };
    HTTPRequest request(HTTPRequest::HTTP_GET, SitemapPath);
    session.sendRequest(request);

    HTTPResponse response;
    auto &stream = session.receiveResponse(response);

    InputSource src { stream };
    DOMParser parser;
    AutoPtr<Document> dom = parser.parse(&src);

    // getElementsByTagName returns a NodeList that the caller must release,
    // so we hand it straight to an AutoPtr.
    AutoPtr<NodeList> list(dom->getElementsByTagName("loc"));

    const string http { "http://" };
    for (unsigned long index = 0; index < list->length(); ++index) {
        auto url = list->item(index)->innerText();
        // Looks like POCO can't handle a host prefixed with http://, so we
        // remove it from the URL.
        if (!url.compare(0, http.size(), http)) {
            url.erase(0, http.size());
        }
        urls.insert(move(url));
    }

    return urls;
}
// Splits a full URL into a (host, resource) pair. For example,
// www.mydomain.com/index.html splits into
// (www.mydomain.com, /index.html)
pair<string, string> extractUrlDomain(const string &url) {
    auto sepOffset = url.find('/');
    if (sepOffset == string::npos) {
        return make_pair(url, string { '/' });
    } else {
        auto domain = url.substr(0, sepOffset);
        auto page = url.substr(sepOffset);
        return make_pair(domain, page);
    }
}
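// Fetches every URL of the chunk sequentially, opening a fresh HTTP session
// per URL and printing the status of each request.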
void doRequests(ChunkUrl &&chunk) {
    for (const auto &url: chunk) {
        string domain, page;
        tie(domain, page) = extractUrlDomain(url);
        try {
            HTTPClientSession session(domain);
            HTTPRequest request(HTTPRequest::HTTP_GET, page);
            session.sendRequest(request);

            HTTPResponse response;
            session.receiveResponse(response);
            auto status = response.getStatus();
            if (status == HTTPResponse::HTTP_OK) {
                printf("Fetched URL '%s' [HTTP_OK]\n",
                       url.c_str());
            } else {
                printf("Fetched URL '%s' FAIL [%d]\n",
                       url.c_str(),
                       static_cast<int>(status));
            }
        } catch (const exception &e) {
            fprintf(stderr, "Failed to fetch URL '%s' [%s]\n",
                    url.c_str(), e.what());
        }
    }
}
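// Spawns one thread per chunk and waits for all of them to finish.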
void launchThreads(vector<ChunkUrl> &chunks) {
    // We do not want fewer chunks than
    // threads, otherwise boom.
    assert(chunks.size() >= static_cast<size_t>(Threads));

    vector<thread> threads;
    threads.reserve(Threads);

    // Let's create the threads. The chunks are taken by non-const
    // reference so their URLs can actually be moved into the threads.
    for (auto &chunk: chunks) {
        threads.push_back(thread(doRequests, move(chunk)));
    }

    // And let's now wait for all threads to finish
    for (auto &thr: threads) {
        thr.join();
    }
}
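// Splits the URL set into Threads chunks: Threads - 1 equal-sized chunks
// plus a last chunk holding whatever remains.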
vector<ChunkUrl> makeChunks(const set<string> &urls) {
    const size_t chunksSize = urls.size() / Threads;

    vector<ChunkUrl> chunks;
    chunks.reserve(Threads);

    auto chunkIt = begin(urls);
    // Let's first handle the equal-sized chunks
    for (size_t i = 0; i < Threads - 1; ++i) {
        ChunkUrl chunk;
        chunk.reserve(chunksSize);
        copy_n(chunkIt, chunksSize, back_inserter(chunk));
        chunks.push_back(move(chunk));
        advance(chunkIt, chunksSize);
    }

    // Let's now handle the last chunk
    ChunkUrl lastChunk;
    copy(chunkIt, end(urls), back_inserter(lastChunk));
    chunks.push_back(move(lastChunk));

    return chunks;
}
int main() {
    auto urls = parseSiteMap();
    auto chunks = makeChunks(urls);
    launchThreads(chunks);

    return 0;
}
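A minimal sketch of how this might be built, assuming the Poco Foundation, XML, and Net libraries are installed system-wide and the file is saved as crawler.cpp (a hypothetical name); set DomainName to the site you want to crawl before compiling:

g++ -std=c++11 -pthread crawler.cpp -o crawler -lPocoNet -lPocoXML -lPocoFoundation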