Skip to content

Instantly share code, notes, and snippets.

@GuillaumeLeclerc
Created February 23, 2015 15:13
Show Gist options
  • Save GuillaumeLeclerc/74658a6bb4eeb9fb3b16 to your computer and use it in GitHub Desktop.
Save GuillaumeLeclerc/74658a6bb4eeb9fb3b16 to your computer and use it in GitHub Desktop.
var async = require("async");
var cheerio = require("cheerio");
var request = require("request");
var uri = require("uri-js");
var _ = require("lodash");
var fs = require("fs");
// Crawler state: URLs that have already been handled, used as a set
// (key = URL string, value = true).
var done = {};

// Content-type fragments whose URLs get recorded in the output file.
var toSave = [
  "image",
  "application",
  "audio",
  "video"
];

// Content-type fragments whose bodies are downloaded and scanned for links.
var toParse = [
  "text/html"
];

// File that receives one JSON record per saved URL.
var output = "./out.data";
/**
 * Cheerio `.each()` callback: resolves the link stored in
 * `element.attribs[tag]` against the page it was found on and enqueues it.
 *
 * A link is skipped when the attribute is missing, when the resolved URL has
 * no scheme or a scheme that does not start with "http", or when the URL has
 * already been seen.
 *
 * @param {number} index - Position of the element in the selection (unused).
 * @param {Object} element - DOM node exposing an `attribs` map.
 * @param {string} currentTask - URL of the page being parsed (resolution base).
 * @param {string} tag - Name of the attribute holding the link ("src", "href").
 */
var pushNew = function(index, element, currentTask, tag) {
  var link = element && element.attribs ? element.attribs[tag] : undefined;
  if (typeof link !== "string") {
    // e.g. <a name="..."> or <img> without src: there is nothing to follow.
    return;
  }

  // Normalize before the lookup so the key has the same form as the one
  // handleQueue stores in `done`.
  var nextUrl = uri.normalize(uri.resolve(currentTask, link));
  var protocol = uri.parse(nextUrl).scheme;
  if (typeof protocol !== "string" || protocol.substring(0, 4) !== "http") {
    return;
  }
  if (Object.prototype.hasOwnProperty.call(done, nextUrl)) {
    return;
  }

  // Mark the URL as seen when it is enqueued (not only when it is processed),
  // otherwise every page linking to it queues it again while it is pending.
  done[nextUrl] = true;
  q.push(nextUrl);
};
/**
 * Tells whether the response's Content-Type header contains at least one of
 * the given fragments.
 *
 * The comparison ignores letter case, because media types are
 * case-insensitive ("Text/HTML" and "text/html" denote the same type).
 *
 * @param {Array<string>} possibilities - Fragments to look for, e.g. "image".
 * @param {Object} [response] - Response object exposing a `headers` map.
 * @returns {boolean} true when a fragment matches; false when there is no
 *   match or when the response or its Content-Type header is missing.
 */
var isContentType = function(possibilities, response) {
  var header = response && response.headers && response.headers["content-type"];
  if (!header) {
    return false;
  }

  var contentType = String(header).toLowerCase();
  return _.some(possibilities, function(element) {
    return contentType.indexOf(String(element).toLowerCase()) !== -1;
  });
};
/**
 * Queue worker: probes one URL with a HEAD request and, depending on the
 * reported content type, parses it for further links and/or records it.
 *
 * The URL is normalized and marked in `done` before the request is issued.
 * A failed HEAD request is logged and the task is completed without error so
 * a single unreachable URL does not stop the rest of the crawl.
 *
 * @param {string} task - URL to process.
 * @param {Function} callback - Node-style completion callback for the queue.
 */
var handleQueue = function(task, callback) {
  var url = uri.normalize(task);
  done[url] = true;
  console.log("requesting " + url);

  request({
    method : "HEAD",
    url : url
  }, function(error, response) {
    if (error) {
      // Previously the error was dropped silently; make the failure visible.
      console.error("HEAD request failed for " + url + ": " + (error.message || error));
      return callback(null, []);
    }

    var todo = [];
    if (isContentType(toParse, response)) {
      todo.push(_.partial(downloadAndParse, url));
    }
    if (isContentType(toSave, response)) {
      todo.push(_.partial(saveUrl, url));
    }
    async.parallel(todo, callback);
  });
};
/**
 * Downloads a page and hands every image source and anchor target found in
 * it to `pushNew`.
 *
 * A failed download (request error or no body) is logged and the task is
 * completed without error so the crawl carries on with the other URLs.
 *
 * @param {string} task - URL of the page to download.
 * @param {Function} callback - Node-style completion callback; always called
 *   with `null` as the error.
 */
var downloadAndParse = function(task, callback) {
  request(task, function(error, response, body) {
    if (error || body === undefined || body === null) {
      // Without a body there is nothing to parse; do not hand it to cheerio.
      console.error("download failed for " + task + ": " +
        (error ? (error.message || error) : "empty body"));
      return callback(null);
    }

    var dom = cheerio.load(body);
    // Attribute selectors skip elements that carry no link at all
    // (e.g. named anchors), which have nothing to resolve.
    dom("img[src]").each(function(index, element) {
      pushNew(index, element, task, "src");
    });
    dom("a[href]").each(function(index, element) {
      pushNew(index, element, task, "href");
    });
    callback(null);
  });
};
/**
 * Records a URL by appending one JSON line ({ url, host }) to the output file.
 *
 * @param {string} task - URL to record.
 * @param {Function} callback - Passed straight to `fs.appendFile`.
 */
var saveUrl = function(task, callback) {
  console.log("saving " + task);

  var record = {
    url : task,
    host : uri.parse(task).host
  };
  var line = JSON.stringify(record) + "\n";

  fs.appendFile(output, line, callback);
};
// Crawl driver: a work queue that runs handleQueue on up to
// `crawlConcurrency` URLs at a time, started from a single seed page.
var crawlConcurrency = 10;
var seedUrl = "http://www.epfl.ch";

var q = async.queue(handleQueue, crawlConcurrency);
q.push(seedUrl);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment