@matthewmorrone
Created December 27, 2014 06:37
my first attempt at a web crawler in node... obviously I have no idea how to use asynchronicity
var http = require("http");
var fs = require("fs");
var crawler = require("simplecrawler");
var cheerio = require("cheerio");
var util = require('util');
var exit = require('exit');
var log = console.log.bind(console);
function str(a) {return util.inspect(a, false, null);}
var url = "https://gist.github.com/matthewmorrone1?page=3";
crawler.initialProtocol = "https";
crawler.maxDepth = 1;
var out = 0;
crawler.crawl(url, function(queueItem, responseBuffer, response){
log(url);
if(out === 1) {
exit();
}
var $ = cheerio.load(responseBuffer);
var links = $("a").map(function() {
return [$(this).find("span strong").html(), +$(this).attr("href")];
}).filter(function() {
return this[0];
});
out++;
fs.writeFile("out.txt", /*str(queueItem)+"\n\n"+*/Array.prototype.join.call(links, "\n")/*+"\n\n"+str(response)*/, function(err) {
if(err) {
log(err);
} else {
log("The file was saved!");
}
});
}, function(queueItem) {
log("fail");
});
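
For the asynchronicity part: the shape simplecrawler seems to want is to accumulate results inside each fetchcomplete handler and do the single write when the crawler's "complete" event fires (it fires once the queue is drained). A rough sketch under those assumptions follows; the extra page URLs queued up front and the tab-separated output format are just placeholders, not anything the original code specified.

var fs = require("fs");
var Crawler = require("simplecrawler");
var cheerio = require("cheerio");

var crawler = new Crawler("https://gist.github.com/matthewmorrone1?page=1");
crawler.maxDepth = 1;

// extra listing pages, queued before the crawl starts (placeholder URLs)
crawler.queueURL("https://gist.github.com/matthewmorrone1?page=2");
crawler.queueURL("https://gist.github.com/matthewmorrone1?page=3");

var allLinks = []; // accumulate across fetches instead of overwriting out.txt each time

crawler.on("fetchcomplete", function(queueItem, responseBuffer) {
    var $ = cheerio.load(responseBuffer.toString());
    $("a").each(function() {
        var title = $(this).find("span strong").html();
        if (title) {
            allLinks.push(title + "\t" + $(this).attr("href"));
        }
    });
});

// "complete" fires once the whole queue is exhausted -- write the file exactly once here
crawler.on("complete", function() {
    fs.writeFile("out.txt", allLinks.join("\n"), function(err) {
        if (err) throw err;
        console.log("saved " + allLinks.length + " links");
    });
});

crawler.start();

The point of the pattern is that no counter or forced exit() is needed: the library tells you when everything asynchronous has finished, and the file write happens in response to that event rather than racing against the remaining fetches.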