Skip to content

Instantly share code, notes, and snippets.

@JonathanMH JonathanMH/crawl.js
Created Dec 14, 2015

Embed
What would you like to do?
crawl a page a bunch of times
var fs = require('fs');
var async = require('async');
var request = require('request');
var cheerio = require('cheerio');
var _ = require('lodash');
// Page that is fetched on every crawl.
var url = 'http://programmingexcuses.com';
// Collects the excuse text scraped by each successful crawl; may contain
// duplicates, which are removed only when the final result is printed.
var excuses = [];
/**
 * Fetch the excuses page once and record the excuse it displays.
 *
 * On a 200 response the text of the `.wrapper center` element is appended
 * to the module-level `excuses` array.
 *
 * @param {function(Error=)} callback - Invoked exactly once per call: with
 *   no arguments on success, or with an Error when the request failed or
 *   the server answered with a status other than 200. Callers that ignore
 *   the argument keep working; they simply no longer stall on failures.
 */
module.exports.crawl = function (callback) {
  request(url, function (error, response, body) {
    console.log('crawling');

    // Always report back, otherwise a single failed request would leave
    // the caller's loop waiting forever.
    if (error) {
      callback(error);
      return;
    }
    if (response.statusCode !== 200) {
      callback(new Error('Unexpected status code ' + response.statusCode + ' from ' + url));
      return;
    }

    // Keep the cheerio handle local instead of leaking `$` as a global.
    var $ = cheerio.load(body);
    excuses.push($('.wrapper center').text());
    callback();
  });
};
// Driver: crawl the page 400 times, strictly one request at a time with a
// 100 ms pause after each, then print the distinct excuses collected.
var count = 0;
async.whilst(
  // Loop condition: keep going until 400 crawls have completed.
  function shouldContinue() {
    return count < 400;
  },
  // Loop body: run one crawl, then schedule the next iteration.
  function crawlOnce(next) {
    module.exports.crawl(function onCrawled() {
      count += 1;
      setTimeout(next, 100);
    });
  },
  // Completion: the same excuse is served many times, so de-duplicate
  // before printing.
  function onFinished(err) {
    console.log(_.unique(excuses));
  }
);
{
"name": "excuses",
"version": "1.0.0",
"description": "",
"main": "index.js",
"dependencies": {
"async": "^1.5.0",
"cheerio": "^0.19.0",
"lodash": "^3.10.1",
"request": "^2.67.0"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC"
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.