Skip to content

Instantly share code, notes, and snippets.

@hechen0
Created December 1, 2013 09:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hechen0/7730214 to your computer and use it in GitHub Desktop.
Save hechen0/7730214 to your computer and use it in GitHub Desktop.
First attempt at creating a web crawler with Node.js
var http = require('http'),
cheerio = require('cheerio');
// Seed URL the crawl starts from. Declared with `var` explicitly: the `;`
// ending the require list above means a bare assignment here would create
// an implicit global (and throw in strict mode).
var startUrl = 'http://www.renren.com';
// URLs to visit, in order.
var urls = [startUrl];
// Tag names whose occurrences are counted on every downloaded page.
var tags = ["div","h1","h2","h3","h4","h5","img","a"];
/**
 * Utility function that downloads a URL over HTTP and invokes `callback`
 * with the response body.
 *
 * @param {string} url - Address to fetch; passed straight to `http.get`.
 * @param {function} callback - Invoked exactly once: with the body text on
 *   success, or with `null` and the error object (as a second argument)
 *   when the request or the response stream fails.
 */
function download(url, callback){
  var finished = false;
  // Request and response errors can both fire (even after `end`), so only
  // the first outcome is reported to the caller.
  function finish(data, err){
    if(finished){
      return;
    }
    finished = true;
    if(err){
      callback(null, err);
    }else{
      callback(data);
    }
  }
  http.get(url, function(res){
    var data = "";
    // Decode inside the stream so a multi-byte character split across two
    // chunks is not corrupted by per-chunk Buffer-to-string conversion.
    res.setEncoding('utf8');
    res.on('data', function(chunk){
      data += chunk;
    });
    res.on('end', function(){
      finish(data);
    });
    res.on('error', function(err){
      finish(null, err);
    });
  }).on('error', function(err){
    finish(null, err);
  });
}
/**
 * Downloads `urls[counter]`, `urls[counter + 1]`, ... one at a time, loads
 * each page into cheerio and passes the result to `callback`.
 *
 * Stops (after logging "error") at the first URL whose download yields no
 * data.
 *
 * @param {string[]} urls - URLs to fetch, in order.
 * @param {number} counter - Index in `urls` to start from.
 * @param {function} callback - Called with the cheerio-loaded page for every
 *   successfully downloaded URL.
 */
function seriesDownloader(urls, counter, callback){
  // Nothing (left) to fetch: finish instead of requesting `undefined`.
  if(!urls[counter]){
    console.log("-----end of crawler------");
    return;
  }
  download(urls[counter], function(data){
    if(!data){
      console.log("error");
      return;
    }
    console.log("---------"+urls[counter]+"----------");
    callback(cheerio.load(data));
    // Grab the next URL, forwarding `callback` so later pages are handled too.
    seriesDownloader(urls, counter + 1, callback);
  });
}
/**
 * Tallies how many elements on the page match each name in `tags` and
 * prints the resulting name-to-count object.
 *
 * @param {function} $ - Selector function; `$(name).length` is read as the
 *   number of matches for `name`.
 */
function analy($){
  var counts = tags.reduce(function(tally, tagName){
    tally[tagName] = $(tagName).length;
    return tally;
  }, {});
  console.log(counts);
}
seriesDownloader(urls, 0, analy);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment