A scraper based on Node.js, written in CoffeeScript and using the cheerio
and request
packages.
This is an equivalent of the pjscrape example, but much faster: http://nrabinowitz.github.com/pjscrape/#overview
A scraper based on Node.js, written in CoffeeScript and using the cheerio
and request
packages.
This is an equivalent of the pjscrape example, but much faster: http://nrabinowitz.github.com/pjscrape/#overview
{
  "author": "",
  "name": "scrapping",
  "version": "0.0.0",
  "repository": {
    "url": ""
  },
  "engines": {
    "node": "~0.4.12"
  },
  "dependencies": {
    "request": "2.2.x",
    "cheerio": "0.2.x",
    "colors": "0.5.x",
    "underscore": "1.2.x",
    "ent": "0.0.x"
  },
  "devDependencies": {}
}
request = require 'request' | |
jsdom = require 'jsdom' | |
cheerio = require 'cheerio' | |
sys = require 'sys' | |
colors = require 'colors' | |
_ = require 'underscore' | |
ent = require 'ent' | |
# Report a failure on stderr and terminate the process.
#
# err - a falsy value (nothing happens), an object carrying `message`
#       (e.g. an Error), an object carrying `statusCode` (scrapp passes the
#       HTTP response here for non-200 replies), or any other value.
#
# Also serves as scrapp's default callback; on success it is called with a
# falsy first argument and therefore does nothing.
handleError = (err) ->
  if err
    err = err.message if err.message
    err = err.statusCode if err.statusCode
    # `.red` is supplied by the colors package on strings only. A numeric
    # status code has no such property and used to be logged as `undefined`,
    # so coerce to a string before colouring.
    console.error 'Error', String(err).red
    process.exit -1
# Download `url` and hand the parsed document to `cb`.
#
# url - address of the page to fetch.
# cb  - callback invoked as cb(err) on a transport error, cb(response) when
#       the HTTP status is not 200, or cb(null, $, url) on success, where `$`
#       is the result of cheerio.load on the response body.
#       Falls back to handleError when omitted.
scrapp = (url, cb) ->
  cb = handleError unless cb
  console.log '>', url
  # The proxy is taken from the environment; it is undefined when unset.
  options =
    url: url
    proxy: process.env['http_proxy']
  request options, (error, response, html) ->
    if error
      return cb error
    unless response.statusCode is 200
      return cb response
    cb null, cheerio.load(html), url
# Entry point: fetch the list of Vermont towns, follow every relative link
# found inside a table cell, and log the heading and elevation of each
# linked page.
scrapp 'http://en.wikipedia.org/wiki/List_of_towns_in_Vermont', (err, $, url)->
  return handleError err if err
  # 'http://host/path'.split('/') gives ['http:', '', 'host', ...]; the empty
  # second element is captured in `n` and ignored.
  [protocol, n, host] = url.split('/')
  baseUrl = "#{protocol}//#{host}"
  # Anchors without an href (e.g. named anchors) produce undefined here.
  moreUrl = _.values($('td a')).map (it)-> it?.attribs?.href
  # Keep only string hrefs that do not contain 'http'. The type check stops
  # the `.indexOf` call from throwing a TypeError on a missing href.
  moreUrl = _.filter moreUrl, (it)-> typeof it is 'string' and it.indexOf('http') is -1
  moreUrl = moreUrl.map (it)-> baseUrl + it
  moreUrl.forEach (url)-> scrapp url, (err, $, url)->
    # A failure on one linked page is logged without aborting the others.
    return console.error err if err
    elevation = ''
    # The cell mentioning 'Elevation' is the label; its siblings hold the
    # value. If several cells match, the last one wins.
    $('td').each (i, elem)->
      elevation = $(elem).siblings().text() if $(elem).text().indexOf('Elevation') != -1
    console.log name: $('#firstHeading').text(), elevation: ent.decode elevation