Skip to content

Instantly share code, notes, and snippets.

@Filirom1
Created November 18, 2011 17:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Filirom1/1377160 to your computer and use it in GitHub Desktop.
Save Filirom1/1377160 to your computer and use it in GitHub Desktop.
Scraping Wikipedia example
{
"author": "",
"name": "scrapping",
"version": "0.0.0",
"repository": {
"url": ""
},
"engines": {
"node": "~0.4.12"
},
"dependencies": {
"request": "2.2.x",
"cheerio": "0.2.x",
"colors": "0.5.x",
"underscore": "1.2.x",
"ent": "0.0.x"
},
"devDependencies": {}
}
request = require 'request'
jsdom = require 'jsdom'
cheerio = require 'cheerio'
sys = require 'sys'
colors = require 'colors'
_ = require 'underscore'
ent = require 'ent'
# Report a failure on stderr (in red) and terminate the process.
#
# err - an Error, a response-like object carrying `statusCode`, a plain
#       string, or a falsy value. Falsy values are ignored so the function
#       can be used directly as a node-style callback.
#
# The most specific detail available is printed: `message` first, then
# `statusCode`. The process exits with code -1 after logging.
handleError = (err) ->
  return unless err
  detail = err
  detail = detail.message if detail.message
  detail = detail.statusCode if detail.statusCode
  # `.red` is only available on strings, so coerce first; a numeric status
  # code (or any other non-string) would otherwise be logged as `undefined`.
  console.error 'Error', String(detail).red
  process.exit -1
# Request `url` and pass the loaded document to `cb`.
#
# url - address to fetch; the request goes through the proxy named by the
#       `http_proxy` environment variable when one is set.
# cb  - callback, defaulting to handleError when falsy. It is invoked as:
#         cb(err)          when the request itself fails,
#         cb(resp)         when the response status is not 200,
#         cb(null, $, url) otherwise, where $ is cheerio.load(body).
scrapp = (url, cb) ->
  done = cb or handleError
  console.log '>', url
  options =
    url: url
    proxy: process.env['http_proxy']
  request options, (err, resp, body) ->
    if err
      return done err
    unless resp.statusCode is 200
      return done resp
    done null, cheerio.load(body), url
# Entry point: load the list page, follow every site-relative link found in
# a table cell, and print the heading and "Elevation" text of each linked page.
scrapp 'http://en.wikipedia.org/wiki/List_of_towns_in_Vermont', (err, $, url) ->
  return handleError err if err

  # 'http://host/path'.split('/') yields ['http:', '', 'host', ...], so the
  # second element is skipped.
  [protocol, ignored, host] = url.split '/'
  baseUrl = "#{protocol}//#{host}"

  # Collect hrefs with .each rather than enumerating the selection object's
  # properties, and skip anchors that carry no href at all.
  hrefs = []
  $('td a').each (i, elem) ->
    href = elem.attribs?.href
    hrefs.push href if href
    return

  # Keep only site-relative paths ("/wiki/..."). Absolute URLs,
  # protocol-relative URLs ("//host/...") and fragments ("#note") cannot be
  # turned into a valid address by prefixing baseUrl.
  relative = _.filter hrefs, (href) -> href.charAt(0) is '/' and href.charAt(1) isnt '/'

  # The same page is often linked from several cells; fetch each one once.
  moreUrl = _.uniq(relative).map (href) -> baseUrl + href

  moreUrl.forEach (pageUrl) -> scrapp pageUrl, (err, $, url) ->
    # A failure on one page is logged without aborting the other requests.
    return console.error err if err
    elevation = ''
    # Take the text of the siblings of the cell mentioning "Elevation";
    # when several cells match, the last one wins.
    $('td').each (i, elem) ->
      elevation = $(elem).siblings().text() if $(elem).text().indexOf('Elevation') != -1
      return
    console.log name: $('#firstHeading').text(), elevation: ent.decode elevation
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment