Created
August 27, 2011 14:23
-
-
Save valotas/1175447 to your computer and use it in GitHub Desktop.
Trying to make a small Scrapy-like app with Node.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var scrapy = require('./scrapy.js').scrapy,
    // Every ad collected so far, across all result pages.
    ads = [];

/**
 * Returns the inner HTML of the first <a> inside the given cell.
 *
 * jQuery's .html() yields null/undefined when the selection is empty
 * (no anchor, or no such sibling cell); that case is mapped to '' so the
 * callers can keep using string methods without throwing.
 *
 * @param {Object} $cell jQuery selection for one table cell.
 * @returns {string} The anchor's HTML, or '' when there is none.
 */
var anchorHtml = function( $cell ) {
    var html = $cell.find('a').html();
    return html == null ? '' : html;
};

/**
 * Returns parts[index], or '' when the cell had fewer <br>-separated
 * parts than expected.
 *
 * @param {string[]} parts Result of splitting a cell's HTML on '<br>'.
 * @param {number} index Position of the wanted part.
 * @returns {string}
 */
var partAt = function( parts, index ) {
    return parts[index] === undefined ? '' : parts[index];
};

/**
 * Builds an ad record from a result row, starting at its title cell.
 *
 * Reads four consecutive cells, each through the HTML of its first <a>:
 *   1. the given cell    -> title
 *   2. next sibling      -> address (<br> turned into ',')
 *   3. next sibling      -> rooms / floor / space (split on <br>)
 *   4. next sibling      -> type / build / price  (split on <br>)
 *
 * Missing anchors, cells or <br> parts produce '' for the affected field
 * instead of throwing, so one malformed row cannot abort the whole crawl.
 * `price` is passed through untouched and is undefined when absent.
 *
 * @param {Object} $td jQuery selection of the row's title cell.
 * @returns {Object} Ad with title, address, rooms, floor, space, type,
 *                   build and price.
 */
var getAdFromTD = function( $td ) {
    var ad = {},
        $cell = $td,
        parts;

    // Strip trailing whitespace from the title.
    ad.title = anchorHtml( $cell ).replace( /\s*$/g, '' );

    $cell = $cell.next();
    ad.address = anchorHtml( $cell ).replace( /<br>/g, ',' );

    // NOTE: in the patterns below `[\n|\s]` is a character class, so the
    // `|` is a literal pipe and not an alternation. Kept as-is to avoid
    // changing what gets stripped from the scraped text.
    $cell = $cell.next();
    parts = anchorHtml( $cell ).split( '<br>' );
    ad.rooms = partAt( parts, 0 ).replace( /\sRooms[\n|\s]+/g, '' );
    ad.floor = partAt( parts, 1 ).replace( /.\s Floor/g, '' );
    ad.space = partAt( parts, 2 ).replace( /[\n|\s]+$/g, '' );

    $cell = $cell.next();
    parts = anchorHtml( $cell ).split( '<br>' );
    ad.type = partAt( parts, 0 ).replace( /[\n|\s]+$/g, '' );
    ad.build = partAt( parts, 1 ).replace( /[\n|\s]+$/g, '' );
    ad.price = parts[2];

    return ad;
};
// Crawl the homegate.ch rental result list (Zürich region, per the URL),
// following the pager from page to page and collecting one ad per row.
scrapy( {
    url: "http://www.homegate.ch/rent/apartment-and-house/region-zuerich/matching-list?a=default&tab=list&l=default&cid=1585974&ao=&am=Z%C3%BCrich&ep=1&ac=1.5&ad=2.0&incsubs=default&tid=1&fromItem=ctn-zh&ag=1000&ah=2000&be=",

    // The href of the "forward" pager link is the next page to scrape;
    // .attr() returns nothing when the link is absent, which ends the crawl.
    getNextUrl: function( $ ) {
        return $('a.forward.iconLink').attr( 'href' );
    },

    // Extract an ad from every row of the result table on this page.
    filterPage: function( $ ) {
        $('#objectList tr').each( function() {
            var $td = $( this ).find( '.tdTitle' ),
                ad;

            // Only rows with exactly one title cell are ads; skip the rest.
            // `.length` replaces the deprecated `.size()` (same value).
            if ( $td.length !== 1 ) {
                return;
            }

            ad = getAdFromTD( $td );
            ads.push( ad );
            console.log( ad );
        });
        // Running total across all pages processed so far.
        console.log( 'Found ' + ads.length);
    },

    // Invoked once by scrapy after the last page has been processed.
    done: function() {
        console.log( 'Done! found ' + ads.length + ' ads in total!');
    }
} );
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var jsdom = require( 'jsdom' );

/**
 * Loads `conf.url` with jsdom (jQuery injected), hands the page to
 * `conf.filterPage`, and recursively does the same for every page that
 * `conf.getNextUrl` points to.
 *
 * Completion is tracked with a pending counter: each call counts itself
 * plus the child call it spawns for the next page, and a child reports
 * back through the `done` callback it is given. `conf.done` therefore
 * fires once, after this page and every page reached from it are finished.
 *
 * @param {Object}   conf
 * @param {string}   conf.url          Page to load.
 * @param {Function} [conf.getNextUrl] Called with the page's `$`; returns
 *                                     the next URL, or a falsy value to stop.
 * @param {Function} [conf.filterPage] Called with the page's `$` to extract data.
 * @param {Function} [conf.done]       Called with no arguments when the
 *                                     whole chain has completed.
 * @param {number}   [counter=1]       Initial pending count. The only
 *                                     external caller in view omits it.
 */
var scrapy = function( conf, counter ) {
    var url = conf.url || null,
        getNextUrl = conf.getNextUrl || null,
        filterPage = conf.filterPage || null,
        finalDone = conf.done || null,
        pending = counter || 1,
        // Marks one unit of work as finished; fires finalDone on the last one.
        done = function() {
            pending--;
            if ( pending === 0 && finalDone !== null ) {
                finalDone();
            }
        };

    jsdom.env({
        html: url,
        scripts: [ 'http://code.jquery.com/jquery-1.5.min.js' ],
        done: function( errors, window ) {
            var $,
                nexturl,
                childCounter;

            if ( errors ) {
                console.log( errors );
            }

            // When loading fails there may be no window, or jQuery may not
            // have been injected. Bail out instead of throwing on `window.$`,
            // but still report completion so the done chain can finish.
            $ = window && window.$;
            if ( !$ ) {
                console.log( 'Could not load ' + url + ', skipping it' );
                done();
                return;
            }

            // First of all try to scrape the next url available
            if ( getNextUrl ) {
                nexturl = getNextUrl( $ );
                if ( nexturl ) {
                    // Same effect as passing `counter++`: the child starts
                    // with the count this call had so far (1 in normal use)
                    // and this call then waits for one more completion.
                    childCounter = pending;
                    pending++;
                    scrapy( {
                        url: nexturl,
                        getNextUrl: getNextUrl,
                        filterPage: filterPage,
                        done: done
                    }, childCounter );
                }
            }

            if ( filterPage ) {
                filterPage( $ );
            }
            else {
                console.log( 'No filterPage available' );
            }

            // This page itself is finished.
            done();
        }
    });
};

exports.scrapy = scrapy;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment