Trying to make a small Scrapy-like app with Node.js
var scrapy = require('./scrapy.js').scrapy,

    /**
     * Builds one ad record from a result-list row.
     *
     * Reads the inner HTML of the first link in the given cell and in each of
     * its next three sibling cells:
     *   cell 0 -> title (trailing whitespace removed)
     *   cell 1 -> address (<br> separators turned into commas)
     *   cell 2 -> rooms / floor / space, separated by <br>
     *   cell 3 -> type / build / price, separated by <br>
     *
     * A cell without a link, or with fewer <br>-separated parts than expected,
     * yields empty or undefined fields instead of throwing, so a single
     * malformed row does not abort the whole crawl.
     *
     * @param {Object} $td jQuery-style wrapper around the row's title cell.
     * @returns {Object} Record with title, address, rooms, floor, space, type,
     *     build and price.
     */
    getAdFromTD = function ($td) {
        // Inner HTML of the first link in a cell, or '' when there is none
        // (jQuery's .html() gives null/undefined for an empty selection).
        function linkHtml($cell) {
            var html = $cell.find('a').html();
            return typeof html === 'string' ? html : '';
        }

        // Link markup split on its <br> separators; also accepts the
        // <br/> and <br /> spellings a serializer may produce.
        function linkParts($cell) {
            return linkHtml($cell).split(/<br\s*\/?>/i);
        }

        // parts[index] with `pattern` stripped, or undefined if that part is absent.
        function clean(parts, index, pattern) {
            var part = parts[index];
            return typeof part === 'string' ? part.replace(pattern, '') : undefined;
        }

        var $cell = $td,
            ad = {
                title: linkHtml($cell).replace(/\s*$/g, '')
            },
            parts;

        $cell = $cell.next();
        ad.address = linkHtml($cell).replace(/<br\s*\/?>/gi, ',');

        // The original patterns used [\n|\s], which also matched a literal '|';
        // \s alone already covers newlines and is what was intended.
        $cell = $cell.next();
        parts = linkParts($cell);
        ad.rooms = clean(parts, 0, /\sRooms\s+/g);
        ad.floor = clean(parts, 1, /.\s Floor/g);
        ad.space = clean(parts, 2, /\s+$/g);

        $cell = $cell.next();
        parts = linkParts($cell);
        ad.type = clean(parts, 0, /\s+$/g);
        ad.build = clean(parts, 1, /\s+$/g);
        ad.price = parts[2];

        return ad;
    },

    // Every ad collected so far, across all crawled pages.
    ads = [];
// Crawl the Homegate result list, collecting each page's ads into `ads`.
scrapy({
    url: "http://www.homegate.ch/rent/apartment-and-house/region-zuerich/matching-list?a=default&tab=list&l=default&cid=1585974&ao=&am=Z%C3%BCrich&ep=1&ac=1.5&ad=2.0&incsubs=default&tid=1&fromItem=ctn-zh&ag=1000&ah=2000&be=",

    // Pagination: href of the page's "forward" link; no value is returned
    // when the page has no such link.
    getNextUrl: function ($) {
        return $('a.forward.iconLink').attr('href');
    },

    // Extract one ad per result row of the current page.
    filterPage: function ($) {
        $('#objectList tr').each(function () {
            var $td = $(this).find('.tdTitle'),
                ad;

            // Only rows with exactly one title cell are treated as ads.
            // `.length` replaces the deprecated `.size()`.
            if ($td.length === 1) {
                ad = getAdFromTD($td);
                ads.push(ad);
                console.log(ad);
            }
        });

        // Running total over all pages processed so far, not just this page.
        console.log('Found ' + ads.length);
    },

    done: function () {
        console.log('Done! found ' + ads.length + ' ads in total!');
    }
});
var jsdom = require('jsdom'),

    /**
     * Loads a page in jsdom with jQuery injected, hands it to the caller's
     * callbacks and follows pagination by calling itself for the next URL.
     *
     * NOTE: this relies on the legacy `jsdom.env()` entry point, which newer
     * jsdom releases (10 and later) no longer expose from `require('jsdom')`.
     *
     * @param {Object} conf
     * @param {string} conf.url Passed to jsdom as its `html` option.
     * @param {Function} [conf.getNextUrl] Called with the page's jQuery; a
     *     truthy return value is crawled as the next page.
     * @param {Function} [conf.filterPage] Called with the page's jQuery to
     *     extract data from the page.
     * @param {Function} [conf.done] Called once this page and every page
     *     reached from it have been processed.
     * @param {number} [counter=1] Initial count of outstanding completions;
     *     callers normally omit it.
     */
    scrapy = function (conf, counter) {
        var url = conf.url || null,
            getNextUrl = conf.getNextUrl || null,
            filterPage = conf.filterPage || null,
            finalDone = conf.done || null,
            // One completion is owed by this page itself; one more is added
            // for the next page when pagination continues.
            pending = counter || 1,
            finish = function () {
                pending--;
                if (pending === 0 && finalDone !== null) {
                    finalDone();
                }
            };

        jsdom.env({
            html: url,
            scripts: ['http://code.jquery.com/jquery-1.5.min.js'],
            done: function (errors, window) {
                var $,
                    nexturl;

                if (errors) {
                    console.log(errors);
                }

                // Without a window, or without jQuery on it, there is nothing
                // to scrape. Finish this page so completion still propagates
                // instead of throwing on `window.$`.
                if (!window || !window.$) {
                    console.log('Could not load ' + url);
                    finish();
                    return;
                }

                $ = window.$;

                // First of all try to scrape the next url available, so the
                // next page starts loading while this one is being filtered.
                if (getNextUrl) {
                    nexturl = getNextUrl($);
                    if (nexturl) {
                        // The next page reports back through `finish`, so
                        // account for it before starting it.
                        pending++;
                        scrapy({
                            url: nexturl,
                            getNextUrl: getNextUrl,
                            filterPage: filterPage,
                            done: finish
                        });
                    }
                }

                if (filterPage) {
                    filterPage($);
                } else {
                    console.log('No filterPage available');
                }

                finish();
            }
        });
    };

exports.scrapy = scrapy;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment