Created
February 8, 2019 21:48
-
-
Save oookoook/afb6a819dca3ed14be7b58bc6cb0e8bd to your computer and use it in GitHub Desktop.
Apify crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Apify SDK: actor entry point, API client and key-value stores.
const Apify = require('apify');
// Strips diacritics so location names can be used as key-value store keys.
const accents = require('remove-accents');
/**
 * Merges one offer into the aggregated price record of a location and
 * recomputes the location's average price per square metre.
 *
 * The offer is stored under an id derived from the last path segment of its
 * URL, so re-processing the same offer overwrites its previous entry.
 *
 * @param {?Object} location - Existing record (`{ offers, avg }`), or a falsy
 *     value when nothing has been stored for this location yet.
 * @param {Object} offer - Offer to merge; reads `url`, `area`, `price` and
 *     `pricePerMeter`.
 * @param {string} [type] - Location level. Accepted for call-site symmetry;
 *     not used by the computation.
 * @returns {?Object} The updated record. When the offer has neither a unit
 *     price nor both price and area, `location` is returned untouched.
 */
function updateLocation(location, offer, type) {
    // Prefer the advertised unit price; otherwise derive it from price / area.
    let pricePerMeter = offer.pricePerMeter;
    if (!pricePerMeter && offer.price && offer.area) {
        pricePerMeter = offer.price / offer.area;
    }

    // Nothing usable (missing, zero or NaN values): skip the offer so that it
    // cannot turn the location average into NaN.
    if (!pricePerMeter) {
        console.log('ignored - insufficient data');
        return location;
    }

    const record = location || { offers: {} };

    // The last part of the URL identifies the offer.
    const id = 'o' + offer.url.split('/').slice(-1)[0];
    record.offers[id] = {
        area: offer.area,
        price: offer.price,
        pricePerMeter: pricePerMeter,
        updated: new Date()
    };

    const stored = Object.values(record.offers);
    const sum = stored.reduce((total, item) => total + item.pricePerMeter, 0);
    record.avg = sum / stored.length;

    return record;
}
// Actor entry point: loads the results of a finished crawler execution and
// merges every scraped offer into the 'priceMap' key-value store.
Apify.main(async () => {
    // The input is a finish trigger from the crawler; its `_id` is used as the
    // execution id.
    const input = await Apify.getValue('INPUT');
    if (!input || !input._id) {
        throw new Error('INPUT must contain the crawler execution id in "_id".');
    }

    const crawlResults = await Apify.client.crawlers.getExecutionResults({ executionId: input._id });

    // All location levels currently share one store. Separate stores
    // ('priceMap-county', 'priceMap-city', ...) can be plugged in per level here.
    const commonStore = await Apify.openKeyValueStore('priceMap');
    const stores = {
        county: commonStore,
        city: commonStore,
        district: commonStore,
        street: commonStore
    };

    // Plain loops instead of forEach: the offers have to be awaited one by one.
    const pages = crawlResults.items;
    const total = pages.length;
    for (let i = 0; i < total; i++) {
        console.log('Processed ' + i + ' from ' + total + ' pages.');

        // A page that failed or timed out carries no array of offers (the page
        // function finishes with a message string on timeout), so skip it.
        const offers = pages[i].pageFunctionResult;
        if (!Array.isArray(offers)) {
            console.log('Skipped page ' + i + ' - no offers extracted.');
            continue;
        }

        for (const offer of offers) {
            await parseOffer(offer, stores);
        }
    }
    console.log('Keystore updated.');
});
/**
 * Merges an offer into the stored price record of its location.
 *
 * Called without `type`, it processes the offer for every location level
 * (county, city, district, street) one after another. Called with `type`, it
 * updates only the record of that level.
 *
 * @param {Object} offer - Scraped offer with a `location` object.
 * @param {Object} stores - Maps a level name to a store exposing async
 *     `getValue(key)` and `setValue(key, value)`.
 * @param {string} [type] - Location level to update; all levels when omitted.
 * @returns {Promise<void>}
 */
async function parseOffer(offer, stores, type) {
    // Anything that is not an offer object with a location cannot be keyed.
    if (!offer || typeof offer !== 'object' || !offer.location) {
        return;
    }

    if (!type) {
        // Sequential on purpose: the levels may share one store.
        for (const level of ['county', 'city', 'district', 'street']) {
            await parseOffer(offer, stores, level);
        }
        return;
    }

    // No id means the offer has no value for this location level.
    const id = getId(offer, type);
    if (!id) {
        return;
    }

    const store = stores[type];
    const current = await store.getValue(id);
    await store.setValue(id, updateLocation(current, offer, type));
}
/**
 * Builds the key-value store key of an offer's location at the given level.
 *
 * The key is the chain of location names from the county down to the
 * requested level, joined by '!', with accents removed and spaces replaced by
 * '_'. A missing ancestor is kept as an empty segment (e.g. 'County!City!!Street')
 * so that equally named places under different parents get different keys.
 *
 * @param {Object} offer - Offer with a `location` object holding optional
 *     `county`, `city`, `district` and `street` names.
 * @param {(number|string)} level - 0-3 or 'county' | 'city' | 'district' | 'street'.
 * @returns {string} The key, or '' when the offer has no name for the
 *     requested level or the level is not recognised.
 */
function getId(offer, level) {
    const delimiter = '!';
    const spaceRep = '_';
    const levels = ['county', 'city', 'district', 'street'];

    const depth = typeof level === 'number' ? level : levels.indexOf(level);
    const location = offer && offer.location;
    if (!location || !Number.isInteger(depth) || depth < 0 || depth >= levels.length) {
        return '';
    }

    // Without a name at the requested level there is nothing to key.
    if (!location[levels[depth]]) {
        return '';
    }

    const segments = levels
        .slice(0, depth + 1)
        .map((name) => location[name] || '');

    return accents.remove(segments.join(delimiter)).replace(/ /g, spaceRep);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// start urls: https://www.sreality.cz/hledani/prodej/pozemky/stavebni-parcely?strana=1&bez-aukce=1
// pseudo-URLs: https://www.sreality.cz/hledani/prodej/pozemky/stavebni-parcely?strana=[\d*]&bez-aukce=1
/**
 * Crawler page function: called on every page the crawler visits. Waits until
 * the offer tiles ('div.info') are rendered, then finishes with an array of
 * `{ url, area, price, pricePerMeter, location }` objects, one per offer.
 * If no offer shows up within 10 seconds it finishes with a message string.
 *
 * Helpers are nested and the code sticks to ES5 (`var`, function expressions,
 * as in the original) so the function stays self-contained.
 *
 * @param {Object} context - Crawler context; uses `jQuery`, `finish`,
 *     `willFinishLater` and `saveSnapshot`.
 */
function pageFunction(context) {
    var $ = context.jQuery;
    var startedAt = Date.now();
    var TIMEOUT_MS = 10000;
    var POLL_MS = 500;

    // Parses a number out of a label such as "1 234 567 Kč": drops every kind
    // of whitespace (incl. non-breaking spaces used as thousands separators),
    // the given noise patterns and thousands dots. Returns NaN when no number
    // is left at the start of the text.
    var parseAmount = function(text, noisePatterns) {
        var cleaned = text.replace(/[\s\u00a0]/g, '');
        noisePatterns.forEach(function(pattern) {
            cleaned = cleaned.replace(pattern, '');
        });
        return parseFloat(cleaned.replace(/\./g, ''));
    };

    // Splits a locality label on ' - ' and ', ' and sorts the parts into
    // street / district / county by their Czech keywords; a part without a
    // keyword is taken as the city.
    var parseLocality = function(text) {
        var parsed = {};
        text.split(/ - |, /).forEach(function(part) {
            if (part.indexOf('ulice') > -1) {
                parsed.street = part.replace(/ulice/g, '').trim();
            } else if (part.indexOf('část obce') > -1) {
                parsed.district = part.replace(/část obce/g, '').trim();
            } else if (part.indexOf('okres') > -1) {
                parsed.county = part.replace(/okres/g, '').trim();
            } else {
                parsed.city = part.trim();
            }
        });
        return parsed;
    };

    var extractData = function() {
        // timeout after 10 seconds
        if (Date.now() - startedAt > TIMEOUT_MS) {
            context.finish("Timed out before offers was loaded");
            return;
        }

        var offers = $('div.info').toArray();
        // if the offers still haven't been loaded, wait a little more
        if (offers.length === 0) {
            setTimeout(extractData, POLL_MS);
            return;
        }

        // refresh page screenshot and HTML for debugging
        context.saveSnapshot();

        var result = [];
        offers.forEach(function(element) {
            var $offer = $(element);

            // A tile without a link cannot be identified later (offers are
            // keyed by the last segment of their URL), so leave it out.
            var href = $offer.find('a.title').attr('href');
            if (!href) {
                return;
            }

            result.push({
                url: 'https://www.sreality.cz' + href,
                area: parseAmount($offer.find('span.name').text(),
                    [/Prodejstavebníhopozemku/g, /m²/g]),
                price: parseAmount($offer.find('span.norm-price').text(),
                    [/Kč/g]),
                pricePerMeter: parseAmount($offer.find('span.alt-price').text(),
                    [/Kčzam²\)/g, /\(/g]),
                location: parseLocality($offer.find('span.locality').text())
            });
        });

        context.finish(result);
    };

    // tell the crawler that pageFunction will finish asynchronously
    context.willFinishLater();
    extractData();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment