Skip to content

Instantly share code, notes, and snippets.

@oookoook
Created February 8, 2019 21:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save oookoook/afb6a819dca3ed14be7b58bc6cb0e8bd to your computer and use it in GitHub Desktop.
Save oookoook/afb6a819dca3ed14be7b58bc6cb0e8bd to your computer and use it in GitHub Desktop.
Apify crawler
const Apify = require('apify');
const accents = require('remove-accents');
/**
 * Adds (or refreshes) one offer inside a location record and recomputes the
 * record's average price per square meter.
 *
 * An offer is usable when it carries a truthy `pricePerMeter`, or when both
 * `price` and `area` are truthy so the price per meter can be derived.
 * Anything else (missing values, NaN from failed parsing, zero area) is
 * logged and skipped, so it cannot poison `avg` with NaN/Infinity.
 *
 * @param {?{offers: Object, avg: (number|undefined)}} location - Existing
 *   record, or a falsy value when none exists yet (a new record is created).
 * @param {{url: string, price: number, area: number, pricePerMeter: number}} offer
 *   Offer to record. It is not modified.
 * @param {string} type - Location level; currently unused by this function.
 * @returns {{offers: Object, avg: (number|undefined)}} The updated record
 *   (the same object as `location` when one was passed in). `avg` is only
 *   (re)computed when an offer was actually recorded.
 */
function updateLocation(location, offer, type) {
  const record = location || { offers: {} };
  if (!record.offers) {
    // Tolerate a stored record that lacks the offers map.
    record.offers = {};
  }

  let pricePerMeter = offer.pricePerMeter;
  if (!pricePerMeter) {
    if (!(offer.price && offer.area)) {
      console.log('ignored - insufficient data');
      return record;
    }
    pricePerMeter = offer.price / offer.area;
  }

  // The offer key is the last path segment of the offer URL, prefixed with 'o'.
  const id = 'o' + offer.url.split('/').slice(-1)[0];
  record.offers[id] = {
    area: offer.area,
    price: offer.price,
    pricePerMeter: pricePerMeter,
    updated: new Date(),
  };

  // Nothing in this function removes old offers, so the average covers every
  // offer present in the record.
  const recorded = Object.values(record.offers);
  const sum = recorded.reduce((total, item) => total + item.pricePerMeter, 0);
  record.avg = sum / recorded.length;
  return record;
}
// Actor entry point: loads the results of a finished crawler execution and
// folds every scraped offer into the shared price-map key-value store.
Apify.main(async () => {
  // The actor input is the finish trigger sent by the crawler; its _id is
  // used as the id of the execution whose results are fetched.
  const input = await Apify.getValue('INPUT');
  const crawlResults = await Apify.client.crawlers.getExecutionResults({ executionId: input._id });

  // Every location level currently shares one store. The per-level map is
  // kept so parseOffer can look the store up by level name.
  const commonStore = await Apify.openKeyValueStore('priceMap');
  const stores = {
    county: commonStore,
    city: commonStore,
    district: commonStore,
    street: commonStore,
  };

  // Plain loops are used on purpose: Array.prototype.forEach does not wait
  // for async callbacks, and offers are stored strictly one after another.
  const total = crawlResults.items.length;
  for (let i = 0; i < total; i++) {
    console.log('Processed ' + i + ' from ' + total + ' pages.');
    const offers = crawlResults.items[i].pageFunctionResult;
    // pageFunction in this file finishes with a string when it times out, so
    // the result is not always an array of offers; skip anything else.
    if (!Array.isArray(offers)) {
      console.log('Skipped page ' + i + ' - no offers in page result.');
      continue;
    }
    for (const offer of offers) {
      await parseOffer(offer, stores);
    }
  }
  console.log('Keystore updated.');
});
/**
 * Records one offer in the price map.
 *
 * Called without `type`, it processes the offer once per location level
 * (county, city, district, street), one level after another. Called with a
 * `type`, it loads the record stored under the offer's id for that level,
 * updates it via updateLocation and writes it back.
 *
 * @param {Object} offer - Scraped offer; passed on to getId and updateLocation.
 * @param {Object} stores - Map from level name to a store exposing async
 *   getValue(key) and setValue(key, value).
 * @param {string} [type] - Location level to process; falsy means all levels.
 * @returns {Promise<void>}
 */
async function parseOffer(offer, stores, type) {
  if (!type) {
    for (const level of ['county', 'city', 'district', 'street']) {
      await parseOffer(offer, stores, level);
    }
    return;
  }

  const key = getId(offer, type);
  if (!key) {
    // The offer has no value for this level, so there is nothing to store.
    return;
  }

  const store = stores[type];
  const current = await store.getValue(key);
  await store.setValue(key, updateLocation(current, offer, type));
}
/**
 * Builds the store key of an offer for one location level.
 *
 * The key is the chain of location parts from county down to the requested
 * level joined by '!', passed through accents.remove (presumably stripping
 * diacritics - see the remove-accents package) and with spaces replaced by
 * '_'. When the offer has no value for the requested level itself the key is
 * the empty string.
 *
 * @param {{location: Object}} offer - Offer whose `location` may carry
 *   `county`, `city`, `district` and `street`.
 * @param {(number|string)} level - 0..3 or 'county' | 'city' | 'district' |
 *   'street'.
 * @returns {string} The key, or '' when the level's own part is missing.
 */
function getId(offer, level) {
  const SEPARATOR = '!';
  const SPACE_SUBSTITUTE = '_';
  const LEVELS = ['county', 'city', 'district', 'street'];

  // Loose equality is kept on purpose so that a level is selected by exactly
  // the same values as before (e.g. the string '1' selects 'city').
  const depth = LEVELS.findIndex((name, index) => level == index || level == name);

  let key;
  if (depth !== -1) {
    const own = offer.location[LEVELS[depth]];
    if (!own) {
      key = '';
    } else if (depth === 0) {
      key = own;
    } else {
      // Prefix with the key of the parent level.
      key = getId(offer, depth - 1) + SEPARATOR + own;
    }
  }
  return accents.remove(key).replace(/ /g, SPACE_SUBSTITUTE);
}
// start urls: https://www.sreality.cz/hledani/prodej/pozemky/stavebni-parcely?strana=1&bez-aukce=1
// pseudo-URLs: https://www.sreality.cz/hledani/prodej/pozemky/stavebni-parcely?strana=[\d*]&bez-aukce=1
/**
 * Crawler page function: called on every page the crawler visits. It waits
 * for the offer elements (`div.info`) to appear and then finishes with an
 * array of offers of the form
 * `{ url, area, price, pricePerMeter, location }`.
 *
 * The function finishes asynchronously: it re-checks the page every 500 ms
 * and, once more than 10 seconds have passed, finishes with an error string
 * instead of an array.
 *
 * Kept in ES5 syntax like the original - presumably the crawler runtime
 * requires it (TODO confirm).
 *
 * @param {Object} context - Crawler context; this function uses its jQuery,
 *   willFinishLater, saveSnapshot and finish members.
 */
function pageFunction(context) {
    var jq = context.jQuery;
    var beganAt = Date.now();

    // Fragments stripped (in this order) from each text before parseFloat.
    var TOTAL_PRICE_JUNK = [/\u00a0/g, /Kč/g, /\./g];
    var METER_PRICE_JUNK = [/\u00a0/g, /\Kč za m²\)/g, /\(/g, /\./g];
    var AREA_JUNK = [/ /g, /\u00a0/g, /Prodejstavebníhopozemku/g, /m²/g, /\./g];

    // Removes every pattern from the text, then parses what is left as a number.
    function toNumber(text, junk) {
        var cleaned = text;
        for (var k = 0; k < junk.length; k++) {
            cleaned = cleaned.replace(junk[k], '');
        }
        return parseFloat(cleaned);
    }

    // Splits the locality text on ' - ' and ', ' and sorts the parts into
    // street / district / county by their marker word; any other part is
    // taken as the city (a later such part overwrites an earlier one).
    function parseLocality(text) {
        var parsed = {};
        var parts = text.split(/( - |, )/i);
        for (var k = 0; k < parts.length; k++) {
            var part = parts[k];
            if (part === ' - ' || part === ', ') {
                // The capturing group puts the delimiters into the result.
                continue;
            }
            if (part.indexOf('ulice') > -1) {
                parsed.street = part.replace(/ulice/g, '').trim();
            } else if (part.indexOf('část obce') > -1) {
                parsed.district = part.replace(/část obce/g, '').trim();
            } else if (part.indexOf('okres') > -1) {
                parsed.county = part.replace(/okres/g, '').trim();
            } else {
                parsed.city = part.trim();
            }
        }
        return parsed;
    }

    // Converts one offer element into a plain offer object.
    function readOffer(node) {
        var item = jq(node);
        var totalPrice = toNumber(item.find('span.norm-price').text(), TOTAL_PRICE_JUNK);
        var meterPrice = toNumber(item.find('span.alt-price').text(), METER_PRICE_JUNK);
        var area = toNumber(item.find('span.name').text(), AREA_JUNK);
        var href = item.find('a.title').attr('href');
        var locality = parseLocality(item.find('span.locality').text());
        return {
            url: 'https://www.sreality.cz' + href,
            area: area,
            price: totalPrice,
            pricePerMeter: meterPrice,
            location: locality
        };
    }

    function poll() {
        // Give up after 10 seconds.
        if (Date.now() - beganAt > 10000) {
            context.finish("Timed out before offers was loaded");
            return;
        }
        var nodes = jq('div.info').toArray();
        // Offers not rendered yet - look again shortly.
        if (nodes.length === 0) {
            setTimeout(poll, 500);
            return;
        }
        // Refresh the page screenshot and HTML for debugging.
        context.saveSnapshot();
        context.finish(nodes.map(function(node) {
            return readOffer(node);
        }));
    }

    // Tell the crawler that the page function will finish asynchronously.
    context.willFinishLater();
    poll();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment