Skip to content

Instantly share code, notes, and snippets.

@phpenterprise
Last active May 20, 2023 19:19
Show Gist options
  • Save phpenterprise/6144bc35b4ba464aa38a0d2be7d84e35 to your computer and use it in GitHub Desktop.
Save phpenterprise/6144bc35b4ba464aa38a0d2be7d84e35 to your computer and use it in GitHub Desktop.
OLX Data Extractor (Scraping)
/**
 * OLX Data Extractor.
 *
 * Browser-side scraper meant to be executed on a listing page. It walks the
 * listing pages one by one, loads every ad found on them, collects name,
 * phone, e-mail, URL and advertiser type, and downloads the rows as CSV
 * files of `settings.split` rows each.
 *
 * The object is published as `window.Olx` and started immediately through
 * `init()`; `Olx.stop()` pauses the crawl and `Olx.start()` resumes it.
 */
(window.Olx = {
    release: '1.0.5 RC',
    // Whatever `$` is bound to on the page. The extractor itself never uses
    // it; the guard keeps pages without a global `$` from throwing on load.
    j: (typeof $ !== 'undefined') ? $ : null,
    // URL of the listing page currently being processed.
    path: window.location.href,
    settings: {
        debug: true,
        // Ad page template; `{id}` is replaced with the ad identifier.
        url: window.location.origin + '/vi/{id}?rec=h',
        filename: 'olx.csv',
        // Number of collected rows that triggers a CSV download.
        split: 100,
        // Delay, in milliseconds, between consecutive steps.
        speed: 700,
        // Last listing page to visit before finishing.
        maxPages: 100,
        autostart: true
    },
    data: {
        // While false, no further ads are fetched (see stop()).
        init: true,
        page: 1,
        pages: [],
        current: null,
        // Ad identifiers found on the current listing page.
        links: [],
        // Rows collected since the last export.
        rows: [],
        // Every ad identifier already requested, to avoid fetching it twice.
        ads: []
    },

    /** Entry hook: begins collecting links from the current document. */
    setEvents() {
        this.getLinks();
    },

    /**
     * Reads the ad identifiers from the current document and starts
     * processing them. Finishes the run when the page lists no ads.
     */
    getLinks() {
        this.debug('get links');
        const anchors = document.querySelectorAll('a[href][data-lurker_list_id]');
        if (!anchors.length) {
            return this.complete();
        }
        const ids = Array.from(anchors, (anchor) => anchor.getAttribute('data-lurker_list_id'));
        // Keep the de-duplicated list: filter() returns a new array.
        this.data.links = this.data.links.concat(ids).filter(this.unique);
        this.debug('links found ' + this.data.links.length);
        this.compile(0);
    },

    /**
     * Schedules the processing of the link at position `i`, exporting first
     * when enough rows were gathered and moving to the next listing page once
     * every link was handled.
     *
     * @param {number|string} i Zero-based position in `data.links`.
     */
    compile(i) {
        this.debug('compile rows');
        const index = Number.parseInt(i, 10) || 0;
        if (this.data.rows.length >= this.settings.split) {
            this.export();
        }
        // `>= length` (not `length - 1`) so the last link is processed too.
        if (index >= this.data.links.length) {
            this.resetLinks();
            return this.nextPage();
        }
        this.time = setTimeout(() => {
            this.getLink(this.data.links[index], index + 1);
        }, this.settings.speed);
    },

    /**
     * Loads one ad page into the document and extracts its data.
     *
     * @param {string} a Ad identifier.
     * @param {number} i Position of the link to process afterwards.
     * @returns {number|boolean|undefined} `false` when the crawl is stopped,
     *   a timer id when the ad was already visited, otherwise nothing.
     */
    getLink(a, i) {
        if (!this.data.init) {
            return false;
        }
        if (this.data.ads.includes(a)) {
            // Already visited: skip straight to the next link.
            this.time = setTimeout(() => {
                this.compile(i);
            }, this.settings.speed);
            return this.time;
        }
        this.data.ads.push(a);
        const url = this.settings.url.replace('{id}', a);
        this.debug('page ' + this.data.page + ' | link ' + a + ' (' + (i - 1) + '/' + this.data.links.length + ') | rows collected (' + this.data.rows.length + '/' + this.settings.split + ')');
        this.ajax(url, (html) => {
            // The ad replaces the whole document so it can be queried and
            // its "reveal" links clicked.
            document.documentElement.innerHTML = html;
        }, () => {
            this.extract(url, i);
        }, 'GET', () => {
            // A missing or failing ad must not halt the run: skip it.
            this.compile(i);
        });
    },

    /**
     * `Array.prototype.filter` predicate keeping the first occurrence of
     * each value.
     */
    unique(value, index, self) {
        return self.indexOf(value) === index;
    },

    /**
     * Decodes an advertiser name, falling back to the raw text when it is
     * not valid percent-encoding (for example a name with a literal "%").
     *
     * @param {*} raw Name as found in the ad payload.
     * @returns {string} Decoded name, or '' when there is none.
     */
    decodeName(raw) {
        if (raw === null || raw === undefined) {
            return '';
        }
        const text = String(raw);
        try {
            return unescape(decodeURIComponent(text));
        } catch (err) {
            if (!(err instanceof URIError)) {
                throw err;
            }
            return text;
        }
    },

    /**
     * Builds a row from the ad currently loaded in the document and moves on
     * to the next link. Ads without a usable payload are skipped; rows
     * without a phone number are discarded.
     *
     * @param {string} url Address of the ad.
     * @param {number} i Position of the link to process afterwards.
     */
    extract(url, i) {
        this.debug('extract data');
        let ad = null;
        try {
            const holder = document.querySelector('[data-json]');
            const json = holder ? JSON.parse(holder.getAttribute('data-json')) : null;
            ad = (json !== null && typeof json === 'object') ? json.ad : null;
        } catch (err) {
            this.debug('invalid ad payload: ' + err.message);
        }
        if (ad === null || typeof ad !== 'object') {
            this.debug('no ad data found at ' + url);
            // Return here: falling through would call compile() twice.
            return this.compile(i);
        }
        const type = document.documentElement.innerHTML.match(/an[úuÚú]ncio\s+profissional/ig);
        const phone = this.detectPhones();
        const email = this.detectEmails();
        const row = {
            name: this.decodeName(ad.user && ad.user.name),
            phone: (ad.phone && ad.phone.phone) ? ad.phone.phone : phone,
            email: email,
            url: url,
            type: (type && type.length) ? 'pro' : 'owner'
        };
        if (row.phone) {
            this.data.rows.push(row);
        }
        this.compile(i);
    },

    /**
     * Clicks every `javascript:` / `#` link of the document and then scans
     * the visible text for phone numbers.
     *
     * @returns {string} Comma-separated unique matches, '' when none.
     */
    detectPhones() {
        this.debug('detect phones');
        const triggers = document.querySelectorAll('a[href^=javascript],a[href="#"]');
        Array.from(triggers).forEach((trigger) => {
            if (typeof trigger.click === 'function') {
                trigger.click();
            }
        });
        const found = document.documentElement.innerText.match(/\(\d\d\d?\)\s?\d\s?\d\d\d\d?[\.\- ]?\d\d\d\d\d?/ig);
        return found ? found.filter(this.unique).join(',') : '';
    },

    /**
     * Scans the visible text of the document for e-mail addresses.
     *
     * @returns {string} Comma-separated unique lower-cased matches, '' when
     *   none.
     */
    detectEmails() {
        this.debug('detect emails');
        // Separators are mandatory inside the repeated groups, which lets the
        // local part hold any number of dotted segments without ambiguous
        // backtracking; the last domain label may be longer than 3 letters.
        const found = document.documentElement.innerText.match(/\w+(?:[.-]\w+)*@\w+(?:[.-]\w+)*\.\w{2,}/g);
        if (!found) {
            return '';
        }
        return found.map((address) => address.toLowerCase()).filter(this.unique).join(',');
    },

    /** Initialises `data.page` from the `o` query parameter of `path`. */
    setCurrentPage() {
        this.debug('set default page');
        const current = new URL(this.path, window.location.href).searchParams.get('o');
        const page = Number.parseInt(current, 10);
        this.data.page = (Number.isNaN(page) || page < 1) ? 1 : page;
    },

    /**
     * Advances to the next listing page, loads it into the document body and
     * collects its links. Finishes the run after `settings.maxPages`.
     */
    nextPage() {
        this.debug('next page');
        ++this.data.page;
        const lastPage = this.settings.maxPages || 100;
        if (this.data.page > lastPage) {
            return this.complete();
        }
        // Set the parameter through the URL API so other query parameters
        // are preserved and no second "?" is ever appended.
        const next = new URL(this.path, window.location.href);
        next.searchParams.set('o', String(this.data.page));
        this.path = next.href;
        this.debug('next page ' + this.data.page);
        history.pushState({}, '', this.path);
        this.ajax(this.path, (html) => {
            document.body.innerHTML = html;
        }, () => {
            this.getLinks();
        }, 'GET', () => {
            // The listing could not be loaded: export what was collected.
            this.complete();
        });
    },

    /** Logs `a` to the console when `settings.debug` is enabled. */
    debug(a) {
        if (this.settings.debug && typeof console === 'object') {
            console.log(a);
        }
    },

    /**
     * Performs an asynchronous request.
     *
     * @param {string} url Address to request.
     * @param {function(string)} callback Receives the response text on
     *   HTTP 200.
     * @param {function()} [complete] Invoked `settings.speed` ms after
     *   `callback`.
     * @param {string} [method] HTTP method, "GET" by default.
     * @param {function()} [failure] Invoked `settings.speed` ms after a
     *   finished request whose status is not 200 (network errors included).
     */
    ajax(url, callback, complete, method, failure) {
        const onComplete = (typeof complete === 'function') ? complete : function () { };
        const onFailure = (typeof failure === 'function') ? failure : function () { };
        // A local request object keeps overlapping calls independent;
        // `this.b` is still updated for code that inspects the last request.
        const xhr = new XMLHttpRequest();
        this.b = xhr;
        xhr.open(method ? method : 'GET', url, true);
        xhr.setRequestHeader('cache-control', 'no-cache');
        xhr.setRequestHeader('content-type', 'text/html; charset=utf-8');
        xhr.overrideMimeType('text/html; charset=UTF-8');
        xhr.onreadystatechange = () => {
            if (xhr.readyState !== 4) {
                return;
            }
            if (xhr.status !== 200) {
                this.debug('ajax failed (' + xhr.status + ') ' + url);
                setTimeout(onFailure, this.settings.speed);
                return;
            }
            if (typeof callback === 'function') {
                this.debug('ajax callback');
                callback(xhr.responseText);
                setTimeout(onComplete, this.settings.speed);
            }
        };
        xhr.send();
    },

    /**
     * Formats one value as a `;`-separated CSV field, quoting it when it
     * contains the separator, a quote or a line break.
     *
     * @param {*} value Field value.
     * @returns {string} Escaped field.
     */
    csvField(value) {
        const text = (value === null || value === undefined) ? '' : String(value);
        return /[";\r\n]/.test(text) ? '"' + text.replace(/"/g, '""') + '"' : text;
    },

    /**
     * Downloads the collected rows as a CSV file and clears them. Does
     * nothing when there are no rows.
     */
    export() {
        this.debug('export rows');
        if (!this.data.rows.length) {
            this.debug('no rows to export');
            return;
        }
        const columns = ['name', 'phone', 'email', 'url', 'type'];
        const lines = this.data.rows.map((row) => {
            return columns.map((column) => this.csvField(row[column])).join(';');
        });
        const csv = [columns.join(';')].concat(lines).join('\n');
        const universalBOM = '\uFEFF';
        const link = document.createElement('a');
        link.href = 'data:text/csv;charset=utf-8,' + encodeURIComponent(universalBOM + csv);
        link.target = '_blank';
        link.download = this.settings.filename;
        // Attached while clicking: some browsers ignore clicks on detached
        // anchors.
        (document.body || document.documentElement).appendChild(link);
        link.click();
        link.remove();
        this.reset();
    },

    /** Starts the crawl, or resumes it after stop(). */
    start() {
        this.debug('start script');
        this.data.init = true;
        this.setEvents();
    },

    /** Discards the rows collected so far. */
    reset() {
        this.debug('reset');
        this.data.rows = [];
    },

    /** Discards the links of the current listing page. */
    resetLinks() {
        this.debug('reset links');
        this.data.links = [];
    },

    /** Pauses the crawl: no further ad is requested until start(). */
    stop() {
        this.debug('stop');
        this.data.init = false;
        clearTimeout(this.time);
    },

    /** Exports the remaining rows and reports the end of the run. */
    complete() {
        this.export();
        this.debug('complete');
    },

    /**
     * Reads the current page number and starts the crawl when
     * `settings.autostart` is enabled.
     */
    init() {
        this.setCurrentPage();
        if (this.settings.autostart) {
            this.start();
        } else {
            this.debug('waiting..');
        }
    }
}).init();
@mamirad
Copy link

mamirad commented May 28, 2021

not working

@VSALDS
Copy link

VSALDS commented May 30, 2021

Yes, it is not working anymore: a file named olx.csv was downloaded, but it is blank and contains only empty rows. I hope someone will fix it.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment