Skip to content

Instantly share code, notes, and snippets.

@phpenterprise
Last active May 20, 2023 19:19
Show Gist options
  • Save phpenterprise/6144bc35b4ba464aa38a0d2be7d84e35 to your computer and use it in GitHub Desktop.
Save phpenterprise/6144bc35b4ba464aa38a0d2be7d84e35 to your computer and use it in GitHub Desktop.
OLX Data Extractor (Scraping)
/**
 * OLX contact extractor, meant to be pasted into the browser console while an
 * OLX search-results page is open.
 *
 * Flow: init() -> getLinks() collects the ad ids found on the current listing
 * page -> compile()/getLink() load each ad page via XHR one at a time ->
 * extract() reads the contact data from the loaded page -> export() downloads
 * the collected rows as CSV -> nextPage() moves to the next listing page.
 *
 * The object is published as the global `Olx` so it can be driven from the
 * console (for example `Olx.stop()`).
 */
(window.Olx = {
    release: '1.0.5 RC',
    // Not used by the script itself; kept for backward compatibility. Guarded
    // because a bare `$` reference throws when neither the page nor the
    // console defines it.
    j: (typeof $ === 'undefined') ? null : $,
    path: window.location.href,
    settings: {
        debug: true,
        // Ad page URL template; `{id}` is replaced with the ad id in getLink().
        url: window.location.href.replace(/\.(\w+)\/.*/, '.$1') + '/vi/{id}?rec=h',
        filename: "olx.csv",
        // Number of collected rows that triggers an automatic export.
        split: 100,
        // Delay in milliseconds between steps of the crawl.
        speed: 700,
        // Highest listing page number that nextPage() will request.
        maxPages: 100,
        autostart: true
    },
    data: {
        // Set to false by stop(); getLink() refuses to run while it is false.
        init: true,
        page: 1,
        pages: [],
        current: null,
        // Ad ids found on the listing page currently being processed.
        links: [],
        // Rows waiting to be exported.
        rows: [],
        // Every ad id already requested, so no ad is fetched twice.
        ads: []
    },
    /**
     * Entry point of the crawl: starts collecting links from the current page.
     */
    setEvents: function () {
        this.getLinks();
    },
    /**
     * Reads the ad ids from the listing anchors in the current document and
     * starts processing them. Finishes the run when the page has no ad anchors.
     */
    getLinks: function () {
        this.debug('get links');
        const anchors = document.querySelectorAll('a[href][data-lurker_list_id]');
        if (!anchors.length) {
            return this.complete();
        }
        const ids = Array.from(anchors, function (anchor) {
            return anchor.getAttribute('data-lurker_list_id');
        });
        // Keep the de-duplicated list; the filter result must be assigned.
        this.data.links = this.data.links.concat(ids).filter(this.unique);
        this.debug('links found ' + this.data.links.length);
        this.compile(0);
    },
    /**
     * Schedules the processing of the link at index `i`, exporting first when
     * enough rows have been collected. Moves to the next listing page once
     * every link has been processed.
     *
     * @param {number} i - Index into `data.links` of the next link to fetch.
     */
    compile: function (i) {
        this.debug('compile rows');
        if (this.data.rows.length >= this.settings.split) {
            this.export();
        }
        // `>= length` (not `length - 1`) so the last link is fetched as well.
        if (i >= this.data.links.length) {
            this.resetLinks();
            return this.nextPage();
        }
        this.time = setTimeout(() => {
            this.getLink(this.data.links[i], parseInt(i, 10) + 1);
        }, this.settings.speed);
    },
    /**
     * Loads one ad page into the document and hands it to extract().
     * Ads that were already requested are skipped.
     *
     * @param {string} a - Ad id.
     * @param {number} i - Index of the link to process after this one.
     * @returns {(boolean|number|undefined)} `false` when the crawl is stopped,
     *     the timer id when the ad is skipped, otherwise nothing.
     */
    getLink: function (a, i) {
        if (!this.data.init) {
            return false;
        }
        if (this.data.ads.includes(a)) {
            this.time = setTimeout(() => {
                this.compile(i);
            }, this.settings.speed);
            return this.time;
        }
        this.data.ads.push(a);
        const url = this.settings.url.replace('{id}', a);
        this.debug('page ' + this.data.page + ' | link ' + a + ' (' + (i - 1) + '/' + this.data.links.length + ') | rows collected (' + this.data.rows.length + '/' + this.settings.split + ')');
        this.ajax(url, (html) => {
            document.querySelector('HTML').innerHTML = html;
        }, () => {
            this.extract(url, i);
        }, 'GET', () => {
            // A failed ad request must not stall the crawl: skip this ad.
            this.compile(i);
        });
    },
    /**
     * `Array.prototype.filter` predicate that keeps the first occurrence of
     * each value.
     */
    unique: function (value, index, self) {
        return self.indexOf(value) === index;
    },
    /**
     * Builds a row from the ad page currently loaded in the document and
     * continues with the next link. Rows without a phone are not stored.
     * Ads whose embedded JSON is missing or unreadable are skipped.
     *
     * @param {string} url - URL of the ad page that was loaded.
     * @param {number} i - Index of the link to process next.
     */
    extract: function (url, i) {
        this.debug('extract data');
        const holder = document.querySelector('[data-json]');
        let json = null;
        if (holder) {
            try {
                json = JSON.parse(holder.getAttribute('data-json'));
            } catch (error) {
                this.debug('invalid ad json: ' + error.message);
            }
        }
        if (json === null || typeof json !== 'object' || !json.ad) {
            // Return here so compile() is not invoked a second time below.
            return this.compile(i);
        }
        const ad = json.ad;
        const type = document.querySelector('html').innerHTML.match(/an[úuÚú]ncio\s+profissional/ig);
        const phone = this.detectPhones();
        const email = this.detectEmails();
        const rawName = (ad.user && ad.user.name) ? String(ad.user.name) : '';
        let name = rawName;
        try {
            name = unescape(decodeURIComponent(rawName));
        } catch (error) {
            // decodeURIComponent throws on a stray '%'; keep the raw name.
            this.debug('name not decoded: ' + error.message);
        }
        const data = {
            name: name,
            phone: (ad.phone && ad.phone.phone) ? ad.phone.phone : phone,
            email: email,
            url: url,
            type: (type && type.length) ? 'pro' : 'owner'
        };
        if (data.phone) {
            this.data.rows.push(data);
        }
        this.compile(i);
    },
    /**
     * Clicks every `javascript:` / `#` anchor in the document, then collects
     * the phone-like numbers found in the page text.
     *
     * @returns {string} Comma-separated unique matches, or '' when the page
     *     has no such anchors or no number is found.
     */
    detectPhones: function () {
        this.debug('detect phones');
        const anchors = document.querySelectorAll('a[href^=javascript],a[href="#"]');
        if (!anchors.length) {
            return '';
        }
        Array.from(anchors).forEach(function (anchor) {
            if (typeof anchor.click === 'function') {
                anchor.click();
            }
        });
        const found = document.querySelector('html').innerText.match(/\(\d\d\d?\)\s?\d\s?\d\d\d\d?[\.\- ]?\d\d\d\d\d?/ig);
        return (found && found.length) ? found.filter(this.unique).join(',') : '';
    },
    /**
     * Collects the e-mail addresses found in the page text.
     *
     * @returns {string} Comma-separated unique lower-cased matches, or ''.
     */
    detectEmails: function () {
        this.debug('detect emails');
        const found = document.querySelector('html').innerText.match(/\w+([\.-]?\w+)\@\w+([\.-]?\w+)*(\.\w{2,3})/ig);
        if (!found || !found.length) {
            return '';
        }
        return found.map(function (address) {
            return address.toLowerCase();
        }).filter(this.unique).join(',');
    },
    /**
     * Initialises `data.page` from the `o` query parameter of `path`,
     * defaulting to 1 when it is absent or not a positive number.
     */
    setCurrentPage: function () {
        this.debug('set default page');
        const raw = new URL(this.path, window.location.href).searchParams.get('o');
        const page = parseInt(raw, 10);
        this.data.page = (Number.isNaN(page) || page < 1) ? 1 : page;
    },
    /**
     * Advances to the next listing page, loads it into the document body and
     * collects its links. Finishes the run when the page limit is exceeded or
     * the page cannot be loaded.
     */
    nextPage: function () {
        this.debug('next page');
        ++this.data.page;
        const limit = (typeof this.settings.maxPages === 'number') ? this.settings.maxPages : 100;
        if (this.data.page > limit) {
            return this.complete();
        }
        // Set `o` through the URL API so other search parameters are kept and
        // the query string stays well-formed.
        const next = new URL(this.path, window.location.href);
        next.searchParams.set('o', String(this.data.page));
        this.path = next.toString();
        this.debug('next page ' + this.data.page);
        history.pushState({}, null, this.path);
        this.ajax(this.path, (html) => {
            document.body.innerHTML = html;
        }, () => {
            this.getLinks();
        }, 'GET', () => {
            // Listing page unavailable: export what was collected and finish.
            this.complete();
        });
    },
    /**
     * Logs `a` to the console when `settings.debug` is enabled.
     */
    debug: function (a) {
        if (this.settings.debug && typeof console === 'object') {
            console.log(a);
        }
    },
    /**
     * Performs an asynchronous XHR request.
     *
     * @param {string} url - Address to request.
     * @param {function(string)} callback - Receives the response text on HTTP 200.
     * @param {function()} [complete] - Called `settings.speed` ms after `callback`.
     * @param {string} [method] - HTTP method, "GET" by default.
     * @param {function(number)} [fail] - Receives the status code when the
     *     request finishes with a status other than 200.
     */
    ajax: function (url, callback, complete, method, fail) {
        const onComplete = (typeof complete === 'function') ? complete : function () { };
        // Use a local reference in the handler: `this.b` is overwritten by the
        // next request and is only kept for backward compatibility.
        const xhr = new XMLHttpRequest();
        this.b = xhr;
        xhr.open((method) ? method : "GET", url, true);
        xhr.setRequestHeader('cache-control', 'no-cache');
        xhr.setRequestHeader('content-type', 'text/html; charset=utf-8');
        xhr.overrideMimeType('text/html; charset=UTF-8');
        xhr.onreadystatechange = () => {
            if (xhr.readyState !== 4) {
                return;
            }
            if (xhr.status !== 200) {
                this.debug('ajax failed (' + xhr.status + ') ' + url);
                if (typeof fail === 'function') {
                    fail(xhr.status);
                }
                return;
            }
            if (typeof callback === 'function') {
                this.debug('ajax callback');
                callback(xhr.responseText);
                setTimeout(onComplete, this.settings.speed);
            }
        };
        xhr.send();
    },
    /**
     * Downloads the collected rows as a semicolon-separated CSV file named
     * `settings.filename`, then clears them. Does nothing when there are no
     * rows, so no header-only file is produced.
     */
    export: function () {
        this.debug('export rows');
        if (!this.data.rows.length) {
            this.debug('no rows to export');
            return;
        }
        // Quote fields containing the separator, quotes or line breaks so one
        // record always stays on one line with the expected column count.
        const toField = function (value) {
            const text = (value === null || value === undefined) ? '' : String(value);
            return /[";\r\n]/.test(text) ? '"' + text.replace(/"/g, '""') + '"' : text;
        };
        const header = { name: 'name', phone: 'phone', email: 'email', url: 'url', type: 'type' };
        const csv = [header].concat(this.data.rows).map(function (row) {
            return Object.values(row).map(toField).join(';');
        }).join('\n');
        const link = document.createElement('a');
        // The BOM lets spreadsheet applications detect UTF-8.
        const universalBOM = "\uFEFF";
        link.href = 'data:text/csv;charset=utf-8,' + encodeURIComponent(universalBOM + csv);
        link.target = '_blank';
        link.download = this.settings.filename;
        link.click();
        link.remove();
        this.reset();
    },
    /**
     * Starts the crawl manually (used when `settings.autostart` is false).
     */
    start: function () {
        this.debug('start script');
        this.setEvents();
    },
    /**
     * Discards the collected rows.
     */
    reset: function () {
        this.debug('reset');
        this.data.rows = [];
    },
    /**
     * Discards the links of the current listing page.
     */
    resetLinks: function () {
        this.debug('reset links');
        this.data.links = [];
    },
    /**
     * Stops the crawl: cancels the pending step and blocks further requests.
     */
    stop: function () {
        this.debug('stop');
        this.data.init = false;
        clearTimeout(this.time);
    },
    /**
     * Exports whatever rows are left and logs the end of the run.
     */
    complete: function () {
        this.export();
        this.debug('complete');
    },
    /**
     * Reads the current page number and starts the crawl when
     * `settings.autostart` is enabled.
     */
    init: function () {
        this.setCurrentPage();
        if (this.settings.autostart) {
            this.start();
        } else {
            this.debug('waiting..');
        }
    }
}).init();
@phpenterprise
Copy link
Author

phpenterprise commented Mar 13, 2020

Extract Contacts From OLX (Scraper)

image

After analyzing numerous websites and portals throughout my career, I decided to write a script in pure JavaScript to test the information security of one of the portals that receive millions in investment every year.

Fact 1

Why do they hide the phone number and email on the page, yet embed them in the page's source code as JSON? This is certainly not good practice.

Fact 2

If bots can read all the content, what is the point of showing the full phone number only after a click? Either way, the data is exposed and there is no difficulty in reverse engineering the process.

How it works

Extract a list in CSV (excel) automatically with name, email, phone, type of account and links of the portal offers based on personalized search filters.

How to use

1. Open OLX website on your Browser
If possible, use Google Chrome as console commands are faster.

2. Log in to your account
If you don't have one, create an account or use the social login buttons.

3. Filter your search objective
Search or filter using the categories, for example: Vehicles, Massachusetts state

4. Open your browser/navigator console
Press F12 (Brazilian keyboard), Fn + F12 on an American keyboard, or Command + F12 on a Mac to open the console.

5. Copy and paste the script into the console
As shown in the image below, just press enter after pasting the code and let it run.

Anotação 2020-03-13 161459

If "links found" appears in the console, the script is running correctly.

Settings

  • split: Total records per exported automatic file
  • speed: Speed of reading and extracting data (increase the value for slow networks, eg 2000)

Okay, I hope it helped you. Leave your comments if successful.

@mamirad
Copy link

mamirad commented May 28, 2021

not working

@VSALDS
Copy link

VSALDS commented May 30, 2021

yes not working anymore, a file was download with a name called olx.csv but a blank file with blank rows, hope someone will fix

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment