Skip to content

Instantly share code, notes, and snippets.

@angww
Created December 22, 2018 05:44
Show Gist options
  • Save angww/0d6b648eb93e76b8b982377b1ad775f4 to your computer and use it in GitHub Desktop.
Save angww/0d6b648eb93e76b8b982377b1ad775f4 to your computer and use it in GitHub Desktop.
Web scraper da rodoviária de Porto Alegre para arquivo CSV
// Headless-browser driver used by main() to load and interact with the site.
const puppeteer = require('puppeteer');
// Node file-system module; main() uses it to append rows to the CSV output.
const fs = require('fs');
// Address of the page that main() opens first; it holds the destination form.
// NOTE(review): this name shadows Node's global `URL` class inside this module.
const URL = 'http://rodoviaria-poa.com.br/institucional/php/hora.php';
/**** LIB FUNCTIONS */
/**
 * Returns the elements matching `selector` in the current document that are
 * also members of `elements`, in document-query order.
 *
 * @param {string} selector - CSS selector passed to document.querySelectorAll.
 * @param {Iterable<Element>} elements - Candidate elements to keep.
 * @returns {Element[]} Matching elements that are present in `elements`.
 */
const querySelectorFrom = (selector, elements) => {
  // Materialise the candidates first so any iterable (NodeList, Set, ...) works.
  const candidates = [...elements];
  const kept = [];
  for (const node of document.querySelectorAll(selector)) {
    if (candidates.includes(node)) {
      kept.push(node);
    }
  }
  return kept;
};
/**
 * Enables request interception on `page` and registers a 'request' handler
 * that aborts image requests and lets every other request continue.
 *
 * @param {object} page - Page-like object exposing setRequestInterception() and on().
 * @returns {Promise<void>} Resolves once interception is enabled and the handler is registered.
 */
const funcNoImages = async (page) => {
  await page.setRequestInterception(true);

  const handleRequest = (req) => {
    // Guard clause: anything that is not an image goes through untouched.
    if (req.resourceType() !== 'image') {
      req.continue();
      return;
    }
    req.abort();
  };

  page.on('request', handleRequest);
};
/* *** MAIN ************************************ */
/**
 * Opens the page at URL, reads the options of the `localdest` select, submits
 * the form once per destination and appends the resulting table cells to
 * "horarios.csv", nine cells per CSV line, each line prefixed with the
 * destination name.
 *
 * The browser is always closed, even when a step fails; the failure is then
 * propagated to the caller as a rejected promise. On success the process is
 * terminated with process.exit().
 *
 * @returns {Promise<void>}
 */
async function main() {
  const DEST_SELECT = "select[name='localdest']";
  // The result table is consumed in groups of nine cells; the first group is
  // treated as the header row and skipped.
  const CELLS_PER_ROW = 9;

  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();

    // For performance, do not request images. Awaited so interception is
    // guaranteed to be active before the first navigation starts.
    await funcNoImages(page);

    await page.setViewport({
      width: 320,
      height: 600,
    });

    // Forward the page's console output to the Node console.
    page.on('console', (msg) => console.log('PAGE LOG:', msg.text()));

    // Open the index page and wait for it to settle.
    await page.goto(URL, {
      waitUntil: 'networkidle0',
    });
    await page.waitForSelector(DEST_SELECT);

    /* GET LIST OF DESTINATIONS */
    const lista = await page.$$(`${DEST_SELECT} option`);
    console.log(`TOTAL DESTINOS: ${lista.length}`);

    // page.evaluate runs INSIDE the browser, not in Node, so the selector has
    // to be passed in as an argument rather than captured by closure.
    const listaDestinos = await page.evaluate((selector) => {
      const destinos = {};
      document.querySelectorAll(selector).forEach((option, index) => {
        // Skips the first two options and caps the number of destinations.
        if (index > 1 && index < 900) {
          destinos[index] = {
            localdest: option.value,
            // Strip trailing whitespace/newline without eating a real
            // character when the text has no trailing newline.
            destNome: option.innerText.trimEnd(),
          };
        }
      });
      return destinos;
    }, DEST_SELECT);
    console.dir(listaDestinos);

    // From here on, open the result page of each destination in turn.
    for (const [i, destino] of Object.entries(listaDestinos)) {
      console.log(`${i}--${destino.localdest}--${destino.destNome}`);
      await page.select(DEST_SELECT, destino.localdest);

      // Start waiting for the navigation before clicking; waiting for 'body'
      // afterwards would resolve immediately against the page being left.
      await Promise.all([
        page.waitForNavigation(),
        page.click("input[type='submit']"),
      ]);

      // Any error thrown in the page rejects this promise and propagates.
      const result = await page.evaluate((headerCells) => {
        const cells = {};
        document.querySelectorAll('table td').forEach((cell, index) => {
          // Skip the header cells.
          if (index >= headerCells) {
            cells[index] = {
              // Replace every comma so cell text cannot split a CSV field.
              texto: cell.innerText.replace(/,/g, '.'),
            };
          }
        });
        return cells;
      }, CELLS_PER_ROW);

      let linhacsv = '';
      for (const [j, cell] of Object.entries(result)) {
        if (linhacsv === '') {
          linhacsv = `${destino.destNome}, `;
        }
        linhacsv += `${cell.texto},`;
        // Keys are strings: convert before adding so this is arithmetic, not
        // string concatenation.
        if ((Number(j) + 1) % CELLS_PER_ROW === 0) {
          fs.appendFileSync('horarios.csv', `${linhacsv}\n`);
          linhacsv = '';
        }
      }
      // NOTE(review): a trailing group with fewer than nine cells is not
      // written, same as before — confirm that this is intended.
      console.dir(result);

      // Go back to the search page so the form can be submitted again.
      await page.goBack();
      await page.waitForSelector(DEST_SELECT);
    }
  } finally {
    // Always release the browser process, including on failure.
    await browser.close();
  }
  process.exit();
}
// Run the scraper; report any failure and exit with a non-zero status instead
// of leaving the returned promise unhandled.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment