Created
December 22, 2018 05:44
-
-
Save angww/0d6b648eb93e76b8b982377b1ad775f4 to your computer and use it in GitHub Desktop.
Web scraper da rodoviária de Porto Alegre para arquivo CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const fs = require('fs'); | |
const URL = 'http://rodoviaria-poa.com.br/institucional/php/hora.php'; | |
/**** LIB FUNCTIONS */ | |
/**
 * Returns the elements matched by `selector` in the current document that are
 * also members of `elements`, in the order `document.querySelectorAll` yields
 * them.
 *
 * Relies on the global `document`, so it can only run where one exists
 * (e.g. inside a browser page).
 *
 * @param {string} selector - CSS selector handed to `document.querySelectorAll`.
 * @param {Iterable<Element>} elements - Candidate elements to intersect with.
 * @returns {Element[]} Matches of `selector` that are contained in `elements`.
 */
const querySelectorFrom = (selector, elements) => {
  // A Set gives constant-time membership checks; the previous array lookup
  // rescanned every candidate for every matched element.
  const allowed = new Set(elements);
  return [...document.querySelectorAll(selector)]
    .filter(elm => allowed.has(elm));
};
/**
 * Enables request interception on `page` and registers a 'request' listener
 * that aborts image requests and lets every other request continue.
 *
 * @param {object} page - Object exposing `setRequestInterception(flag)` and
 *   `on(event, listener)` (a Puppeteer page in this script).
 * @returns {Promise<void>} Resolves once the listener has been registered.
 */
const funcNoImages = async (page) => {
  // Decide per request: anything that is not an image goes through untouched.
  const blockImages = (req) => {
    const isImage = req.resourceType() === 'image';
    if (!isImage) {
      req.continue();
      return;
    }
    req.abort();
  };

  await page.setRequestInterception(true);
  page.on('request', blockImages);
};
/* *** MAIN ************************************ */ | |
/**
 * Scrapes the timetable search at `URL` and appends the results to
 * "horarios.csv" in the current working directory.
 *
 * Steps:
 *  1. Launch headless Chromium and open the search page.
 *  2. Read the options of the `localdest` <select> (destinations).
 *  3. For each destination, submit the form, collect the text of every
 *     `table td` cell past the header cells, and append one CSV line per
 *     group of nine cells, prefixed with the destination name.
 *  4. Close the browser and exit the process.
 *
 * Uses the module-level `puppeteer`, `fs`, `URL` and `funcNoImages`.
 *
 * @returns {Promise<void>} Does not resolve on success because the process
 *   exits; rejects if any scraping step fails (the browser is closed first).
 */
async function main() {
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();

    // For performance, do not download images. Awaited so the request
    // handler is registered before any navigation starts.
    await funcNoImages(page);

    await page.setViewport({
      width: 320,
      height: 600,
    });

    // Mirror the page's console output into the Node console.
    page.on('console', msg => console.log('PAGE LOG:', msg.text()));

    // Open the search page and wait until the network is idle.
    await page.goto(URL, {
      waitUntil: 'networkidle0',
    });
    await page.waitForSelector("select[name='localdest']");

    /* Get the list of destinations. */
    const lista = await page.$$("select[name='localdest'] option");
    console.log(`TOTAL DESTINOS: ${lista.length}`);

    // page.evaluate runs INSIDE the browser, not in Node.
    const listaDestinos = await page.evaluate(() => {
      const destinos = {};
      document.querySelectorAll("select[name='localdest'] option").forEach((v, i) => {
        // Limit: skip the first two options and stop before index 900.
        if (i > 1 && i < 900) {
          destinos[i] = {
            'localdest': v.value,
            // Per the original author's note, slice drops the trailing
            // newline at the end of each city label.
            'destNome': v.innerText.slice(0, -1),
          };
        }
      });
      return destinos;
    });
    console.dir(listaDestinos);

    // From here on, open the result page of each destination, one at a time.
    for (const i of Object.keys(listaDestinos)) {
      const { localdest, destNome } = listaDestinos[i];
      console.log(`${i}--${localdest}--${destNome}`);

      await page.select("select[name='localdest']", localdest);

      // NOTE(review): assumes the submit triggers a full navigation (the loop
      // calls goBack afterwards) — confirm. Waiting for it together with the
      // click keeps the scrape below from running against the search page.
      await Promise.all([
        page.waitForNavigation(),
        page.click("input[type='submit']"),
      ]);
      await page.waitForSelector('body');

      // Any error thrown in the page now rejects this promise as-is; the old
      // catch called an undefined `reject`, hiding the real failure.
      const result = await page.evaluate(() => {
        const celulas = {};
        document.querySelectorAll("table td").forEach((v, idx) => {
          // > 8 skips the header cells.
          if (idx > 8) {
            celulas[idx] = {
              // Only the first comma of the cell text is replaced.
              'texto': v.innerText.replace(",", "."),
            };
          }
        });
        return celulas;
      });

      let linhacsv = "";
      for (const j of Object.keys(result)) {
        if (linhacsv === "") {
          linhacsv = `${destNome}, `;
        }
        linhacsv += `${result[j].texto},`;
        // Keys are strings: convert before adding so the nine-cell row
        // boundary is computed numerically instead of by concatenation.
        if ((Number(j) + 1) % 9 === 0) {
          fs.appendFileSync("horarios.csv", `${linhacsv}\n`);
          linhacsv = "";
        }
      }
      // Cells left over after the last full row are not written.
      console.dir(result);

      // Go back to the search page so the form can be submitted again.
      await page.goBack();
    }
  } finally {
    // Always release the browser, even when a step above throws.
    await browser.close();
  }

  process.exit();
}
// Entry point: run the scraper. A rejection is reported and turned into a
// non-zero exit code instead of being left as an unhandled promise.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment