Last active
September 15, 2020 14:27
-
-
Save munichrocker/b273e043db5c74e27f3bd73f6ccae9e9 to your computer and use it in GitHub Desktop.
Scrape RKI's Survstat
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const puppeteer = require('puppeteer'); | |
const fs = require('fs'); | |
const unzipper = require('unzipper'); | |
// Location where the downloaded archive is expected to appear (presumably the
// file name the server assigns to the export; verify if the download is not found).
const path = "./survstat.zip";

// Delete a stale survstat.zip before each run so that a later existence check
// can only succeed on a freshly downloaded file.
try {
  fs.unlinkSync(path);
} catch (err) {
  // Only a missing file is expected here. Any other failure (permissions,
  // locked file, ...) would leave the stale archive in place and must not be
  // silently reported as "not found".
  if (err.code !== 'ENOENT') {
    throw err;
  }
  console.error("No file found!");
}
/**
 * Poll the file system until `path` becomes accessible or `timeout` elapses.
 *
 * The path is checked with `fs.access` every `timeout / 6` milliseconds; the
 * first check happens one interval after the call.
 *
 * @param {string} path - File system path to wait for.
 * @param {number} [timeout=20000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<string>} Resolves with `path` once it is accessible.
 *   Rejects with an Error named 'PATH_CHECK_TIMED_OUT' when the timeout
 *   expires first.
 */
function checkExistsWithTimeout(path, timeout = 20000) {
  return new Promise((resolve, reject) => {
    const interval = timeout / 6;
    let intervalTimerId;
    // Guards against an fs.access callback that completes after the promise
    // has already been settled; without it polling could be re-armed forever.
    let settled = false;

    const timeoutTimerId = setTimeout(handleTimeout, timeout);

    function handleTimeout() {
      settled = true;
      // Stop the pending poll (the original referenced an undeclared
      // `timerId` here, which threw instead of rejecting).
      clearTimeout(intervalTimerId);
      const error = new Error('path check timed out');
      error.name = 'PATH_CHECK_TIMED_OUT';
      reject(error);
    }

    function handleInterval() {
      fs.access(path, (err) => {
        if (settled) {
          return;
        }
        if (err) {
          // Not there yet: schedule the next poll.
          intervalTimerId = setTimeout(handleInterval, interval);
        } else {
          settled = true;
          clearTimeout(timeoutTimerId);
          resolve(path);
        }
      });
    }

    intervalTimerId = setTimeout(handleInterval, interval);
  });
}
// Start scraping: drive the SurvStat query form in headless Chromium, download
// the resulting zip archive and extract it into ./output.
(async () => {
  // Selectors for the "chosen" dropdown widgets that are used more than once.
  const hierarchyDropdown = 'td > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_DropDownListFilterHierarchy_1_chosen';
  const rowDropdown = '.partitionLeft > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListRowHierarchy_chosen';
  const colDropdown = '.partitionRight > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListColHierarchy_chosen';
  const downloadButton = '#wrapperContent > #content > #main #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_ButtonDownload';

  const browser = await puppeteer.launch({ headless: true }); // starting in headless mode
  try {
    const page = await browser.newPage();

    // Allow downloads and set the download path here. A CDP session obtained
    // through the public API replaces the private `page._client` property.
    const cdpSession = await page.target().createCDPSession();
    await cdpSession.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath: "./" });

    // NOTE(review): this promise is created once and settles with the first
    // navigation, so the later `await navigationPromise` statements do not
    // wait for subsequent page updates; the waitForSelector calls do the
    // actual synchronisation. Kept as in the original flow - confirm before
    // removing.
    const navigationPromise = page.waitForNavigation();
    await page.goto('https://survstat.rki.de/Content/Query/Create.aspx');
    await page.setViewport({ width: 1280, height: 640 });

    await page.waitForSelector(`${hierarchyDropdown} > .chosen-single > div > b`);
    await page.click(hierarchyDropdown);

    // Select type of response: li[data-option-array-index="2"] means
    // "Übermittlungskategorie" -> "Krankheit / Erreger"
    const typeSelect = '#ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_DropDownListFilterHierarchy_1_chosen > div > ul > li[data-option-array-index="2"]';
    await page.waitForSelector(typeSelect);
    await page.click(typeSelect);
    await navigationPromise;

    const diseaseSelect = 'tbody > tr > td > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_RepeaterFilterLevel_1_ListBoxFilterLevelMembers_0_chosen > .chosen-choices';
    const diseaseSelectCovid = '#ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_RepeaterFilterLevel_1_ListBoxFilterLevelMembers_0_chosen > div > ul > li';
    await page.waitForSelector(diseaseSelect);
    await page.click(diseaseSelect);
    // Enter the name of the disease, which is typed in and then selected
    await page.type(diseaseSelect, "COVID-19");
    await page.click(diseaseSelectCovid);
    await navigationPromise;

    // Select type of response for rows: li[data-option-array-index="21"] means
    // "in Zeilen" -> "Altersgruppierung: 5-Jahresintervalle"
    const xFilter = `${rowDropdown} > div > ul > li[data-option-array-index="21"]`;
    const rowDropdownArrow = `${rowDropdown} > .chosen-single > div > b`;
    await page.waitForSelector(rowDropdownArrow);
    await page.click(rowDropdownArrow);
    await page.waitForSelector(`${rowDropdown} > .chosen-drop > .chosen-results`);
    await page.click(xFilter);
    await navigationPromise;

    // Select type of response for columns: li[data-option-array-index="5"] means
    // "in Spalten" -> "Meldewoche"
    const yFilter = `${colDropdown} > div > ul > li[data-option-array-index="5"]`;
    const colDropdownLabel = `.teaser > ${colDropdown} > .chosen-single > span`;
    await page.waitForSelector(colDropdownLabel);
    await page.click(colDropdownLabel);
    await page.waitForSelector(`${colDropdown} > .chosen-drop > .chosen-results > .active-result`);
    await page.click(yFilter);
    await navigationPromise;

    // Hit download
    await page.waitForSelector(downloadButton);
    await page.click(downloadButton);
    await checkExistsWithTimeout(path); // once the file has been downloaded ...
  } finally {
    // ... close the browser window, also when a step above failed, so no
    // Chromium process is left behind.
    await browser.close();
  }

  // Unzip the file; awaiting the extraction surfaces extraction errors instead of
  // leaving them unhandled on a detached stream.
  await fs.createReadStream(path)
    .pipe(unzipper.Extract({ path: './output' }))
    .promise();
})().catch((err) => {
  // Report the failure and signal it to the calling shell.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment