Skip to content

Instantly share code, notes, and snippets.

@munichrocker
Last active September 15, 2020 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save munichrocker/b273e043db5c74e27f3bd73f6ccae9e9 to your computer and use it in GitHub Desktop.
Save munichrocker/b273e043db5c74e27f3bd73f6ccae9e9 to your computer and use it in GitHub Desktop.
Scrape RKI's Survstat
const puppeteer = require('puppeteer');
const fs = require('fs');
const unzipper = require('unzipper');
// Location of the downloaded archive, relative to the current working directory.
const path = "./survstat.zip";
// Delete a stale survstat.zip before each run so that the later existence check
// can only succeed on a freshly downloaded file.
try {
  fs.unlinkSync(path);
} catch (err) {
  if (err.code !== 'ENOENT') {
    // Anything other than "file does not exist" (e.g. EPERM, EBUSY) would leave
    // the old archive in place and make the run silently reuse stale data.
    throw err;
  }
  console.error("No file found!");
}
/**
 * Polls the filesystem until `path` becomes accessible or `timeout` elapses.
 *
 * The first check runs after one polling interval (timeout / 6) and is repeated
 * at that interval for as long as fs.access reports an error.
 *
 * @param {string} path - File path to wait for.
 * @param {number} [timeout=20000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<string>} Resolves with `path` once fs.access succeeds;
 *   rejects with an Error whose name is 'PATH_CHECK_TIMED_OUT' on timeout.
 */
function checkExistsWithTimeout(path, timeout = 20000) {
  return new Promise((resolve, reject) => {
    const timeoutTimerId = setTimeout(handleTimeout, timeout);
    const interval = timeout / 6;
    let intervalTimerId;
    // Guards against an in-flight fs.access callback acting after the promise
    // has already been resolved or rejected.
    let settled = false;

    function handleTimeout() {
      settled = true;
      // Stop polling; the pending poll timer is the one that must be cancelled.
      clearTimeout(intervalTimerId);
      const error = new Error('path check timed out');
      error.name = 'PATH_CHECK_TIMED_OUT';
      reject(error);
    }

    function handleInterval() {
      fs.access(path, (err) => {
        if (settled) {
          return;
        }
        if (err) {
          // Not accessible yet: schedule the next poll.
          intervalTimerId = setTimeout(handleInterval, interval);
        } else {
          settled = true;
          clearTimeout(timeoutTimerId);
          resolve(path);
        }
      });
    }

    intervalTimerId = setTimeout(handleInterval, interval);
  });
}
// Start Scraping: drive the SurvStat query form in headless Chromium, download
// the result archive to the working directory and extract it into ./output.
(async () => {
  const browser = await puppeteer.launch({ headless: true }); // starting in headless mode
  try {
    const page = await browser.newPage();

    // Allow file downloads. Uses a public CDP session instead of the private
    // page._client property; the download directory is passed as an absolute path.
    const cdpSession = await page.target().createCDPSession();
    await cdpSession.send('Page.setDownloadBehavior', { behavior: 'allow', downloadPath: process.cwd() }); // set download path here

    // NOTE(review): this promise is created once, before the initial page load, so
    // after page.goto() has finished every later `await navigationPromise` returns
    // immediately. The steps below are effectively synchronised by the
    // waitForSelector calls. Confirm whether the form selections trigger real
    // navigations before replacing these with per-click waitForNavigation calls.
    const navigationPromise = page.waitForNavigation();
    await page.goto('https://survstat.rki.de/Content/Query/Create.aspx');
    await page.setViewport({ width: 1280, height: 640 });

    // Open the first filter dropdown.
    await page.waitForSelector('td > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_DropDownListFilterHierarchy_1_chosen > .chosen-single > div > b');
    await page.click('td > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_DropDownListFilterHierarchy_1_chosen');

    // Select type of response: li[data-option-array-index="2"] means "Übermittlungskategorie" -> "Krankheit / Erreger"
    const typeSelect = '#ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_DropDownListFilterHierarchy_1_chosen > div > ul > li[data-option-array-index="2"]';
    await page.waitForSelector(typeSelect);
    await page.click(typeSelect);
    await navigationPromise;

    const diseaseSelect = 'tbody > tr > td > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_RepeaterFilterLevel_1_ListBoxFilterLevelMembers_0_chosen > .chosen-choices';
    const diseaseSelectCovid = '#ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_RepeaterFilter_RepeaterFilterLevel_1_ListBoxFilterLevelMembers_0_chosen > div > ul > li';
    await page.waitForSelector(diseaseSelect);
    await page.click(diseaseSelect);
    // Enter Name of Disease which will be typed in and selected
    await page.type(diseaseSelect, "COVID-19");
    await page.click(diseaseSelectCovid);
    await navigationPromise;

    // Select type of response for Rows: li[data-option-array-index="21"] means "in Zeilen" -> "Altersgruppierung: 5-Jahresintervalle"
    const xFilter = '.partitionLeft > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListRowHierarchy_chosen > div > ul > li[data-option-array-index="21"]';
    await page.waitForSelector('.partitionLeft > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListRowHierarchy_chosen > .chosen-single > div > b');
    await page.click('.partitionLeft > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListRowHierarchy_chosen > .chosen-single > div > b');
    await page.waitForSelector('.partitionLeft > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListRowHierarchy_chosen > .chosen-drop > .chosen-results');
    await page.click(xFilter);
    await navigationPromise;

    // Select type of response for Columns: li[data-option-array-index="5"] means "in Spalten" -> "Meldewoche"
    const yFilter = '.partitionRight > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListColHierarchy_chosen > div > ul > li[data-option-array-index="5"]';
    await page.waitForSelector('.teaser > .partitionRight > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListColHierarchy_chosen > .chosen-single > span');
    await page.click('.teaser > .partitionRight > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListColHierarchy_chosen > .chosen-single > span');
    await page.waitForSelector('.partitionRight > #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_DropDownListColHierarchy_chosen > .chosen-drop > .chosen-results > .active-result');
    await page.click(yFilter);
    await navigationPromise;

    // hit download
    await page.waitForSelector('#wrapperContent > #content > #main #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_ButtonDownload');
    await page.click('#wrapperContent > #content > #main #ContentPlaceHolderMain_ContentPlaceHolderAltGridFull_ButtonDownload');

    await checkExistsWithTimeout(path); // wait until the downloaded file shows up ...
  } finally {
    // ... and close the browser window, also when a step above failed, so no
    // Chromium process is left running.
    await browser.close();
  }

  // unzip file; wait for the extraction to finish and surface stream errors
  await new Promise((resolve, reject) => {
    const archive = fs.createReadStream(path);
    const extractor = unzipper.Extract({ path: './output' });
    archive.on('error', reject);
    extractor.on('error', reject);
    extractor.on('close', resolve);
    archive.pipe(extractor);
  });
})().catch((err) => {
  // Report the failure and signal it through the exit code instead of leaving
  // an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment