-
-
Save jplattel/032067d7469adad6cbe773d1a6cf1af3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const puppeteer = require('puppeteer-core'); | |
| const fs = require('fs'); | |
| const path = require('path'); | |
| const https = require('https'); | |
| const http = require('http'); | |
| const pubs = require('./all_publications.json'); | |
| const BASE = 'https://www.uwv.nl'; | |
| const OUTPUT_DIR = path.join(__dirname, 'downloads'); | |
/**
 * Split a publication URL into its year and slug.
 *
 * Example: `/nl/wet-open-overheid/woo-publicaties/2026/slug-here`
 * yields `{ year: '2026', slug: 'slug-here' }`.
 *
 * The year is the first path segment consisting of exactly four digits; the
 * slug is everything after it, joined with `/`.
 *
 * @param {string} url - Absolute or root-relative publication URL.
 * @returns {{ year: string, slug: string }} The year segment and the remaining path.
 * @throws {Error} When the URL contains no four-digit year segment.
 */
function parseUrl(url) {
  // Query strings and fragments are not part of the publication path.
  const [pathOnly] = String(url).split(/[?#]/);

  // Dropping empty segments keeps a trailing slash (or the `//` after the
  // scheme) from leaking into the slug.
  const parts = pathOnly.split('/').filter(Boolean);

  const yearIdx = parts.findIndex((p) => /^\d{4}$/.test(p));
  if (yearIdx === -1) {
    // Without this guard `year` would be undefined and `slug` would be the
    // whole URL, which only fails later with an opaque error.
    throw new Error(`No 4-digit year segment in publication URL: ${url}`);
  }

  return {
    year: parts[yearIdx],
    slug: parts.slice(yearIdx + 1).join('/'),
  };
}
/**
 * Download `url` to `dest` over HTTP(S), following redirects.
 *
 * Behaviour:
 * - If `dest` already exists, nothing is fetched (logs SKIP) and the promise
 *   resolves with `true`.
 * - A non-200 final response is logged as FAIL and resolves with `false`;
 *   it does not reject, so one missing file does not abort a batch.
 * - The body is streamed to `<dest>.part` and renamed to `dest` only after
 *   the write finished, so an interrupted transfer never leaves a truncated
 *   file that a later run would treat as "exists".
 * - The destination directory is created only once a 200 response arrives.
 *
 * @param {string} url - Absolute http:// or https:// URL.
 * @param {string} dest - Destination file path.
 * @param {number} [maxRedirects=10] - Remaining redirects that may be followed.
 * @returns {Promise<boolean>} `true` when `dest` is in place, `false` on a non-200 response.
 *   Rejects on network/file-system errors or when the redirect limit is exceeded.
 */
function downloadFile(url, dest, maxRedirects = 10) {
  return new Promise((resolve, reject) => {
    if (fs.existsSync(dest)) {
      console.log(` SKIP (exists): ${path.basename(dest)}`);
      resolve(true);
      return;
    }

    const proto = url.startsWith('https') ? https : http;

    proto
      .get(url, { headers: { 'User-Agent': 'Mozilla/5.0' } }, (res) => {
        const { statusCode } = res;

        if (statusCode >= 300 && statusCode < 400 && res.headers.location) {
          // Drain the redirect response so its socket is released.
          res.resume();
          if (maxRedirects <= 0) {
            reject(new Error(`Too many redirects: ${url}`));
            return;
          }
          let redirect;
          try {
            // Resolve relative Location headers against the URL that was
            // actually requested, not against a fixed host.
            redirect = new URL(res.headers.location, url).href;
          } catch (err) {
            reject(err);
            return;
          }
          downloadFile(redirect, dest, maxRedirects - 1).then(resolve, reject);
          return;
        }

        if (statusCode !== 200) {
          res.resume();
          console.log(` FAIL (${statusCode}): ${url}`);
          resolve(false);
          return;
        }

        try {
          fs.mkdirSync(path.dirname(dest), { recursive: true });
        } catch (err) {
          res.destroy();
          reject(err);
          return;
        }

        const partial = `${dest}.part`;
        const file = fs.createWriteStream(partial);
        let settled = false;

        // Single failure path: stop both streams, remove the partial file,
        // then reject. Guarded because several error events may fire.
        const fail = (err) => {
          if (settled) return;
          settled = true;
          res.destroy();
          file.destroy();
          fs.unlink(partial, () => reject(err));
        };

        // pipe() does not forward source errors, so listen on both ends.
        res.on('error', fail);
        res.on('aborted', () => fail(new Error(`Connection aborted: ${url}`)));
        file.on('error', fail);
        file.on('finish', () => {
          if (settled) return;
          settled = true;
          fs.rename(partial, dest, (err) => {
            if (err) {
              fs.unlink(partial, () => reject(err));
              return;
            }
            resolve(true);
          });
        });

        res.pipe(file);
      })
      .on('error', reject);
  });
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Entry point.
 *
 * Connects to an already-running browser over WebSocket, visits every
 * publication listed in `pubs`, collects the document links on each page and
 * downloads those whose filename or label mentions "besluit", "beslissing"
 * or "verzoek" into `OUTPUT_DIR/<year>/<slug>/`.
 *
 * Publications are processed in batches of `CONCURRENCY`, with a short pause
 * between batches. A failure in one publication or file is logged and
 * counted; it never aborts the rest of the run.
 */
(async () => {
  const browser = await puppeteer.connect({ browserWSEndpoint: 'ws://localhost:3000' });

  // Number of publication pages opened at the same time.
  const CONCURRENCY = 3;
  // `icon-name` values that mark a link as a downloadable document.
  const DOCUMENT_ICONS = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'csv', 'xml', 'file', 'download'];
  // A file is downloaded only if its name or label contains one of these.
  const KEYWORDS = ['besluit', 'beslissing', 'verzoek'];

  let downloaded = 0;
  let skipped = 0;
  let errors = 0;

  /**
   * Derive a safe local filename from a file URL.
   *
   * The last path segment is percent-decoded and then reduced to its basename
   * again, so an encoded separator (`%2F`) cannot place the file outside the
   * publication directory.
   *
   * @param {string} fileUrl - Absolute file URL.
   * @returns {string} A plain filename.
   * @throws {Error} When no usable filename can be derived.
   */
  function localFilename(fileUrl) {
    const rawName = path.basename(new URL(fileUrl).pathname);
    let decoded;
    try {
      decoded = decodeURIComponent(rawName);
    } catch (e) {
      // Malformed percent-escape: fall back to the undecoded name.
      decoded = rawName;
    }
    const name = path.basename(decoded);
    if (name === '' || name === '.' || name === '..') {
      throw new Error(`cannot derive a filename from ${fileUrl}`);
    }
    return name;
  }

  /**
   * Visit one publication page and download its matching documents.
   * Never rejects: every failure is logged and added to `errors`.
   *
   * @param {{ id: *, url: string }} pub - Entry from `pubs`.
   * @returns {Promise<void>}
   */
  async function processPublication(pub) {
    const tag = `[${pub.id}/${pubs.length}]`;
    let slug = pub.url;
    let page = null;

    try {
      // URL parsing and path building sit inside the try so that a malformed
      // entry is counted as an error instead of rejecting the whole batch.
      const parsed = parseUrl(pub.url);
      slug = parsed.slug;
      const pubDir = path.join(OUTPUT_DIR, parsed.year, slug);
      const fullUrl = new URL(pub.url, BASE).href;

      // An existing output directory marks the publication as already handled.
      if (fs.existsSync(pubDir)) {
        console.log(`${tag} ${slug} - SKIP (dir exists)`);
        skipped++;
        return;
      }

      page = await browser.newPage();
      await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 60000 });

      // Extract only document links (pdf, doc, xls, zip icons) — skip navigation/footer links.
      const fileLinks = await page.evaluate((icons) => {
        const links = document.querySelectorAll('bgl-list-link[link-url]');
        return Array.from(links)
          .filter((el) => {
            const icon = (el.getAttribute('icon-name') || '').toLowerCase();
            const url = (el.getAttribute('link-url') || '').toLowerCase();
            // Include if icon matches a document type, or URL points to a file.
            return icons.includes(icon) || /\.(pdf|docx?|xlsx?|zip|csv|xml)(\?|$)/i.test(url);
          })
          .map((el) => ({
            url: el.getAttribute('link-url'),
            label: el.getAttribute('link-label') || '',
          }));
      }, DOCUMENT_ICONS);

      if (fileLinks.length === 0) {
        console.log(`${tag} ${slug} - no files`);
        skipped++;
        return;
      }

      console.log(`${tag} ${slug} - ${fileLinks.length} file(s)`);

      for (const file of fileLinks) {
        let filename = file.url;
        try {
          // Resolve against the page URL so relative and protocol-relative
          // links work as well as root-relative ones.
          const fileUrl = new URL(file.url, fullUrl).href;
          filename = localFilename(fileUrl);

          const haystacks = [filename.toLowerCase(), file.label.toLowerCase()];
          const wanted = KEYWORDS.some((kw) => haystacks.some((text) => text.includes(kw)));
          if (!wanted) {
            skipped++;
            continue;
          }

          const dest = path.join(pubDir, filename);
          await downloadFile(fileUrl, dest);

          // downloadFile logs FAIL and still resolves on a non-200 response,
          // so verify the file is really there before reporting success.
          if (!fs.existsSync(dest)) {
            errors++;
            continue;
          }
          console.log(` OK: ${filename}`);
          downloaded++;
        } catch (e) {
          console.log(` ERR: ${filename} - ${e.message}`);
          errors++;
        }
      }
    } catch (e) {
      console.log(`${tag} ${slug} - page error: ${e.message}`);
      errors++;
    } finally {
      if (page) {
        try {
          await page.close();
        } catch (e) {
          // A page that cannot be closed must not fail the batch.
          console.log(`${tag} ${slug} - could not close page: ${e.message}`);
        }
      }
    }
  }

  /**
   * Process all publications in consecutive batches of `CONCURRENCY`,
   * waiting for each batch to finish before starting the next.
   *
   * @returns {Promise<void>}
   */
  async function runBatch() {
    for (let start = 0; start < pubs.length; start += CONCURRENCY) {
      const batch = pubs.slice(start, start + CONCURRENCY);
      await Promise.all(batch.map((pub) => processPublication(pub)));
      await sleep(500); // Small delay between batches
    }
  }

  try {
    console.log(`Starting download of ${pubs.length} publications...`);
    await runBatch();
    console.log(`\nDone! Downloaded: ${downloaded}, Skipped: ${skipped}, Errors: ${errors}`);
  } finally {
    // Always release the connection, even if the run failed part-way.
    await browser.disconnect();
  }
})().catch((err) => {
  console.error(`Fatal: ${err.message}`);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment