Skip to content

Instantly share code, notes, and snippets.

@jplattel
Created April 8, 2026 20:47
Show Gist options
  • Select an option

  • Save jplattel/032067d7469adad6cbe773d1a6cf1af3 to your computer and use it in GitHub Desktop.

Select an option

Save jplattel/032067d7469adad6cbe773d1a6cf1af3 to your computer and use it in GitHub Desktop.
// Third-party: used below to attach to an already-running browser over WebSocket.
const puppeteer = require('puppeteer-core');
// Node built-ins for filesystem access, path handling and plain HTTP(S) downloads.
const fs = require('fs');
const path = require('path');
const https = require('https');
const http = require('http');
// List of publications to process; the script reads `url` and `id` from each entry.
const pubs = require('./all_publications.json');
// Site origin used to turn site-relative paths into absolute URLs.
const BASE = 'https://www.uwv.nl';
// Root folder for downloaded files, located next to this script.
const OUTPUT_DIR = path.join(__dirname, 'downloads');
/**
 * Split a publication URL into its year and slug.
 *
 * The year is the first path segment made of exactly four digits; the slug is
 * everything after it, re-joined with '/'.
 * Example: /nl/wet-open-overheid/woo-publicaties/2026/slug-here
 *          -> { year: '2026', slug: 'slug-here' }
 *
 * When no four-digit segment exists, `year` is undefined and `slug` is the
 * whole input.
 *
 * @param {string} url - Publication URL or path.
 * @returns {{year: (string|undefined), slug: string}}
 */
function parseUrl(url) {
  const segments = url.split('/');

  // Locate the first segment that looks like a year; stays at -1 if none does.
  let yearPos = -1;
  for (let i = 0; i < segments.length; i++) {
    if (/^\d{4}$/.test(segments[i])) {
      yearPos = i;
      break;
    }
  }

  return {
    year: segments[yearPos],
    slug: segments.slice(yearPos + 1).join('/'),
  };
}
/**
 * Download `url` to `dest` over HTTP(S), following redirects.
 *
 * If `dest` already exists nothing is fetched. Otherwise the body is streamed
 * to `<dest>.part` and renamed to `dest` only once it has been written
 * completely, so an interrupted transfer never leaves a truncated file that a
 * later run would mistake for a finished download. The destination directory
 * is created only after a 200 response has been received.
 *
 * @param {string} url - Absolute http(s) URL to fetch.
 * @param {string} dest - Path of the file to create.
 * @param {number} [redirectsLeft=5] - Redirect hops still allowed.
 * @returns {Promise<void>} Resolves once `dest` exists (already present or
 *   freshly downloaded). Rejects on a non-200 final status, too many
 *   redirects, an idle timeout, or any network / filesystem error.
 */
function downloadFile(url, dest, redirectsLeft = 5) {
  // Abort when the socket has been idle this long, so a stalled server
  // cannot hang the caller forever.
  const IDLE_TIMEOUT_MS = 60000;

  return new Promise((resolve, reject) => {
    if (fs.existsSync(dest)) {
      console.log(` SKIP (exists): ${path.basename(dest)}`);
      return resolve();
    }

    const proto = url.startsWith('https') ? https : http;
    const req = proto.get(url, { headers: { 'User-Agent': 'Mozilla/5.0' } }, (res) => {
      const statusCode = res.statusCode;
      const location = res.headers.location;

      if (statusCode >= 300 && statusCode < 400 && location) {
        // Drain the unused body so the socket is released.
        res.resume();
        if (redirectsLeft <= 0) {
          return reject(new Error(`Too many redirects: ${url}`));
        }
        let redirect;
        try {
          // Relative Location headers are relative to the URL just requested.
          redirect = new URL(location, url).href;
        } catch (err) {
          return reject(err);
        }
        downloadFile(redirect, dest, redirectsLeft - 1).then(resolve, reject);
        return;
      }

      if (statusCode !== 200) {
        res.resume();
        // Reject so the caller can report the failure instead of a success.
        return reject(new Error(`HTTP ${statusCode} for ${url}`));
      }

      fs.mkdirSync(path.dirname(dest), { recursive: true });
      const partial = `${dest}.part`;
      const file = fs.createWriteStream(partial);

      let failed = false;
      const fail = (err) => {
        if (failed) return;
        failed = true;
        res.destroy();
        file.destroy();
        // Best-effort cleanup; the unlink result does not change the outcome.
        fs.unlink(partial, () => reject(err));
      };

      res.on('error', fail);
      // pipe() does not forward source failures, so detect a truncated body here.
      res.on('close', () => {
        if (!res.complete) {
          fail(new Error(`Connection closed before download completed: ${url}`));
        }
      });
      file.on('error', fail);
      file.on('finish', () => {
        // Make sure the descriptor is closed before the file is renamed.
        file.close((closeErr) => {
          if (closeErr) return fail(closeErr);
          fs.rename(partial, dest, (renameErr) => {
            if (renameErr) return fail(renameErr);
            resolve();
          });
        });
      });
      res.pipe(file);
    });

    req.on('error', reject);
    req.setTimeout(IDLE_TIMEOUT_MS, () => {
      req.destroy(new Error(`Timed out after ${IDLE_TIMEOUT_MS} ms: ${url}`));
    });
  });
}
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - Delay passed to setTimeout.
 * @returns {Promise<void>} Settles once the timer has fired.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Script entry point.
 *
 * Connects to a browser at ws://localhost:3000, opens every publication page
 * from `pubs` (CONCURRENCY pages at a time), collects the document links on
 * the page and downloads those whose filename or label mentions "besluit",
 * "beslissing" or "verzoek" into OUTPUT_DIR/<year>/<slug>/.
 *
 * A failure in one publication or one file is logged and counted; it never
 * aborts the remaining work. The browser connection is always released.
 */
(async () => {
  const browser = await puppeteer.connect({ browserWSEndpoint: 'ws://localhost:3000' });

  // Number of publication pages processed at the same time.
  const CONCURRENCY = 3;
  // Icon names that mark a link as a document (as opposed to navigation/footer links).
  const DOCUMENT_ICONS = ['pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip', 'csv', 'xml', 'file', 'download'];
  // A file is downloaded only if its filename or label contains one of these.
  const KEYWORDS = ['besluit', 'beslissing', 'verzoek'];

  let idx = 0;
  let downloaded = 0;
  let skipped = 0;
  let errors = 0;

  /**
   * Process a single publication: open its page, find document links and
   * download the relevant ones. Errors are logged and counted, not thrown.
   */
  async function processPublication(pub) {
    const tag = `[${pub.id}/${pubs.length}]`;
    const { year, slug } = parseUrl(pub.url);
    if (!year || !slug) {
      // Without both parts there is no valid target directory; path.join
      // would throw on an undefined year and take the whole run down.
      console.log(`${tag} ${pub.url} - page error: no year/slug in URL`);
      errors++;
      return;
    }

    const pubDir = path.join(OUTPUT_DIR, year, slug);
    const fullUrl = pub.url.startsWith('http') ? pub.url : BASE + pub.url;
    if (fs.existsSync(pubDir)) {
      console.log(`${tag} ${slug} - SKIP (dir exists)`);
      skipped++;
      return;
    }

    let page;
    try {
      // Opened inside the try so a failure to create a tab is a page error.
      page = await browser.newPage();
      await page.goto(fullUrl, { waitUntil: 'networkidle2', timeout: 60000 });

      // Extract only document links (pdf, doc, xls, zip icons) — skip navigation/footer links
      const fileLinks = await page.evaluate((icons) => {
        const links = document.querySelectorAll('bgl-list-link[link-url]');
        return Array.from(links)
          .filter((el) => {
            const icon = (el.getAttribute('icon-name') || '').toLowerCase();
            const url = (el.getAttribute('link-url') || '').toLowerCase();
            // Include if icon matches a document type, or URL points to a file
            return icons.includes(icon) || /\.(pdf|docx?|xlsx?|zip|csv|xml)(\?|$)/i.test(url);
          })
          .map((el) => ({
            url: el.getAttribute('link-url'),
            label: el.getAttribute('link-label') || ''
          }));
      }, DOCUMENT_ICONS);

      if (fileLinks.length === 0) {
        console.log(`${tag} ${slug} - no files`);
        skipped++;
        return;
      }
      console.log(`${tag} ${slug} - ${fileLinks.length} file(s)`);

      for (const file of fileLinks) {
        // Fallback name for the log in case the link cannot be parsed at all.
        let filename = file.url;
        try {
          // Resolve against the page URL so every relative form works.
          const fileUrl = new URL(file.url, fullUrl);
          // Decode first, then take the basename: an encoded "../" in a link
          // taken from the page must not be able to escape pubDir.
          filename = path.basename(decodeURIComponent(fileUrl.pathname));

          const haystacks = [filename.toLowerCase(), file.label.toLowerCase()];
          const wanted = KEYWORDS.some((kw) => haystacks.some((text) => text.includes(kw)));
          if (!wanted) {
            skipped++;
            continue;
          }

          await downloadFile(fileUrl.href, path.join(pubDir, filename));
          console.log(` OK: ${filename}`);
          downloaded++;
        } catch (e) {
          // One bad link or failed download must not stop the other files.
          console.log(` ERR: ${filename} - ${e.message}`);
          errors++;
        }
      }
    } catch (e) {
      console.log(`${tag} ${slug} - page error: ${e.message}`);
      errors++;
    } finally {
      if (page) {
        // A failing close is logged only; it must not reject the whole batch.
        await page.close().catch((e) => {
          console.log(`${tag} ${slug} - close error: ${e.message}`);
        });
      }
    }
  }

  // Run with concurrency: start up to CONCURRENCY publications, wait for all
  // of them, pause briefly, repeat.
  async function runBatch() {
    while (idx < pubs.length) {
      const batch = [];
      for (let i = 0; i < CONCURRENCY && idx < pubs.length; i++) {
        batch.push(processPublication(pubs[idx]));
        idx++;
      }
      await Promise.all(batch);
      await sleep(500); // Small delay between batches
    }
  }

  try {
    console.log(`Starting download of ${pubs.length} publications...`);
    await runBatch();
    console.log(`\nDone! Downloaded: ${downloaded}, Skipped: ${skipped}, Errors: ${errors}`);
  } finally {
    // Release the connection even when the run is cut short by an error.
    await browser.disconnect();
  }
})().catch((err) => {
  // Last-resort handler so a fatal error is reported, not an unhandled rejection.
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment