Skip to content

Instantly share code, notes, and snippets.

@jplattel
Created April 8, 2026 20:49
Show Gist options
  • Select an option

  • Save jplattel/bb738d1ced3ca4c34b72b3be8a70ddd7 to your computer and use it in GitHub Desktop.

Select an option

Save jplattel/bb738d1ced3ca4c34b72b3be8a70ddd7 to your computer and use it in GitHub Desktop.
const fs = require('fs');
const path = require('path');
const pdfParse = require('pdf-parse');
// Silence the pdf.js "Ran out of space in font" warning; every other
// warning is forwarded untouched to the original console.warn.
const originalWarn = console.warn;
console.warn = (...args) => {
  const [first] = args;
  const isFontNoise =
    typeof first === 'string' && first.includes('Ran out of space in font');
  if (!isFontNoise) {
    originalWarn.call(console, ...args);
  }
};
// Directory that is scanned for PDFs: the `downloads` folder next to this script.
const DOWNLOADS_DIR = path.join(__dirname, 'downloads');
// Lowercase Dutch month names mapped to their two-digit month number
// ('januari' -> '01' ... 'december' -> '12'), built from the ordered name list.
const DUTCH_MONTHS = Object.fromEntries(
  [
    'januari', 'februari', 'maart', 'april', 'mei', 'juni',
    'juli', 'augustus', 'september', 'oktober', 'november', 'december',
  ].map((name, index) => [name, String(index + 1).padStart(2, '0')]),
);
// All month names joined as a regex alternation.
const monthPattern = Object.keys(DUTCH_MONTHS).join('|');
// Matches "<1-2 digit day> <month name> <4 digit year>" on word boundaries.
// Capture groups: 1 = day, 2 = month name, 3 = year. The regex is global and
// case-insensitive, so it carries `lastIndex` state between exec() calls.
const dateRegex = new RegExp(
  String.raw`\b(\d{1,2})\s+(${monthPattern})\s+(\d{4})\b`,
  'gi',
);
/**
 * Build a `YYYY-MM-DD` string from the parts of a Dutch written date.
 *
 * @param {string} day - Day of the month; left-padded with '0' to two characters.
 * @param {string} month - Dutch month name in any letter case, looked up in DUTCH_MONTHS.
 * @param {string} year - Year, inserted as given.
 * @returns {string} The combined `year-month-day` string.
 */
function parseDutchDate(day, month, year) {
  const monthKey = month.toLowerCase();
  const monthNumber = DUTCH_MONTHS[monthKey];
  const paddedDay = day.padStart(2, '0');
  return `${year}-${monthNumber}-${paddedDay}`;
}
/**
 * Collect every Dutch written date in `text`, keeping one entry per distinct
 * ISO date (the first occurrence wins) in order of appearance.
 *
 * @param {string} text - Text to scan.
 * @returns {{raw: string, iso: string}[]} `raw` is the matched text, `iso`
 *   the value returned by parseDutchDate for that match.
 */
function extractDatesFromText(text) {
  // matchAll starts from the regex's current lastIndex, so rewind the shared
  // global regex before scanning.
  dateRegex.lastIndex = 0;
  const firstByIso = new Map();
  for (const [raw, day, month, year] of String(text).matchAll(dateRegex)) {
    const iso = parseDutchDate(day, month, year);
    if (!firstByIso.has(iso)) {
      firstByIso.set(iso, { raw, iso });
    }
  }
  return [...firstByIso.values()];
}
/**
 * Return the first Dutch written date that occurs in `text`.
 * The first date in the document is treated as the decision date.
 *
 * @param {string} text - Text to scan.
 * @returns {{raw: string, iso: string} | null} The first match, or null when
 *   the text contains no date.
 */
function findDecisionDate(text) {
  // The regex is global and shared, so always start scanning from the beginning.
  dateRegex.lastIndex = 0;
  const firstHit = dateRegex.exec(text);
  if (!firstHit) {
    return null;
  }
  const [raw, day, month, year] = firstHit;
  return { raw, iso: parseDutchDate(day, month, year) };
}
/**
 * Recursively list the PDF files under `dir`.
 *
 * @param {string} dir - Directory to walk.
 * @returns {string[]} Paths (each is `dir` joined with the nested entry names)
 *   of every non-directory entry whose name ends in ".pdf", compared
 *   case-insensitively, in directory-listing order.
 */
function findPdfs(dir) {
  const entries = fs.readdirSync(dir, { withFileTypes: true });
  return entries.flatMap((entry) => {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      return findPdfs(fullPath);
    }
    return entry.name.toLowerCase().endsWith('.pdf') ? [fullPath] : [];
  });
}
// Script entry point: parse every PDF under DOWNLOADS_DIR, extract the Dutch
// dates from its text, and write all results to extracted_dates.json next to
// this script. A PDF that fails to read or parse is recorded with an `error`
// field instead of aborting the run.
(async () => {
  const pdfs = findPdfs(DOWNLOADS_DIR);
  console.log(`Found ${pdfs.length} PDFs`);
  const results = [];
  // Counts only PDFs that were parsed successfully.
  let processed = 0;
  for (const pdf of pdfs) {
    const relative = path.relative(DOWNLOADS_DIR, pdf);
    try {
      const buffer = fs.readFileSync(pdf);
      const data = await pdfParse(buffer);
      const text = data.text || '';
      const decisionDate = findDecisionDate(text);
      const dates = extractDatesFromText(text);
      results.push({
        file: relative,
        decisionDate,
        dates
      });
      processed++;
      // Progress line every 20 successfully parsed files.
      if (processed % 20 === 0) {
        console.log(` ${processed}/${pdfs.length}...`);
      }
    } catch (e) {
      console.log(` ERR: ${relative} - ${e.message}`);
      results.push({ file: relative, decisionDate: null, dates: [], error: e.message });
    }
  }
  const output = path.join(__dirname, 'extracted_dates.json');
  fs.writeFileSync(output, JSON.stringify(results, null, 2));
  console.log(`\nDone! ${processed} PDFs processed. Output: extracted_dates.json`);
})().catch((err) => {
  // Failures outside the per-file try/catch (e.g. the downloads directory is
  // missing or the output file cannot be written) would otherwise surface only
  // as an unhandled promise rejection. Report them and exit non-zero.
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment