-
-
Save jplattel/bb738d1ced3ca4c34b72b3be8a70ddd7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const fs = require('fs'); | |
| const path = require('path'); | |
| const pdfParse = require('pdf-parse'); | |
// Silence the noisy "Ran out of space in font" warnings (attributed to pdf.js
// by the original author) while forwarding every other warning untouched.
const originalWarn = console.warn;
console.warn = function filteredWarn(...args) {
  const [first] = args;
  const isFontNoise =
    typeof first === 'string' && first.includes('Ran out of space in font');
  if (isFontNoise) {
    return;
  }
  originalWarn.apply(console, args);
};
// Folder named "downloads" located next to this script.
const DOWNLOADS_DIR = path.resolve(__dirname, 'downloads');
// Lookup table from lowercase Dutch month name to its two-digit month number.
// The names are listed in calendar order, so the number is the position + 1.
const DUTCH_MONTHS = [
  'januari', 'februari', 'maart', 'april', 'mei', 'juni',
  'juli', 'augustus', 'september', 'oktober', 'november', 'december',
].reduce((table, name, index) => {
  table[name] = String(index + 1).padStart(2, '0');
  return table;
}, {});

// Regex alternation of every known month name.
const monthPattern = Object.keys(DUTCH_MONTHS).join('|');

// Matches "<1-2 digit day> <month name> <4 digit year>" as whole words,
// case-insensitively. The "g" flag makes this regex stateful (lastIndex), so
// callers must reset lastIndex before scanning a new string.
const dateRegex = new RegExp(
  String.raw`\b(\d{1,2})\s+(${monthPattern})\s+(\d{4})\b`,
  'gi'
);
/**
 * Build a `YYYY-MM-DD` string from the parts of a Dutch written date.
 *
 * Only the month is validated; `day` and `year` are used as given (the day is
 * left-padded with "0" to two characters), so no calendar range check is done.
 *
 * @param {string} day - Day of the month as a digit string, e.g. "5" or "05".
 * @param {string} month - Dutch month name in any letter case, e.g. "Maart".
 * @param {string} year - Year as a digit string, e.g. "2024".
 * @returns {string} The date formatted as `YYYY-MM-DD`.
 * @throws {RangeError} If `month` is not a key of DUTCH_MONTHS.
 */
function parseDutchDate(day, month, year) {
  const monthKey = month.toLowerCase();
  // Own-property check: a plain lookup would let inherited keys such as
  // "constructor" through and would turn unknown names into "undefined".
  if (!Object.prototype.hasOwnProperty.call(DUTCH_MONTHS, monthKey)) {
    throw new RangeError(`Unknown Dutch month name: ${month}`);
  }
  return `${year}-${DUTCH_MONTHS[monthKey]}-${day.padStart(2, '0')}`;
}
/**
 * Collect every distinct Dutch written date that occurs in `text`.
 *
 * Dates are de-duplicated on their ISO form; the first occurrence wins and
 * the result keeps the order in which the dates first appear.
 *
 * @param {string} text - Text to scan.
 * @returns {{raw: string, iso: string}[]} The matched text and its ISO form.
 */
function extractDatesFromText(text) {
  const firstByIso = new Map();
  // dateRegex is shared and global ("g"), so always restart from the beginning.
  dateRegex.lastIndex = 0;
  for (let hit = dateRegex.exec(text); hit !== null; hit = dateRegex.exec(text)) {
    const [raw, day, month, year] = hit;
    const iso = parseDutchDate(day, month, year);
    if (!firstByIso.has(iso)) {
      firstByIso.set(iso, { raw, iso });
    }
  }
  return [...firstByIso.values()];
}
/**
 * Return the first Dutch written date in `text`, which the original author
 * treats as the document's decision date.
 *
 * @param {string} text - Text to scan.
 * @returns {{raw: string, iso: string} | null} The first date, or null when
 *   the text contains none.
 */
function findDecisionDate(text) {
  // dateRegex is shared and global ("g"), so restart from the beginning.
  dateRegex.lastIndex = 0;
  const firstHit = dateRegex.exec(text);
  if (!firstHit) {
    return null;
  }
  const [raw, day, month, year] = firstHit;
  return { raw, iso: parseDutchDate(day, month, year) };
}
/**
 * Recursively list every file under `dir` whose name ends in ".pdf"
 * (case-insensitive).
 *
 * Entries are visited in the order `fs.readdirSync` returns them, and a
 * sub-directory's files are emitted at the position of that sub-directory.
 *
 * @param {string} dir - Directory to search.
 * @returns {string[]} Paths of the PDFs found, each prefixed with `dir`.
 */
function findPdfs(dir) {
  const pdfPaths = [];
  const walk = (currentDir) => {
    fs.readdirSync(currentDir, { withFileTypes: true }).forEach((item) => {
      const itemPath = path.join(currentDir, item.name);
      if (item.isDirectory()) {
        walk(itemPath);
        return;
      }
      if (item.name.toLowerCase().endsWith('.pdf')) {
        pdfPaths.push(itemPath);
      }
    });
  };
  walk(dir);
  return pdfPaths;
}
// Entry point: find every PDF under DOWNLOADS_DIR, extract the decision date
// and all distinct dates from each one, and write the collected results to
// extracted_dates.json next to this script.
(async () => {
  const pdfs = findPdfs(DOWNLOADS_DIR);
  console.log(`Found ${pdfs.length} PDFs`);

  const results = [];
  let processed = 0; // counts successfully parsed PDFs only

  for (const pdf of pdfs) {
    const relative = path.relative(DOWNLOADS_DIR, pdf);
    try {
      const buffer = fs.readFileSync(pdf);
      const data = await pdfParse(buffer);
      const text = data.text || '';
      const decisionDate = findDecisionDate(text);
      const dates = extractDatesFromText(text);
      results.push({
        file: relative,
        decisionDate,
        dates
      });
      processed++;
      if (processed % 20 === 0) {
        console.log(` ${processed}/${pdfs.length}...`);
      }
    } catch (e) {
      // Record the failure and continue, so one unreadable PDF does not abort
      // the whole run. Rejections are not guaranteed to be Error instances.
      const message = e instanceof Error ? e.message : String(e);
      console.log(` ERR: ${relative} - ${message}`);
      results.push({ file: relative, decisionDate: null, dates: [], error: message });
    }
  }

  const output = path.join(__dirname, 'extracted_dates.json');
  fs.writeFileSync(output, JSON.stringify(results, null, 2));
  console.log(`\nDone! ${processed} PDFs processed. Output: extracted_dates.json`);
})().catch((err) => {
  // Failures outside the per-file try/catch (listing the downloads directory,
  // writing the output file) would otherwise surface as an unhandled rejection.
  const message = err instanceof Error ? err.message : String(err);
  console.error(`Fatal: ${message}`);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment