Skip to content

Instantly share code, notes, and snippets.

@Pazzilivo
Last active January 13, 2023 00:10
Show Gist options
  • Save Pazzilivo/302ef71ebbba9a5929c9942b96e5f152 to your computer and use it in GitHub Desktop.
Save Pazzilivo/302ef71ebbba9a5929c9942b96e5f152 to your computer and use it in GitHub Desktop.
const fs = require('fs');
const pdfParse = require('pdf-parse');
// Text lines of the most recently parsed PDF; reassigned by readPdf each time it runs.
let content = []
/**
 * Read a PDF file, extract its text and print the rows that readLine
 * recognises, one per line, to stdout.
 *
 * Side effect: the module-level `content` array is replaced with the
 * extracted text split on newlines.
 *
 * @param {string} uri - Path of the PDF file, passed to fs.readFileSync.
 * @returns {Promise<void>} Resolves after the rows have been logged.
 * @throws {Error} If parsing or row extraction fails; the original error is
 *   attached as `cause` so its stack and type are not lost.
 */
const readPdf = async (uri) => {
  // A read failure throws here and rejects the returned promise unchanged.
  const buffer = fs.readFileSync(uri);
  try {
    const data = await pdfParse(buffer);
    content = data.text.split('\n');
    console.log(readLine(content).join('\n'));
  } catch (err) {
    // `new Error(err)` would stringify the failure and drop its stack;
    // wrap it with context and keep the original reachable via `cause`.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Failed to process PDF "${uri}": ${reason}`, { cause: err });
  }
};
/**
 * Extract transaction rows from the text lines of a parsed statement and
 * format each as a CSV record `date,amount,balance,payee`.
 *
 * A row starts on a line containing `YYYY-MM-DD` immediately followed by
 * `CNY`. Non-empty lines that follow are appended to that row (wrapped text)
 * until an empty line ends it. Lines outside a row are ignored.
 *
 * Rows that do not contain both an amount and a balance with two decimal
 * places are dropped rather than emitted as placeholders, so the result
 * holds only strings. Fields containing a comma or double quote (for example
 * amounts with thousands separators) are quoted so the columns stay aligned.
 *
 * @param {string[]} content - Text lines in document order.
 * @returns {string[]} One CSV record per recognised transaction.
 */
const readLine = (content) => {
  // No `g` flag: these are reused across lines and must not carry lastIndex.
  const rowStart = /\d{4}-\d{2}-\d{2}CNY/;
  // The decimal point is escaped so a digits-only run cannot pass as an amount.
  const rowShape = /(\d{4}-\d{2}-\d{2})CNY(-?[0-9,]*\.\d{2})(-?[0-9,]*\.\d{2})(\S*)/;
  const toCsvField = (field) =>
    (/[",]/.test(field) ? `"${field.replace(/"/g, '""')}"` : field);

  // Pass 1: stitch wrapped lines back onto the row they belong to.
  const rows = [];
  let inRow = false;
  for (const text of content) {
    if (rowStart.test(text)) {
      rows.push(text);
      inRow = true;
    } else if (inRow) {
      if (text) {
        rows[rows.length - 1] += text;
      } else {
        // An empty line terminates the current row's continuation.
        inRow = false;
      }
    }
  }

  // Pass 2: split each stitched row into fields, skipping malformed rows.
  const records = [];
  for (const row of rows) {
    const parts = rowShape.exec(row);
    if (parts) {
      const [, date, amount, balance, payee] = parts;
      records.push([date, amount, balance, payee].map(toCsvField).join(','));
    }
  }
  return records;
};
// Testing: run the extractor against the sample PDF next to this script.
const DUMMY_PDF = './cmb.pdf';
// readPdf returns a promise; handle rejection explicitly instead of leaving
// it floating, and signal the failure through the process exit code.
readPdf(DUMMY_PDF).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment