Skip to content

Instantly share code, notes, and snippets.

@slowjack2k
Last active May 12, 2018 11:43
Show Gist options
  • Save slowjack2k/55aa1b6a739ad838c319266bcedfd845 to your computer and use it in GitHub Desktop.
Save slowjack2k/55aa1b6a739ad838c319266bcedfd845 to your computer and use it in GitHub Desktop.
Quick-and-dirty example of parsing a bank statement PDF (from a German bank) and converting it to CSV
/**
 * Pick a category label for one statement row.
 *
 * Only `addresseeClient` is inspected; `eventReference` and `postingText`
 * are accepted but currently unused.
 *
 * @param {string} addresseeClient - text matched against the cash keywords
 * @param {string} eventReference - unused
 * @param {string} postingText - unused
 * @returns {string} "cash" when a cash keyword matches, otherwise "none"
 */
const getCategory = (addresseeClient, eventReference, postingText) => {
  // Case-insensitive; note that "GAA" only matches when followed by a space.
  const cashPattern = /Auszahlung|Geldautomat|GAA /i;
  const isCash = cashPattern.test(addresseeClient);
  return isCash ? "cash" : "none";
};
/**
 * Convert a German-formatted amount ("+1.234.567,89") into a plain decimal
 * string ("1234567.89").
 *
 * @param {string} amountText - raw amount text from the statement
 * @returns {string} amount with "+" and thousands separators removed and a
 *   "." as decimal separator
 */
function toDecimalString(amountText) {
  // A string pattern in replace() only removes the FIRST match, so the
  // thousands separators need a global regex to be stripped completely.
  return amountText.replace("+", "").replace(/\./g, "").replace(",", ".");
}

/**
 * Join text fragments with spaces and wrap the result in double quotes,
 * doubling any embedded double quote so the CSV field stays well-formed.
 *
 * @param {string[]} parts - text fragments collected for one column
 * @returns {string} quoted CSV field
 */
function quoteCsvField(parts) {
  return '"' + parts.join(" ").replace(/"/g, '""') + '"';
}

// Parse every PDF in `pdfs` and print one semicolon-separated CSV line per
// statement row to stdout. Text fragments are sorted into columns by their
// "left" value (presumably the horizontal position on the page — confirm
// against the pdf-parser output format).
pdfs.forEach(function (pdfFilename) {
  pdfParser.pdf2json(pdfFilename, function (error, pdf) {
    if (error != null) {
      // Report on stderr so error output does not end up inside the CSV
      // that is written to stdout.
      console.error(error);
      return;
    }

    pdf["pages"].forEach(function (page) {
      // Fragments of the row currently being assembled; presumably an object
      // with empty col1..col4 arrays — verify against initDataStruct().
      let data = initDataStruct();

      page["texts"].forEach(function (text) {
        const posX = text["left"];
        const textBlock = text["text"];

        let type;
        if (posX >= 50 && posX < 80 && /\d\d\.\d\d\.\d{4}/.test(textBlock)) {
          // Only date-like text (dd.mm.yyyy) is accepted in the first column.
          type = "col1";
        } else if (posX >= 80 && posX < 150) {
          type = "col2";
        } else if (posX >= 150 && posX < 300) {
          type = "col3";
        } else if (posX >= 300 && posX < 510) {
          type = "col4";
        }

        if (type) {
          data[type].push(textBlock);
        }

        // Text in the last column (the amount) terminates the current row.
        // ">=" closes the gap at exactly 510, which the column-4 range
        // (< 510) excludes and which would otherwise be silently dropped.
        if (posX >= 510) {
          // Rows without a date in column 1 are discarded.
          if (data["col1"][0]) {
            const postingDate = data["col1"][0];
            const valueDate = data["col1"][1] || "";
            const eventReference = quoteCsvField(data["col2"]);
            const addresseeClient = quoteCsvField(data["col3"]);
            const postingText = quoteCsvField(data["col4"]);
            const balance = toDecimalString(textBlock);
            const csvCols = [
              postingDate,
              valueDate,
              getCategory(addresseeClient, eventReference, postingText),
              eventReference,
              addresseeClient,
              postingText,
              balance
            ];
            console.log(csvCols.join(";"));
          }
          // Start collecting the next row.
          data = initDataStruct();
        }
      });
    });
  });
});
{
"name": "xxx",
"version": "1.0.0",
"main": "bank_statement_pdf_parser.js",
"license": "MIT",
"scripts": {
},
"dependencies": {
"glob": "^7.1.2",
"pdf-parser": "^1.0.4"
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment