Created
December 17, 2015 08:54
-
-
Save jdesboeufs/b14ca07f89636706d0c0 to your computer and use it in GitHub Desktop.
Prototype parsing avis IR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Third-party dependencies: pdf2json provides the PDF parser, lodash and
// underscore.string provide the collection and string helpers used below.
var PDFParser = require('pdf2json');
var _ = require('lodash');
var _s = require('underscore.string');

// Single parser instance; its events are handled further down in this script.
var parser = new PDFParser();

// Timestamp taken before loading starts, used to report the parsing duration.
var startTime = Date.now();
/*
 * Handler for the "pdfParser_dataReady" event.
 *
 * Flattens the text entries of the first page into { x, y, content } records,
 * then locates values by their position relative to known labels:
 *   - data.declarants:          one key per entry starting with 'Déclarant'
 *                               found between the 'Numéro fiscal :' and
 *                               'Référence de l\'avis :' labels
 *   - data.referenceAvis:       the value immediately right of
 *                               'Référence de l\'avis :'
 *   - data.traitementsSalaires: every value right of 'Traitements, salaires'
 * Finally logs the extracted data and the parsing / processing durations.
 *
 * Throws an Error when an expected label, or the value next to it, cannot be
 * found on the page.
 */
parser.on("pdfParser_dataReady", function() {
  var endOfParsing = Date.now();

  // Each text entry carries its runs in `R`; every run's `T` is URI-encoded,
  // so decode the runs and concatenate them into a single string.
  var entries = this.data.Pages[0].Texts.map(function(textEntry) {
    var content = textEntry.R.map(function(singleTextEntry) {
      return decodeURIComponent(singleTextEntry.T);
    }).join('');
    return { x: textEntry.x, y: textEntry.y, content: content };
  });

  // Entries sharing the exact same y coordinate, keyed by that coordinate.
  var yGroupedEntries = _.groupBy(entries, 'y');

  // Lookup from text content to entry (a later duplicate overwrites an earlier one).
  // NOTE(review): `_.indexBy` is the lodash 3.x name; lodash 4 renamed it to
  // `_.keyBy` — confirm against the installed lodash version.
  var contentIndexedEntries = _.indexBy(entries, 'content');

  console.log(entries);

  /**
   * Resolves a label to its entry; non-string arguments are returned as is.
   * @param {string|Object} textEntry - label text or an entry object
   * @returns {Object} the matching entry
   * @throws {Error} when no entry on the page has that exact content
   */
  function ensureTextEntry(textEntry) {
    if (!_.isString(textEntry)) {
      return textEntry;
    }
    if (!Object.prototype.hasOwnProperty.call(contentIndexedEntries, textEntry)) {
      throw new Error('Text entry not found in PDF: "' + textEntry + '"');
    }
    return contentIndexedEntries[textEntry];
  }

  /** True when `that` has a strictly smaller y than `reference` (entry or label). */
  function topOf(that, reference) {
    reference = ensureTextEntry(reference);
    return that.y < reference.y;
  }

  /** True when `that` has a strictly greater y than `reference` (entry or label). */
  function bottomOf(that, reference) {
    reference = ensureTextEntry(reference);
    return that.y > reference.y;
  }

  /** True when `that` has a strictly greater x than `reference` (entry or label). */
  function rightOf(that, reference) {
    reference = ensureTextEntry(reference);
    return that.x > reference.x;
  }

  /** True when `that` has a strictly smaller x than `reference` (entry or label). */
  function leftOf(that, reference) {
    reference = ensureTextEntry(reference);
    return that.x < reference.x;
  }

  /**
   * Lists the entries on the same line as `that`, located to its right.
   * @param {string|Object} that - reference entry or its label text
   * @param {number} [tolerance=0] - maximum y distance still counted as the
   *   same line; 0 (or omitted) requires the exact same y
   * @returns {Object[]} matching entries sorted by ascending x
   */
  function sameLineRight(that, tolerance) {
    var reference = ensureTextEntry(that);
    var xLine;
    tolerance = tolerance || 0;
    if (tolerance > 0) {
      xLine = _.flatten(_.map(yGroupedEntries, function(yGroup, y) {
        // Group keys are strings; convert explicitly before comparing.
        return Math.abs(Number(y) - reference.y) <= tolerance ? yGroup : [];
      }));
    } else {
      xLine = yGroupedEntries[reference.y];
    }
    var xSortedLine = _.sortBy(xLine, 'x');
    return _.filter(xSortedLine, function(item) {
      return rightOf(item, reference);
    });
  }

  /**
   * Returns the closest entry to the right of `that` on the same line,
   * or undefined when there is none.
   */
  function sameLineImmediateRight(that, tolerance) {
    return sameLineRight(that, tolerance)[0];
  }

  /** Removes every space character from `text`. */
  function stripSpaces(text) {
    return text.split(' ').join('');
  }

  /**
   * Returns the space-stripped content of the entry immediately right of `that`.
   * @throws {Error} when nothing sits to the right of `that` on its line
   */
  function valueRightOf(that, tolerance) {
    var neighbour = sameLineImmediateRight(that, tolerance);
    if (!neighbour) {
      throw new Error('No value found to the right of "' + ensureTextEntry(that).content + '"');
    }
    return stripSpaces(neighbour.content);
  }

  var data = { declarants: {} };

  _.filter(entries, function(entry) {
    return bottomOf(entry, 'Numéro fiscal :') &&
      topOf(entry, 'Référence de l\'avis :') &&
      _s.startsWith(entry.content, 'Déclarant');
  }).forEach(function(declarant) {
    data.declarants[declarant.content] = valueRightOf(declarant);
  });

  data.referenceAvis = valueRightOf('Référence de l\'avis :', 0.1);

  // Use map (not forEach) so the values on the line are actually collected.
  data.traitementsSalaires = sameLineRight('Traitements, salaires', 0.1).map(function(valeur) {
    return stripSpaces(valeur.content);
  });

  console.log(data);

  var endOfProcessing = Date.now();
  console.log('Parsing: ', endOfParsing - startTime, 'ms');
  console.log('Processing: ', endOfProcessing - endOfParsing, 'ms');
});
/*
 * Handler for the "pdfParser_dataError" event: reports the failure on stderr.
 * Prints the `parserError` carried by the event argument when there is one,
 * and falls back to dumping the listener's `this` otherwise.
 */
parser.on("pdfParser_dataError", function(errData) {
  console.error('An error occurred!');
  console.error(errData && errData.parserError ? errData.parserError : this);
});
// Start parsing the sample PDF located next to this script; the outcome is
// delivered through the "pdfParser_dataReady" / "pdfParser_dataError" handlers.
parser.loadPDF(__dirname + '/IR-Justif-91-2013-14922554377149.pdf');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment