Skip to content

Instantly share code, notes, and snippets.

@mashtullah
Last active April 27, 2020 03:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mashtullah/06eb91ecc59f9036ad2cdde339549221 to your computer and use it in GitHub Desktop.
Save mashtullah/06eb91ecc59f9036ad2cdde339549221 to your computer and use it in GitHub Desktop.
A Node.js script that extracts issuer and proposal data from a PDF file and saves it as JSON
const fs = require('fs');
const pdf = require('pdf-parse');
let path=null; let pages=0;
const myArgs = process.argv;
if(myArgs.length<3)
{
console.log('No file supplied, exiting application now..');
return false;
}
else
{
path=myArgs[2];
var ext=path.split('.').pop();
if(!fs.existsSync(path)||ext!='pdf'){
console.log('File path supplied['+path+'] does not exist or is not a pdf, exiting application now..');
return false;
}
if(myArgs.length==4)
{
pages=parseInt( myArgs[3]);
}
}
console.log('File path supplied['+path+'] is OK! scanning for data, please wait...');
/**
 * Page renderer passed to the PDF parser through the `pagerender` option.
 *
 * Joins the text items of one page into a single string: consecutive items that
 * share the same vertical position (`transform[5]`) are concatenated, and a
 * newline is inserted whenever the vertical position changes.
 *
 * @param {object} pageData - Page object exposing `getTextContent(options)`, which
 *   must return a promise resolving to `{ items: [{ str, transform }] }`.
 * @returns {Promise<string>} The text of the page, one output line per distinct run
 *   of items sharing a vertical position.
 */
function render_page(pageData) {
  const render_options = {
    //replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`.
    normalizeWhitespace: false,
    //do not attempt to combine same line TextItem's. The default value is `false`.
    disableCombineTextItems: false
  };
  return pageData.getTextContent(render_options).then(function (textContent) {
    let lastY;
    let text = '';
    for (const item of textContent.items) {
      const y = item.transform[5];
      // Test for "no previous item" explicitly: a truthiness check would treat a
      // baseline of exactly 0 as missing and glue the next line onto it.
      if (lastY === undefined || lastY === y) {
        text += item.str;
      } else {
        text += '\n' + item.str;
      }
      lastY = y;
    }
    return text;
  });
}
// Options handed to the PDF parser.
const options = {
  max: pages, // max pages to parse(0 means all)
  pagerender: render_page
};

// Rule line on which the extracted text is split into issuer sections.
const ISSUER_SEPARATOR = "---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------";

/**
 * Builds one proposal record from a line that starts with "PROPOSAL #".
 * The title is everything before the first "ISSUER"; the remaining fields are
 * taken by position from the end of the space-separated line.
 */
function parseProposalLine(line) {
  const words = line.split(" ");
  const count = words.length;
  return {
    title: line.split("ISSUER")[0],
    proposedBy: words[count - 5],
    voted: words[count - 4],
    voteCast: words[count - 3],
    forAgainstMgmt: words[count - 2]
  };
}

/**
 * Converts the text of one issuer section into a record.
 * Header fields are read from fixed line positions (issuer from line 1, ticker and
 * cusip from line 2, meeting date from line 3); proposal lines are read from line 5 on.
 * Throws a TypeError when the section lacks those header lines or their colons.
 */
function parseIssuerSection(section) {
  const lines = section.split("\n");
  const record = {
    issuer: lines[1].split(":")[1],
    ticker: lines[2].split(":")[1].split(" ")[1],
    meetingDate: lines[3].split(":")[1].split(" ")[1],
    cusip: lines[2].split(":")[2],
    proposals: []
  };
  for (let p = 5; p < lines.length; p++) {
    const line = lines[p];
    if (line.startsWith("PROPOSAL #")) {
      record.proposals.push(parseProposalLine(line));
      continue;
    }
    // Any other line continues the title of the previous proposal. Short lines
    // (fewer than 10 characters) only count when they hold more than one
    // non-whitespace character.
    const hasContent = line.length >= 10 || line.replace(/\s/g, '').length > 1;
    const current = record.proposals[record.proposals.length - 1];
    // Ignore continuation text that appears before any proposal instead of crashing.
    if (hasContent && current) {
      current.title += line;
    }
  }
  return record;
}

const dataBuffer = fs.readFileSync(path);//filepath
pdf(dataBuffer, options).then(function (data) {
  console.log('Data scanning complete!,creating the json object, please wait...');
  console.log(data.numrender+' pages read from a possible['+data.numpages+'] , please wait...');
  const sections = data.text.split(ISSUER_SEPARATOR);
  const pdfData = [];
  // Start at 1: the text before the first separator is not an issuer section.
  for (let i = 1; i < sections.length; i++) {
    // A separator at the very end of the text leaves a blank section; skip it.
    if (sections[i].trim() === '') {
      continue;
    }
    pdfData.push(parseIssuerSection(sections[i]));
  }
  //Here your json variable pdfData contains all the data you wanted
  // Replace only the final extension so that "./report.pdf" becomes "./report.json"
  // (splitting on the first dot would yield ".json" for such a path).
  const fName = path.replace(/\.[^./\\]*$/, '') + '.json';
  fs.writeFileSync(fName, JSON.stringify(pdfData));
  console.log("Your PDF has been scanned and all the data written to a the file:["+fName+"]");
  console.log("Bye for now!");
}).catch(function (err) {
  // Report parse/write failures instead of leaving an unhandled promise rejection.
  console.error('Failed to extract data from [' + path + ']: ' + (err && err.message ? err.message : err));
  process.exitCode = 1;
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment