mashtullah/index.js

## index.js
const fs = require('fs');
const pdf = require('pdf-parse');
let path=null; let pages=0;


const myArgs = process.argv;
if(myArgs.length<3)
{
	console.log('No file supplied, exiting application now..');
	return false;
}
else
{
	path=myArgs[2];
	var ext=path.split('.').pop();

	if(!fs.existsSync(path)||ext!='pdf'){
		console.log('File path supplied['+path+'] does not exist or is not a pdf, exiting application now..');
		return false;
	}
	if(myArgs.length==4)
	{
			pages=parseInt( myArgs[3]);
	}
}
 console.log('File path supplied['+path+'] is OK! scanning for data, please wait...');

/**
 * Page renderer passed to the PDF parser through `options.pagerender`.
 *
 * Requests the page's text content and concatenates the `str` of every text
 * item. A newline is inserted whenever `transform[5]` (tracked here as the
 * item's Y position) differs from that of the previous item; items sharing
 * the same value are joined without a separator.
 *
 * @param {Object} pageData - page object exposing `getTextContent(options)`,
 *   which must return a promise resolving to `{ items: [{ str, transform }] }`.
 * @returns {Promise<string>} the page text, one output line per distinct Y run.
 */
function render_page(pageData) {

    const render_options = {
        //replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`.
        normalizeWhitespace: false,
        //do not attempt to combine same line TextItem's. The default value is `false`.
        disableCombineTextItems: false
    };

    return pageData.getTextContent(render_options)
    .then(function(textContent) {
        let lastY;
        let text = '';
        for (const item of textContent.items) {
            const currentY = item.transform[5];
            // `undefined` marks "no previous item". A truthiness test would also
            // match a real Y position of 0 and drop the line break after it.
            if (lastY === undefined || lastY === currentY) {
                text += item.str;
            } else {
                text += '\n' + item.str;
            }
            lastY = currentY;
        }
        return text;
    });
}
// Options handed to the PDF parser.
const options = {
	max: pages,// max pages to parse(0 means all)
	pagerender: render_page
};
const dataBuffer = fs.readFileSync(path);//filepath

/**
 * Returns the second space-separated word of `text`, or `undefined` when
 * `text` is not a string (e.g. the expected ":" field was missing).
 */
function secondWord(text) {
	return typeof text === 'string' ? text.split(" ")[1] : undefined;
}

/**
 * Builds a proposal record from a line that starts with "PROPOSAL #".
 *
 * The title is everything before the first "ISSUER"; the remaining fields are
 * taken from the space-separated tokens counted from the end of the line
 * (5th, 4th, 3rd and 2nd from last). NOTE(review): this presumably relies on
 * the line ending with a trailing token after the last column; verify against
 * a sample document.
 *
 * @param {string} line - a "PROPOSAL #..." line.
 * @returns {{title: string, proposedBy: string, voted: string, voteCast: string, forAgainstMgmt: string}}
 */
function parseProposalLine(line) {
	// Split once instead of once per field.
	const tokens = line.split(" ");
	const fromEnd = function(offset) { return tokens[tokens.length - offset]; };
	return {
		title: line.split("ISSUER")[0],
		proposedBy: fromEnd(5),
		voted: fromEnd(4),
		voteCast: fromEnd(3),
		forAgainstMgmt: fromEnd(2)
	};
}

/**
 * Parses one separator-delimited section of the extracted text.
 *
 * Header fields are read from fixed line positions: line 1 (issuer, after the
 * first ":"), line 2 (ticker and cusip) and line 3 (meeting date). Proposal
 * lines are read from line 5 onwards; any other non-trivial line is appended
 * to the title of the proposal before it.
 *
 * @param {string} section - text between two separator lines.
 * @returns {Object|null} `{issuer, ticker, meetingDate, cusip, proposals}`, or
 *   `null` when the section is too short to contain the header lines.
 */
function parseIssuerSection(section) {
	const lines = section.split("\n");
	if (lines.length < 4) {
		return null;
	}

	const tickerLineFields = lines[2].split(":");
	const record = {
		issuer: lines[1].split(":")[1],
		ticker: secondWord(tickerLineFields[1]),
		meetingDate: secondWord(lines[3].split(":")[1]),
		cusip: tickerLineFields[2],
		proposals: []
	};

	for (let p = 5; p < lines.length; p++) {
		const line = lines[p];
		if (line.substring(0, 10) === "PROPOSAL #") {
			record.proposals.push(parseProposalLine(line));
			continue;
		}
		// Short fragments holding at most one non-whitespace character are noise.
		if (line.length < 10 && line.replace(/\s/g, '').length <= 1) {
			continue;
		}
		// Continuation of the previous proposal's title. A continuation that
		// appears before any proposal has nothing to attach to and is dropped
		// instead of crashing the whole scan.
		const previous = record.proposals[record.proposals.length - 1];
		if (previous) {
			previous.title += line;
		}
	}
	return record;
}

/**
 * Derives the output file name by replacing the final extension of `pdfPath`
 * with ".json". Only a dot inside the last path segment counts as an
 * extension, so "./reports/q1.2020.pdf" becomes "./reports/q1.2020.json".
 */
function jsonPathFor(pdfPath) {
	const lastDot = pdfPath.lastIndexOf('.');
	const lastSeparator = Math.max(pdfPath.lastIndexOf('/'), pdfPath.lastIndexOf('\\'));
	const stem = lastDot > lastSeparator ? pdfPath.slice(0, lastDot) : pdfPath;
	return stem + '.json';
}

// Parse the PDF, convert every issuer section to a record and write the result
// next to the input file as JSON.
pdf(dataBuffer,options).then(function(data) {
	console.log('Data scanning complete!,creating the json object, please wait...');
	console.log(data.numrender+' pages read from a possible['+data.numpages+'] , please wait...');
	const sections = data.text.split("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");

	const pdfData = [];

	// sections[0] is the text before the first separator and is not parsed.
	for (let i = 1; i < sections.length; i++) {
		const record = parseIssuerSection(sections[i]);
		if (record === null) {
			console.log('Skipping section '+i+': it is too short to contain issuer details.');
			continue;
		}
		pdfData.push(record);
	}
	//Here your json variable pdfData contains all the data you wanted
	const fName = jsonPathFor(path);
	fs.writeFileSync(fName, JSON.stringify(pdfData));
	console.log("Your PDF has been scanned and all the data written to a the file:["+fName+"]");
	console.log("Bye for now!");
}).catch(function(err) {
	// Report parse/write failures instead of leaving the rejection unhandled.
	console.log('Scanning failed: '+(err && err.message ? err.message : err));
	process.exitCode = 1;
});
	const fs = require('fs');
	const pdf = require('pdf-parse');
	let path=null; let pages=0;


	const myArgs = process.argv;
	if(myArgs.length<3)
	{
	console.log('No file supplied, exiting application now..');
	return false;
	}
	else
	{
	path=myArgs[2];
	var ext=path.split('.').pop();

	if(!fs.existsSync(path)\|\|ext!='pdf'){
	console.log('File path supplied['+path+'] does not exist or is not a pdf, exiting application now..');
	return false;
	}
	if(myArgs.length==4)
	{
	pages=parseInt( myArgs[3]);
	}
	}
	console.log('File path supplied['+path+'] is OK! scanning for data, please wait...');

	/**
	 * Page renderer passed to the PDF parser through `options.pagerender`.
	 *
	 * Requests the page's text content and concatenates the `str` of every text
	 * item. A newline is inserted whenever `transform[5]` (tracked here as the
	 * item's Y position) differs from that of the previous item; items sharing
	 * the same value are joined without a separator.
	 *
	 * @param {Object} pageData - page object exposing `getTextContent(options)`,
	 *   which must return a promise resolving to `{ items: [{ str, transform }] }`.
	 * @returns {Promise<string>} the page text, one output line per distinct Y run.
	 */
	function render_page(pageData) {

		const render_options = {
			//replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`.
			normalizeWhitespace: false,
			//do not attempt to combine same line TextItem's. The default value is `false`.
			disableCombineTextItems: false
		};

		return pageData.getTextContent(render_options)
		.then(function(textContent) {
			let lastY;
			let text = '';
			for (const item of textContent.items) {
				const currentY = item.transform[5];
				// `undefined` marks "no previous item". A truthiness test would also
				// match a real Y position of 0 and drop the line break after it.
				// (The `\|\|` markdown-escape residue is restored to `||`.)
				if (lastY === undefined || lastY === currentY) {
					text += item.str;
				} else {
					text += '\n' + item.str;
				}
				lastY = currentY;
			}
			return text;
		});
	}
	// Options handed to the PDF parser.
	const options = {
		max: pages,// max pages to parse(0 means all)
		pagerender: render_page
	};
	const dataBuffer = fs.readFileSync(path);//filepath

	/**
	 * Returns the second space-separated word of `text`, or `undefined` when
	 * `text` is not a string (e.g. the expected ":" field was missing).
	 */
	function secondWord(text) {
		return typeof text === 'string' ? text.split(" ")[1] : undefined;
	}

	/**
	 * Builds a proposal record from a line that starts with "PROPOSAL #".
	 *
	 * The title is everything before the first "ISSUER"; the remaining fields are
	 * taken from the space-separated tokens counted from the end of the line
	 * (5th, 4th, 3rd and 2nd from last). NOTE(review): this presumably relies on
	 * the line ending with a trailing token after the last column; verify against
	 * a sample document.
	 *
	 * @param {string} line - a "PROPOSAL #..." line.
	 * @returns {{title: string, proposedBy: string, voted: string, voteCast: string, forAgainstMgmt: string}}
	 */
	function parseProposalLine(line) {
		// Split once instead of once per field.
		const tokens = line.split(" ");
		const fromEnd = function(offset) { return tokens[tokens.length - offset]; };
		return {
			title: line.split("ISSUER")[0],
			proposedBy: fromEnd(5),
			voted: fromEnd(4),
			voteCast: fromEnd(3),
			forAgainstMgmt: fromEnd(2)
		};
	}

	/**
	 * Parses one separator-delimited section of the extracted text.
	 *
	 * Header fields are read from fixed line positions: line 1 (issuer, after the
	 * first ":"), line 2 (ticker and cusip) and line 3 (meeting date). Proposal
	 * lines are read from line 5 onwards; any other non-trivial line is appended
	 * to the title of the proposal before it.
	 *
	 * @param {string} section - text between two separator lines.
	 * @returns {Object|null} `{issuer, ticker, meetingDate, cusip, proposals}`, or
	 *   `null` when the section is too short to contain the header lines.
	 */
	function parseIssuerSection(section) {
		const lines = section.split("\n");
		if (lines.length < 4) {
			return null;
		}

		const tickerLineFields = lines[2].split(":");
		const record = {
			issuer: lines[1].split(":")[1],
			ticker: secondWord(tickerLineFields[1]),
			meetingDate: secondWord(lines[3].split(":")[1]),
			cusip: tickerLineFields[2],
			proposals: []
		};

		for (let p = 5; p < lines.length; p++) {
			const line = lines[p];
			if (line.substring(0, 10) === "PROPOSAL #") {
				record.proposals.push(parseProposalLine(line));
				continue;
			}
			// Short fragments holding at most one non-whitespace character are noise.
			if (line.length < 10 && line.replace(/\s/g, '').length <= 1) {
				continue;
			}
			// Continuation of the previous proposal's title. A continuation that
			// appears before any proposal has nothing to attach to and is dropped
			// instead of crashing the whole scan.
			const previous = record.proposals[record.proposals.length - 1];
			if (previous) {
				previous.title += line;
			}
		}
		return record;
	}

	/**
	 * Derives the output file name by replacing the final extension of `pdfPath`
	 * with ".json". Only a dot inside the last path segment counts as an
	 * extension, so "./reports/q1.2020.pdf" becomes "./reports/q1.2020.json".
	 */
	function jsonPathFor(pdfPath) {
		const lastDot = pdfPath.lastIndexOf('.');
		const lastSeparator = Math.max(pdfPath.lastIndexOf('/'), pdfPath.lastIndexOf('\\'));
		const stem = lastDot > lastSeparator ? pdfPath.slice(0, lastDot) : pdfPath;
		return stem + '.json';
	}

	// Parse the PDF, convert every issuer section to a record and write the result
	// next to the input file as JSON.
	pdf(dataBuffer,options).then(function(data) {
		console.log('Data scanning complete!,creating the json object, please wait...');
		console.log(data.numrender+' pages read from a possible['+data.numpages+'] , please wait...');
		const sections = data.text.split("---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------");

		const pdfData = [];

		// sections[0] is the text before the first separator and is not parsed.
		for (let i = 1; i < sections.length; i++) {
			const record = parseIssuerSection(sections[i]);
			if (record === null) {
				console.log('Skipping section '+i+': it is too short to contain issuer details.');
				continue;
			}
			pdfData.push(record);
		}
		//Here your json variable pdfData contains all the data you wanted
		const fName = jsonPathFor(path);
		fs.writeFileSync(fName, JSON.stringify(pdfData));
		console.log("Your PDF has been scanned and all the data written to a the file:["+fName+"]");
		console.log("Bye for now!");
	}).catch(function(err) {
		// Report parse/write failures instead of leaving the rejection unhandled.
		console.log('Scanning failed: '+(err && err.message ? err.message : err));
		process.exitCode = 1;
	});