// parser for brandy: download BPPE "School Performance Fact Sheet" PDFs,
// extract their tables, and re-emit them as Excel spreadsheets
const fs = require("fs")
const pdf_table_extractor = require("pdf-table-extractor")
const fetch = require("node-fetch")
const excel = require("write-excel-file/node")
///////////////////////////////////////////////////////////////////////////////////////////////
// fetch things from net
let urlbase = "https://www.bppe.ca.gov"
let url = "https://www.bppe.ca.gov/webapps/summary_2018.php"
// helper: parse query-string parameters out of a URI
function getParams(uri) {
  let params = {}, tokens
  const re = /[?&]?([^=]+)=([^&]*)/g
  while ((tokens = re.exec(uri))) {
    params[decodeURIComponent(tokens[1])] = decodeURIComponent(tokens[2])
  }
  return params
}
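// Usage sketch (hypothetical URL shape, inferred from the docName handling below):
//   getParams("webapps/file_download?docName=School Performance Fact Sheet 12345.pdf")
//   → { docName: "School Performance Fact Sheet 12345.pdf" }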
// fetch a doc from the site and cache it locally as a pdf
async function savetodoc(url) {
  let time = Date.now()
  let path = getParams(url)["docName"]
  // the numeric id in the document name becomes the module-level docid
  // that the excel writers stamp onto every row
  let m = path.match(/\d+/)
  docid = m ? m[0] : "unknown"
  path = "files/" + docid + ".pdf"
  if (fs.existsSync(path)) {
    console.log("already found as " + path)
    return [path, 0]
  }
  console.log("saving as " + path)
  let response = await fetch(url)
  let blob = await response.arrayBuffer()
  fs.writeFileSync(path, Buffer.from(blob))
  console.log("time was " + (Date.now() - time))
  return [path, blob]
}
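// Example: a docName containing "12345" is cached as "files/12345.pdf";
// the "files/" directory is assumed to already exist.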
// pull all useful urls from the master manifest
async function geturls() {
  // fetch the doc - actually it was fetched once and saved locally:
  // let response = await fetch(url)
  // let text = await response.text()
  // just load the saved file
  let text = fs.readFileSync("summary_2018.html") + ""
  // find hrefs that look like document downloads
  let urls = text.match(/\bwebapps\/file_download.*?"/gi)
  let novel = {}
  let values = []
  console.log("total is " + urls.length)
  for (let url of urls) {
    let musthave = "School Performance Fact Sheet"
    if (!url.includes(musthave)) continue
    url = url.substring(0, url.length - 1) // drop the trailing quote
    url = urlbase + "/" + url.trim().replace(/\s/g, "%20")
    if (novel[url]) continue // de-duplicate
    novel[url] = url
    values.push(url)
  }
  return values
}
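// Illustrative href this expects to find in summary_2018.html (reconstructed
// from the matching logic above, not verified against the live page):
//   webapps/file_download?docName=School Performance Fact Sheet 12345.pdf"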
///////////////////////////////////////////////////////////////////////////////////////////////
// sometimes rows are messed up
let badrow = 0 // set by fixuprow() when a row had to be reconstituted
// convenience getter: arr.last returns the final element of an array
Object.defineProperty(Array.prototype, 'last', {
  enumerable: false,
  configurable: true,
  get: function() {
    return this[this.length - 1]
  }
})
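// Example: [1, 2, 3].last → 3. (Defined for convenience; the code below mostly
// indexes out[out.length - 1] directly.)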
// header fragments that begin with these words continue the previous cell
let notstartwords = [ "Year", "Employed", "Field", "Concurrent", "Aggregated", "Positions" ]
// given a row whose cells were jammed together, rebuild the cell boundaries:
// a fragment starts a new cell unless it looks like a continuation
function workharder(row) {
  let out = []
  for (let i = 0; i < row.length; i++) {
    let term = row[i].trim().replace(/\s+/g, ' ').trim()
    // "Calendar Year 2017" etc: split the label from the value
    if (term.startsWith("Calendar Year ")) {
      out.push("Calendar Year")
      term = term.substring("Calendar Year ".length).trim()
    }
    // fragments that start lower case are continuations, not new cells
    if (term.charAt(0) != term.charAt(0).toUpperCase()) {
      out[out.length - 1] = out[out.length - 1] + " " + term
      continue
    }
    // fragments starting with a stop word are also continuations
    let stopped = 0
    for (let bad of notstartwords) {
      if (term.startsWith(bad)) {
        out[out.length - 1] = out[out.length - 1] + " " + term
        stopped = 1
        break
      }
    }
    if (stopped) continue
    out.push(term)
  }
  return out
}
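// Worked example with hypothetical fragments:
//   workharder(["Calendar Year 2017 Graduates", "employed in the field"])
//   → ["Calendar Year", "2017 Graduates employed in the field"]
// "2017 Graduates" opens a new cell; the lower-case fragment is a continuation.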
// a row can arrive with everything jammed into its first field -
// detect that and try to reconstitute the individual cells
function fixuprow(row, numeric = 0) {
  // drop empty fields
  let rows2 = []
  for (let field of row) {
    field = field.trim()
    if (!field.length) continue
    rows2.push(field)
  }
  // more than one surviving field means the row is probably fine;
  // exactly one surviving field means it is probably a jammed row
  if (rows2.length > 1) { badrow = 0; return rows2 }
  if (rows2.length == 0) throw new Error("no data")
  badrow = 1
  if (numeric == 0) {
    // header text: split on newlines and rebuild cell boundaries
    rows2 = rows2[0].split(/\r?\n/)
    rows2 = workharder(rows2)
  } else {
    // numeric data: values are simply space-separated
    rows2 = rows2[0].split(' ')
  }
  return rows2
}
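// Example: fixuprow(["", "85 72 84.7%"], 1) → ["85", "72", "84.7%"] - the lone
// non-empty field is split apart, and badrow is set to flag that this happened.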
// a header cell sometimes holds two "... to ..." date ranges fused together
// across six newline-separated fragments - split them back into two cells
function speciallycheckfordupeto(row) {
  if (row.length < 2) return
  for (let i = 0; i < row.length; i++) {
    let field = row[i]
    let members = field.match(/to /g) || []
    if (members.length > 1) {
      let parts = field.split("\n")
      let first = parts[0] + " " + parts[1] + " " + parts[2]
      let second = parts[3] + " " + parts[4] + " " + parts[5]
      row[i] = first
      row.splice(i, 0, second) // note: inserts second *before* first
    }
  }
}
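// Illustrative (assumed fragment layout): a six-fragment cell such as
//   "July 1, 2016 to \nJune 30, 2017\n(period 1)\nJuly 1, 2017 to \nJune 30, 2018\n(period 2)"
// is rebuilt into two cells; as written the splice places the later range ahead
// of the earlier one, which is assumed to match the source column order.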
// numeric rows can likewise fuse two space-separated values into one field
function speciallycheckfordupeto2(row) {
  if (row.length < 2) return
  for (let i = 0; i < row.length; i++) {
    let field = row[i].trim()
    let parts = field.split(' ')
    if (parts.length > 1) {
      // assumes exactly two fused values; any third part would be dropped
      row[i] = parts[0]
      row.splice(i, 0, parts[1]) // inserts parts[1] *before* parts[0]
    }
  }
}
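// Example: speciallycheckfordupeto2 applied to ["85 72", "84.7%"] yields
// ["72", "85", "84.7%"] - observe that the splice order swaps the fused pair.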
let expectedsize = 3 // width of the most recent header row; data rows must match it
let header = 0 // most recent header row, kept for reference
function fixuprowheader(row) {
  row = fixuprow(row, 0)
  header = row
  expectedsize = row.length
  speciallycheckfordupeto(row)
  return row
}
function fixuprow2(row) {
  row = fixuprow(row, 1)
  if (expectedsize != row.length) {
    throw Error("Row length error")
  }
  speciallycheckfordupeto2(row)
  return row
}
///////////////////////////////////////////////////////////////////////////////////////////////
// write stuff
let MINCOLS = 16
let docid = 0 // current school/document id, set in savetodoc()
// https://www.npmjs.com/package/write-excel-file
let excel_blob = []
function excel_new_doc() {
  excel_blob = []
}
// write a bold banner row describing the table that follows
function excel_write_related(row) {
  let str = "SchoolID: " + docid + " Info: " + row.join(" ").replace(/[\n\r]+/g, '')
  let data = []
  data.push({ value: str, fontWeight: "bold" })
  while (data.length < MINCOLS) data.push({ value: "" })
  excel_blob.push(data)
}
function excel_write_header(row) {
  row = fixuprowheader(row)
  let data = []
  data.push({ value: "SchoolID", fontWeight: "bold" })
  for (let field of row) {
    field = field.replace(/[\n\r]+/g, '')
    if (!field.length) continue
    data.push({ value: field.trim(), fontWeight: 'bold' })
  }
  while (data.length < MINCOLS) data.push({ value: "" })
  excel_blob.push(data)
}
function excel_write_row(row) {
  row = fixuprow2(row)
  let data = []
  data.push({ value: docid })
  for (let field of row) {
    if (!field.length) continue
    data.push({ value: field.trim() })
  }
  while (data.length < MINCOLS) data.push({ value: "" })
  excel_blob.push(data)
}
async function excel_save_doc(filename) {
  // with no filePath option, the node build of write-excel-file returns a stream
  let stream = await excel(excel_blob)
  const output = fs.createWriteStream(filename)
  stream.pipe(output)
  // wait for the file to finish writing before returning
  return new Promise((resolve, reject) => {
    output.on('finish', resolve)
    output.on('error', reject)
  })
}
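// A simpler alternative (per the write-excel-file docs) would be to let the
// library write the file itself:
//   await excel(excel_blob, { filePath: filename })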
///////////////////////////////////////////////////////////////////////////////////////////////
// this is a list of tables that have unique header rows...
let uniques = {}
// chew through the extracted pdf data and think about it
async function chew(data) {
  let bucket = 0
  // visit every page
  for (let page of data.pageTables) {
    // visit every "row" (pdf-table-extractor calls rows "tables")
    let rows = page.tables
    for (let r = 0; r < rows.length; r++) {
      // only rows with content are interesting
      let row = rows[r]
      if (!row.length) continue
      let isNumber = row[0].match(/^\d/)
      // first numeric row after a header block? back up and write the banner
      // (two rows up) and the header (one row up); this assumes at least two
      // rows always precede the first numeric row
      if (isNumber && bucket == 0) {
        bucket = rows[r - 2].join(" ") + rows[r - 1].join(" ")
        if (!uniques[bucket]) uniques[bucket] = []
        excel_write_related(rows[r - 2])
        excel_write_header(rows[r - 1])
      }
      // numeric rows belong to the current bucket
      if (isNumber) {
        uniques[bucket].push(row)
        excel_write_row(row)
      }
      // a non-numeric row closes the bucket
      else bucket = 0
    }
  }
}
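// For reference, pdf-table-extractor resolves with a structure roughly like
// (illustrative, simplified):
//   { pageTables: [ { page: 1, tables: [ ["Calendar Year", "..."], ["2017", "85", "..."] ] } ] }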
///////////////////////////////////////////////////////////////////////////////////////////
// wrap the callback-style extractor in a promise
// (pdf-table-extractor takes success and error callbacks after the path)
function pdf_promise(path) {
  return new Promise((resolve, reject) => { pdf_table_extractor(path, resolve, reject) })
}
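// Usage: const data = await pdf_promise("files/12345.pdf") - resolves with the
// pageTables structure sketched above.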
///////////////////////////////////////////////////////////////////////////////////////////
// run
async function boot() {
  let urls = await geturls()
  for (let url of urls) {
    // get raw data as pdf
    console.log("loading " + url)
    let [path] = await savetodoc(url)
    // start a doc
    excel_new_doc()
    // chew on it
    let data = await pdf_promise(path)
    await chew(data)
    console.log("**** done")
    // save it ("out" + "files/<id>.pdf" lands in an "outfiles/" directory,
    // which must already exist)
    await excel_save_doc("out" + path + ".xlsx")
    // only process the first url for now
    break
  }
}
boot()