Skip to content

Instantly share code, notes, and snippets.

@thomaswilburn
Last active October 4, 2022 18:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save thomaswilburn/a54b691498184ca90c17367a3abba709 to your computer and use it in GitHub Desktop.
Save thomaswilburn/a54b691498184ca90c17367a3abba709 to your computer and use it in GitHub Desktop.
SBER scraper
var fetch = require("node-fetch");
var cheerio = require("cheerio");
var fs = require("fs");
// URL of the FY2018 SBER district overview page. It is fetched once to read
// its hidden form inputs and then POSTed to once per district.
const SBER = "https://www.nycenet.edu/offices/d_chanc_oper/budget/dbor/sber/FY2018/FY2018_District_Overview.aspx";
// ASP pages have a number of hidden inputs that are required for page submission
// so we'll grab those from the index
/**
 * Download a page and collect its hidden form inputs.
 *
 * @param {string} target - URL of the page to fetch.
 * @returns {Promise<Object<string, string>>} Each hidden input's `name`
 *   mapped to its `value` ("" when the input has no value). Inputs without
 *   a `name` are skipped.
 * @throws {Error} If the server responds with a non-2xx status.
 */
async function getASPValues(target) {
  const response = await fetch(target);
  // An error page still parses as HTML but would yield wrong or no fields.
  if (!response.ok) {
    throw new Error(`Request for ${target} failed: ${response.status} ${response.statusText}`);
  }
  const html = await response.text();
  const $ = cheerio.load(html);
  const data = {};
  for (const input of $(`input[type="hidden"]`)) {
    const { name, value } = input.attribs;
    // A nameless input is never part of a form submission; keeping it would
    // post a bogus field literally called "undefined".
    if (!name) {
      continue;
    }
    data[name] = value || "";
  }
  return data;
}
// Fixed form fields sent with every district request; getSBER merges them
// over the hidden inputs scraped from the index page.
// NOTE(review): ASP.NET WebForms postbacks normally use a double-underscore
// `__EVENTTARGET` field - confirm the single-underscore key below is intended.
const sberFormValues = {
_EVENTTARGET: "ctl00$ContentPlaceHolder1$Input_District",
ctl00$ContentPlaceHolder1$reportnumber: 1,
ctl00$Fiscal_Year: "SELECT_A_YEAR"
};
// Request headers attached to every POST made by getSBER.
// NOTE(review): the Cookie value is hard-coded - presumably captured from a
// browser session; confirm the server still requires/accepts it.
const headers = {
"User-Agent": "Mozilla/5.0",
"Content-Type": "application/x-www-form-urlencoded",
"Cookie": "dtCookie=v_4_srv_1_sn_292A33D36F8C33E20553E8219F651152_perc_100000_ol_0_mul_1_app-3Aa02a80c0651cea0c_0"
};
/**
 * Serialize a flat object as an application/x-www-form-urlencoded body.
 *
 * Both keys and values are percent-encoded, so reserved characters such as
 * `&`, `=` or `$` in a field name cannot corrupt the pair boundaries.
 *
 * @param {Object<string, *>} data - Field names mapped to values; values are
 *   converted to strings by `encodeURIComponent`.
 * @returns {string} `key=value` pairs joined with `&` ("" for an empty object).
 */
function formEncode(data) {
  return Object.entries(data)
    .map(([key, value]) => `${encodeURIComponent(key)}=${encodeURIComponent(value)}`)
    .join("&");
}
var getSBER = async function(asp, district) {
var data = {
...asp,
...sberFormValues,
ctl00$ContentPlaceHolder1$Input_District: String(district).padStart(2, "0")
};
// generate a form-encoded POST body
var body = formEncode(data);
var response = await fetch(SBER, {
method: "POST",
headers,
body
});
var html = await response.text();
var $ = cheerio.load(html);
var rows = $(".CSD_HS_Detail");
var scraped = [];
for (var row of rows) {
var { children } = row;
var cells = children.map(c => $(c).text());
scraped.push(cells);
}
return scraped;
}
/**
 * Scrape districts 1 through 32 one after another and write every scraped
 * row to `sber.csv`.
 *
 * Each cell is trimmed and wrapped in double quotes; quotes inside a cell
 * are doubled so the file stays valid CSV.
 *
 * @returns {Promise<void>} Resolves once the file has been written.
 */
var scrapeSBER = async function() {
  const aspData = await getASPValues(SBER);
  const output = [];
  for (let i = 1; i <= 32; i++) {
    console.log(`Scraping school data for District #${i}...`);
    const rows = await getSBER(aspData, i);
    output.push(...rows);
  }
  // Inside a quoted CSV field a literal quote is written as two quotes.
  const quote = cell => `"${cell.trim().replace(/"/g, '""')}"`;
  const csv = output.map(line => line.map(quote).join(",")).join("\n");
  fs.writeFileSync("sber.csv", csv);
};
// Run the scrape. Report any failure and exit non-zero instead of leaving
// the returned promise's rejection unhandled.
scrapeSBER().catch(err => {
  console.error(err);
  process.exitCode = 1;
});
from bs4 import BeautifulSoup
import requests
import csv
# URL of the FY2018 SBER district overview page. It is fetched once to read
# its hidden form fields and then POSTed to once per district.
SBER_ENDPOINT = "https://www.nycenet.edu/offices/d_chanc_oper/budget/dbor/sber/FY2018/FY2018_District_Overview.aspx"
def getASPValues(soup):
    """Collect the hidden form inputs of a parsed page.

    Args:
        soup: Parsed document exposing ``select(css_selector)``; every
            selected element must expose ``get(attribute, default)``.

    Returns:
        dict mapping each hidden input's ``name`` to its ``value``. Inputs
        without a ``value`` attribute map to ``""``; inputs without a
        ``name`` are skipped.
    """
    asp = {}
    for element in soup.select("input[type=hidden]"):
        name = element.get("name")
        if not name:
            # A nameless input is not part of a form submission.
            continue
        # Default to "" rather than None: requests leaves None-valued fields
        # out of the encoded form body, which would drop the field entirely.
        asp[name] = element.get("value", "")
    return asp
def extractData(soup):
    """Extract the cell text of every ``.CSD_HS_Detail`` element.

    Cell text is stripped of surrounding whitespace, matching the trimming
    the Node.js scraper in this file applies before writing its CSV.

    Args:
        soup: Parsed document exposing ``select(css_selector)``; every
            selected element must expose ``children``, whose items expose
            ``text``.

    Returns:
        list of rows, each a list with the stripped text of one child node.
        Whitespace-only children are kept as ``""`` so column positions are
        unchanged.
    """
    return [
        [cell.text.strip() for cell in row.children]
        for row in soup.select(".CSD_HS_Detail")
    ]
# Fetch the overview page once so its hidden form fields can be replayed
# with every district request.
index = requests.get(SBER_ENDPOINT)
# Fail loudly on an HTTP error instead of scraping fields from an error page.
index.raise_for_status()
dom = BeautifulSoup(index.text, "html.parser")
session = getASPValues(dom)

# list of districts - just a few for demo purposes
districts = ["01", "02"]

# Fixed form fields; merged over the scraped hidden fields for each POST.
sberForm = {
    "_EVENTTARGET": "ctl00$ContentPlaceHolder1$Input_District",
    "ctl00$ContentPlaceHolder1$reportnumber": 1,
    "ctl00$Fiscal_Year": "SELECT_A_YEAR",
}
# newline="" is what the csv module requires of the file it writes to;
# without it, rows can gain extra blank lines where "\n" is translated.
with open("python_scrape.csv", "w", newline="") as output:
    writer = csv.writer(output)
    for district in districts:
        # Later keys win, so the fixed fields override scraped duplicates.
        form = {**session, **sberForm}
        form["ctl00$ContentPlaceHolder1$Input_District"] = district
        response = requests.post(SBER_ENDPOINT, data=form)
        # Stop on an HTTP error rather than silently writing no rows.
        response.raise_for_status()
        parsed = BeautifulSoup(response.text, "html.parser")
        writer.writerows(extractData(parsed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment