Skip to content

Instantly share code, notes, and snippets.

@erochest
Last active December 28, 2015 17:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save erochest/7539962 to your computer and use it in GitHub Desktop.
Save erochest/7539962 to your computer and use it in GitHub Desktop.
A script for scraping enrollment race data from the MD Report Card site.
#!/usr/bin/env python
# Dependencies:
# - pip install lxml
# - pip install cssselect
# - pip install requests
import collections
import csv
import re
import sys
from urlparse import parse_qs, urljoin, urlparse
import requests
import lxml.etree
import lxml.html
ROOT = 'http://www.mdreportcard.org/rcounty.aspx?WDATA=School'
### Data types
RACE_FIELDS = [
'am_ind', 'asian', 'afam', 'hispanic', 'hi', 'white', 'multi',
]
SCHOOL_FIELDS = (
['county', 'county_code', 'name', 'code', 'year', 'total'] + RACE_FIELDS
)
YEARS = {'2011', '2012', '2013'}
InfoLink = collections.namedtuple('InfoLink', ['info', 'href'])
SchoolInfo = collections.namedtuple('SchoolInfo', SCHOOL_FIELDS)
def get_page(url):
"""Download the page using GET and return the document."""
r = requests.get(url)
return lxml.html.document_fromstring(r.text)
def get_text_of(parent, css, f, sep=''):
"""\
This gets the elements for the CSS selector, calls f to get the text, and
joins it all together.
"""
return sep.join(
f(node) for node in parent.cssselect(css)
).strip()
def get_schools(url=ROOT):
"""This walks the root page and returns NamedLinks for each school."""
root = get_page(url)
for td in root.cssselect('.countySchoolTable td'):
county_name = get_text_of(td, '.countyBox', lambda n: n.text)
link = get_text_of(td, '.demographicsBox a', lambda n: n.get('href'))
county_url = urljoin(url, link)
county_page = get_page(county_url)
for a in county_page.cssselect('div.schoolBox a'):
info = SchoolInfo(
county_name,
None,
a.text.strip(),
re.search(r'\d+', a.tail).group(),
None,
None,
None,
None,
None,
None,
None,
None,
None,
)
school_url = urljoin(county_url, a.get('href'))
yield InfoLink(info, school_url)
def load_info(info_link, years):
"""\
This takes an unpopulated InfoLink, gets the page and updates the data.
"""
index = get_page(info_link.href)
for a in index.cssselect('a'):
if a.text and a.text.strip() == 'Enrollment':
info = info_link.info
url = urljoin(info_link.href, a.get('href'))
parts = urlparse(url)
query = parse_qs(parts.query)
pv = query.get('PV')
if pv is None:
sys.stderr.write('Invalid URL: {}\n'.format(url))
continue
county_code = pv[0].split(':')[2]
year_infos = get_enrollment(
url, info._replace(county_code=county_code), years,
)
for year_info in year_infos:
yield year_info
def get_enrollment(url, info, years):
"""\
This loads all of the enrollment data. The URL is the main enrollment URL
linked to from the School's site.
"""
page = get_page(url)
totals = {}
for table in page.cssselect('table.datatable'):
totals.update(read_enrollment(table))
race_totals = {}
for ul in page.cssselect('#raceList'):
for a in ul.cssselect('li a'):
if a.text and a.text.strip().startswith('All Races'):
race_url = urljoin(url, a.get('href'))
race_totals = load_race_totals(get_page(race_url))
break
for year in years:
if year in totals:
total = totals[year]
rtotals = race_totals[year]
yield info._replace(
year = year,
total = total,
am_ind = rtotals.get('am_ind'),
asian = rtotals.get('asian'),
afam = rtotals.get('afam'),
hispanic = rtotals.get('hispanic'),
hi = rtotals.get('hi'),
white = rtotals.get('white'),
multi = rtotals.get('multi'),
)
def read_enrollment(table):
"""\
This pulls the data for all years from the enrollment table and returns it
as a dict.
"""
totals = {}
for tbody in table.cssselect('tbody'):
for node in tbody.cssselect('tr.year th.label'):
year = node.text
for node in tbody.cssselect('tr td'):
if node.text and node.text.strip().isdigit():
totals[year] = int(node.text)
break
return totals
def load_race_totals(page):
totals = {}
for table in page.cssselect('table.datatable'):
year = None
for tr in table.cssselect('tr'):
if tr.get('class') == 'year':
year = tr.cssselect('th')[0].text
continue
if year is None:
continue
tds = [
td
for td in tr.cssselect('td')
if td.get('class') != 'label'
]
totals[year] = {
k: v.text
for (k, v) in zip(RACE_FIELDS, tds)
}
return totals
def main():
writer = csv.writer(sys.stdout, dialect='excel-tab')
writer.writerow(SCHOOL_FIELDS)
for info in get_schools():
writer.writerows( year_data for year_data in load_info(info, YEARS) )
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment