erochest/get_data.py

## get_data.py
#!/usr/bin/env python


# Dependencies:
# - pip install lxml
# - pip install cssselect
# - pip install requests

import collections
import csv
import re
import sys
from urlparse import parse_qs, urljoin, urlparse

import requests
import lxml.etree
import lxml.html


ROOT = 'http://www.mdreportcard.org/rcounty.aspx?WDATA=School'


### Data types
RACE_FIELDS   = [
        'am_ind', 'asian', 'afam', 'hispanic', 'hi', 'white', 'multi',
        ]
SCHOOL_FIELDS = (
        ['county', 'county_code', 'name', 'code', 'year', 'total'] + RACE_FIELDS
        )

YEARS = {'2011', '2012', '2013'}
InfoLink   = collections.namedtuple('InfoLink', ['info', 'href'])
SchoolInfo = collections.namedtuple('SchoolInfo', SCHOOL_FIELDS)


def get_page(url):
    """Download the page using GET and return the document."""
    r = requests.get(url)
    return lxml.html.document_fromstring(r.text)


def get_text_of(parent, css, f, sep=''):
    """\
    This gets the elements for the CSS selector, calls f to get the text, and
    joins it all together.

    """
    return sep.join(
            f(node) for node in parent.cssselect(css)
            ).strip()


def get_schools(url=ROOT):
    """This walks the root page and returns NamedLinks for each school."""
    root = get_page(url)
    for td in root.cssselect('.countySchoolTable td'):
        county_name = get_text_of(td, '.countyBox', lambda n: n.text)
        link        = get_text_of(td, '.demographicsBox a', lambda n: n.get('href'))
        county_url  = urljoin(url, link)
        county_page = get_page(county_url)
        for a in county_page.cssselect('div.schoolBox a'):
            info = SchoolInfo(
                    county_name,
                    None,
                    a.text.strip(),
                    re.search(r'\d+', a.tail).group(),
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    None,
                    )
            school_url = urljoin(county_url, a.get('href'))
            yield InfoLink(info, school_url)


def load_info(info_link, years):
    """\
    This takes an unpopulated InfoLink, gets the page and updates the data.
    """
    index = get_page(info_link.href)

    for a in index.cssselect('a'):
        if a.text and a.text.strip() == 'Enrollment':
            info  = info_link.info
            url   = urljoin(info_link.href, a.get('href'))
            parts = urlparse(url)
            query = parse_qs(parts.query)
            pv    = query.get('PV')

            if pv is None:
                sys.stderr.write('Invalid URL: {}\n'.format(url))
                continue

            county_code = pv[0].split(':')[2]
            year_infos = get_enrollment(
                    url, info._replace(county_code=county_code), years,
                    )
            for year_info in year_infos:
                yield year_info


def get_enrollment(url, info, years):
    """\
    This loads all of the enrollment data. The URL is the main enrollment URL
    linked to from the School's site.

    """
    page   = get_page(url)

    totals = {}
    for table in page.cssselect('table.datatable'):
        totals.update(read_enrollment(table))

    race_totals = {}
    for ul in page.cssselect('#raceList'):
        for a in ul.cssselect('li a'):
            if a.text and a.text.strip().startswith('All Races'):
                race_url    = urljoin(url, a.get('href'))
                race_totals = load_race_totals(get_page(race_url))
                break

    for year in years:
        if year in totals:
            total   = totals[year]
            rtotals = race_totals[year]
            yield info._replace(
                    year     = year,
                    total    = total,
                    am_ind   = rtotals.get('am_ind'),
                    asian    = rtotals.get('asian'),
                    afam     = rtotals.get('afam'),
                    hispanic = rtotals.get('hispanic'),
                    hi       = rtotals.get('hi'),
                    white    = rtotals.get('white'),
                    multi    = rtotals.get('multi'),
                    )


def read_enrollment(table):
    """\
    This pulls the data for all years from the enrollment table and returns it
    as a dict.

    """

    totals = {}
    for tbody in table.cssselect('tbody'):
        for node in tbody.cssselect('tr.year th.label'):
            year = node.text
        for node in tbody.cssselect('tr td'):
            if node.text and node.text.strip().isdigit():
                totals[year] = int(node.text)
                break
    return totals


def load_race_totals(page):
    totals = {}

    for table in page.cssselect('table.datatable'):
        year = None
        for tr in table.cssselect('tr'):
            if tr.get('class') == 'year':
                year = tr.cssselect('th')[0].text
                continue
            if year is None:
                continue
            tds = [
                    td
                    for td in tr.cssselect('td')
                    if td.get('class') != 'label'
                    ]
            totals[year] = {
                    k: v.text
                    for (k, v) in zip(RACE_FIELDS, tds)
                    }

    return totals


def main():
    writer = csv.writer(sys.stdout, dialect='excel-tab')
    writer.writerow(SCHOOL_FIELDS)
    for info in get_schools():
        writer.writerows( year_data for year_data in load_info(info, YEARS) )


if __name__ == '__main__':
    main()
	#!/usr/bin/env python


	# Dependencies:
	# - pip install lxml
	# - pip install cssselect
	# - pip install requests

	import collections
	import csv
	import re
	import sys
	from urlparse import parse_qs, urljoin, urlparse

	import requests
	import lxml.etree
	import lxml.html


	ROOT = 'http://www.mdreportcard.org/rcounty.aspx?WDATA=School'


	### Data types
	RACE_FIELDS = [
	'am_ind', 'asian', 'afam', 'hispanic', 'hi', 'white', 'multi',
	]
	SCHOOL_FIELDS = (
	['county', 'county_code', 'name', 'code', 'year', 'total'] + RACE_FIELDS
	)

	YEARS = {'2011', '2012', '2013'}
	InfoLink = collections.namedtuple('InfoLink', ['info', 'href'])
	SchoolInfo = collections.namedtuple('SchoolInfo', SCHOOL_FIELDS)


	def get_page(url):
	"""Download the page using GET and return the document."""
	r = requests.get(url)
	return lxml.html.document_fromstring(r.text)


	def get_text_of(parent, css, f, sep=''):
	"""\
	This gets the elements for the CSS selector, calls f to get the text, and
	joins it all together.

	"""
	return sep.join(
	f(node) for node in parent.cssselect(css)
	).strip()


	def get_schools(url=ROOT):
	"""This walks the root page and returns NamedLinks for each school."""
	root = get_page(url)
	for td in root.cssselect('.countySchoolTable td'):
	county_name = get_text_of(td, '.countyBox', lambda n: n.text)
	link = get_text_of(td, '.demographicsBox a', lambda n: n.get('href'))
	county_url = urljoin(url, link)
	county_page = get_page(county_url)
	for a in county_page.cssselect('div.schoolBox a'):
	info = SchoolInfo(
	county_name,
	None,
	a.text.strip(),
	re.search(r'\d+', a.tail).group(),
	None,
	None,
	None,
	None,
	None,
	None,
	None,
	None,
	None,
	)
	school_url = urljoin(county_url, a.get('href'))
	yield InfoLink(info, school_url)


	def load_info(info_link, years):
	"""\
	This takes an unpopulated InfoLink, gets the page and updates the data.
	"""
	index = get_page(info_link.href)

	for a in index.cssselect('a'):
	if a.text and a.text.strip() == 'Enrollment':
	info = info_link.info
	url = urljoin(info_link.href, a.get('href'))
	parts = urlparse(url)
	query = parse_qs(parts.query)
	pv = query.get('PV')

	if pv is None:
	sys.stderr.write('Invalid URL: {}\n'.format(url))
	continue

	county_code = pv[0].split(':')[2]
	year_infos = get_enrollment(
	url, info._replace(county_code=county_code), years,
	)
	for year_info in year_infos:
	yield year_info


	def get_enrollment(url, info, years):
	"""\
	This loads all of the enrollment data. The URL is the main enrollment URL
	linked to from the School's site.

	"""
	page = get_page(url)

	totals = {}
	for table in page.cssselect('table.datatable'):
	totals.update(read_enrollment(table))

	race_totals = {}
	for ul in page.cssselect('#raceList'):
	for a in ul.cssselect('li a'):
	if a.text and a.text.strip().startswith('All Races'):
	race_url = urljoin(url, a.get('href'))
	race_totals = load_race_totals(get_page(race_url))
	break

	for year in years:
	if year in totals:
	total = totals[year]
	rtotals = race_totals[year]
	yield info._replace(
	year = year,
	total = total,
	am_ind = rtotals.get('am_ind'),
	asian = rtotals.get('asian'),
	afam = rtotals.get('afam'),
	hispanic = rtotals.get('hispanic'),
	hi = rtotals.get('hi'),
	white = rtotals.get('white'),
	multi = rtotals.get('multi'),
	)


	def read_enrollment(table):
	"""\
	This pulls the data for all years from the enrollment table and returns it
	as a dict.

	"""

	totals = {}
	for tbody in table.cssselect('tbody'):
	for node in tbody.cssselect('tr.year th.label'):
	year = node.text
	for node in tbody.cssselect('tr td'):
	if node.text and node.text.strip().isdigit():
	totals[year] = int(node.text)
	break
	return totals


	def load_race_totals(page):
	totals = {}

	for table in page.cssselect('table.datatable'):
	year = None
	for tr in table.cssselect('tr'):
	if tr.get('class') == 'year':
	year = tr.cssselect('th')[0].text
	continue
	if year is None:
	continue
	tds = [
	td
	for td in tr.cssselect('td')
	if td.get('class') != 'label'
	]
	totals[year] = {
	k: v.text
	for (k, v) in zip(RACE_FIELDS, tds)
	}

	return totals


	def main():
	writer = csv.writer(sys.stdout, dialect='excel-tab')
	writer.writerow(SCHOOL_FIELDS)
	for info in get_schools():
	writer.writerows( year_data for year_data in load_info(info, YEARS) )


	if __name__ == '__main__':
	main()