bshillingford/scrape_all.py

## scrape_all.py
#!/usr/bin/python2.7
import lxml, lxml.etree, urllib2, re

# Academic session to scrape. read_endpoint() sends these values as the
# `sessyr` and `sesscd` query parameters of every request.
SESSYR = 2012
SESSCD = "W"

# URL most recently requested by read_endpoint(); kept purely as a debugging aid.
debug_last_url = None

def read_endpoint(dept=None, course=None, section=None):
    """Fetch one level of the course-schedule listing and return it as parsed XML.

    Which listing is requested depends on the arguments supplied:

    * no ``dept``               -- top-level listing (``output=2&req=0``)
    * ``dept`` only             -- listing for that department (``req=2``)
    * ``dept`` and ``course``   -- listing for that course (``req=4``)

    Args:
        dept: department key, interpolated verbatim into the query string.
        course: course key, interpolated verbatim into the query string.
        section: reserved; section-level requests are not implemented.

    Returns:
        The root element produced by ``lxml.etree.XML`` for the response body.

    Raises:
        NotImplementedError: if ``dept``, ``course`` and ``section`` are all
            given (a subclass of ``Exception``, which was raised previously).

    Side effects:
        Records the requested URL in the module-level ``debug_last_url``.
    """
    global debug_last_url

    # Fail fast, before building a URL or touching the network.
    if dept is not None and course is not None and section is not None:
        raise NotImplementedError("not implemented")

    url_prefix = "https://courses.students.ubc.ca/cs/servlets/SRVCourseSchedule?sessyr=%d&sesscd=%s&" % (SESSYR, SESSCD)
    if dept is None:
        url = url_prefix + "output=2&req=0"
    elif course is None:
        url = url_prefix + "output=5&req=2&dept=%s" % dept
    else:
        url = url_prefix + "output=5&req=4&dept=%s&course=%s" % (dept, course)
    debug_last_url = url  # DEBUG

    # urllib2 responses are not context managers on Python 2, so close the
    # connection explicitly even when read() fails.
    response = urllib2.urlopen(url)
    try:
        raw = response.read()
    finally:
        response.close()
    result = raw.decode('utf8', 'ignore')

    # workaround for anthropology text bugs: strip every C0 control character
    # except tab, LF and CR, since the others are not legal in XML 1.0.
    result = re.sub("[\x00-\x08\x0b\x0c\x0e-\x1f]", "", result)

    return lxml.etree.XML(result)

def scrape_depts():
    """Generate one attribute dict per child element of the top-level listing.

    The request is made lazily: ``read_endpoint()`` is called with no
    arguments when iteration starts, and each child of the returned root is
    converted with ``dict(child.items())``.
    """
    listing = read_endpoint()
    for node in listing:
        yield dict(node.items())

def scrape_courses(dept):
    """Yield one attribute dict per child element of a department's listing.

    Args:
        dept: the department key itself, or a mapping holding it under
            ``'key'`` (for example a dict yielded by ``scrape_depts``).

    Yields:
        ``dict(child.items())`` for each child of the XML root returned by
        ``read_endpoint(dept)``.
    """
    # isinstance rather than an exact type comparison, so that dict
    # subclasses (e.g. OrderedDict) are unwrapped as well.
    if isinstance(dept, dict):
        dept = dept['key']

    for elem_course in read_endpoint(dept):
        yield dict(elem_course.items())

def scrape_sections(dept, course):
    """Yield one dict per section of a course, with nested teaching units.

    Each yielded dict holds the section element's XML attributes plus a
    ``'teachingunits'`` list. Every teaching-unit dict holds that element's
    attributes and, when the element has a ``meetings`` child, a
    ``'meetings'`` list with one attribute dict per meeting.

    Args:
        dept: the department key, or a mapping holding it under ``'key'``.
        course: the course key, or a mapping holding it under ``'key'``.
    """
    # isinstance rather than an exact type comparison, so that dict
    # subclasses (e.g. OrderedDict) are unwrapped as well.
    if isinstance(dept, dict):
        dept = dept['key']
    if isinstance(course, dict):
        course = course['key']

    for elem_section in read_endpoint(dept, course):
        section = dict(elem_section.items())

        # find() returns None when the child is missing; treat that as "no
        # teaching units" instead of failing with TypeError. Compare against
        # None explicitly: an element without children is falsy.
        elem_teachingunits = elem_section.find("teachingunits")
        if elem_teachingunits is None:
            elem_teachingunits = ()

        teachingunits = []
        for elem_teachingunit in elem_teachingunits:
            teachingunit = dict(elem_teachingunit.items())

            elem_meetings = elem_teachingunit.find("meetings")
            if elem_meetings is not None:
                teachingunit['meetings'] = [
                    dict(elem_meeting.items()) for elem_meeting in elem_meetings
                ]

            teachingunits.append(teachingunit)
        section['teachingunits'] = teachingunits
        yield section

if __name__ == "__main__":
    # Script entry point: scrape every section of every course in every
    # department and dump the combined list to sections.json.
    import simplejson

    # Scrape everything before opening the output file, so that a failure
    # midway through scraping does not clobber an existing sections.json.
    allsections = []
    for dept in scrape_depts():
        for course in scrape_courses(dept['key']):
            for section in scrape_sections(dept['key'], course):
                # Tag each section with the keys of its parent course and
                # department so the flat list stays self-describing.
                section["course"] = course["key"]
                section["dept"] = dept["key"]
                allsections.append(section)

    # `with` closes the file even if serialization raises.
    with open("sections.json", "w") as f:
        simplejson.dump(allsections, f)
	#!/usr/bin/python2.7
	import lxml, lxml.etree, urllib2, re

	# Academic session to scrape. read_endpoint() sends these values as the
	# `sessyr` and `sesscd` query parameters of every request.
	SESSYR = 2012
	SESSCD = "W"

	# URL most recently requested by read_endpoint(); kept purely as a debugging aid.
	debug_last_url = None

	def read_endpoint(dept=None, course=None, section=None):
		"""Fetch one level of the course-schedule listing and return it as parsed XML.

		Which listing is requested depends on the arguments supplied:

		* no ``dept``               -- top-level listing (``output=2&req=0``)
		* ``dept`` only             -- listing for that department (``req=2``)
		* ``dept`` and ``course``   -- listing for that course (``req=4``)

		Args:
			dept: department key, interpolated verbatim into the query string.
			course: course key, interpolated verbatim into the query string.
			section: reserved; section-level requests are not implemented.

		Returns:
			The root element produced by ``lxml.etree.XML`` for the response body.

		Raises:
			NotImplementedError: if ``dept``, ``course`` and ``section`` are all
				given (a subclass of ``Exception``, which was raised previously).

		Side effects:
			Records the requested URL in the module-level ``debug_last_url``.
		"""
		global debug_last_url

		# Fail fast, before building a URL or touching the network.
		if dept is not None and course is not None and section is not None:
			raise NotImplementedError("not implemented")

		url_prefix = "https://courses.students.ubc.ca/cs/servlets/SRVCourseSchedule?sessyr=%d&sesscd=%s&" % (SESSYR, SESSCD)
		if dept is None:
			url = url_prefix + "output=2&req=0"
		elif course is None:
			url = url_prefix + "output=5&req=2&dept=%s" % dept
		else:
			url = url_prefix + "output=5&req=4&dept=%s&course=%s" % (dept, course)
		debug_last_url = url  # DEBUG

		# urllib2 responses are not context managers on Python 2, so close the
		# connection explicitly even when read() fails.
		response = urllib2.urlopen(url)
		try:
			raw = response.read()
		finally:
			response.close()
		result = raw.decode('utf8', 'ignore')

		# workaround for anthropology text bugs: strip every C0 control character
		# except tab, LF and CR, since the others are not legal in XML 1.0.
		result = re.sub("[\x00-\x08\x0b\x0c\x0e-\x1f]", "", result)

		return lxml.etree.XML(result)

	def scrape_depts():
		"""Generate one attribute dict per child element of the top-level listing.

		The request is made lazily: ``read_endpoint()`` is called with no
		arguments when iteration starts, and each child of the returned root is
		converted with ``dict(child.items())``.
		"""
		for elem_dept in read_endpoint():
			yield dict(elem_dept.items())

	def scrape_courses(dept):
		"""Yield one attribute dict per child element of a department's listing.

		Args:
			dept: the department key itself, or a mapping holding it under
				``'key'`` (for example a dict yielded by ``scrape_depts``).

		Yields:
			``dict(child.items())`` for each child of the XML root returned by
			``read_endpoint(dept)``.
		"""
		# isinstance rather than an exact type comparison, so that dict
		# subclasses (e.g. OrderedDict) are unwrapped as well.
		if isinstance(dept, dict):
			dept = dept['key']

		for elem_course in read_endpoint(dept):
			yield dict(elem_course.items())

	def scrape_sections(dept, course):
		"""Yield one dict per section of a course, with nested teaching units.

		Each yielded dict holds the section element's XML attributes plus a
		``'teachingunits'`` list. Every teaching-unit dict holds that element's
		attributes and, when the element has a ``meetings`` child, a
		``'meetings'`` list with one attribute dict per meeting.

		Args:
			dept: the department key, or a mapping holding it under ``'key'``.
			course: the course key, or a mapping holding it under ``'key'``.
		"""
		# isinstance rather than an exact type comparison, so that dict
		# subclasses (e.g. OrderedDict) are unwrapped as well.
		if isinstance(dept, dict):
			dept = dept['key']
		if isinstance(course, dict):
			course = course['key']

		for elem_section in read_endpoint(dept, course):
			section = dict(elem_section.items())

			# find() returns None when the child is missing; treat that as "no
			# teaching units" instead of failing with TypeError. Compare against
			# None explicitly: an element without children is falsy.
			elem_teachingunits = elem_section.find("teachingunits")
			if elem_teachingunits is None:
				elem_teachingunits = ()

			teachingunits = []
			for elem_teachingunit in elem_teachingunits:
				teachingunit = dict(elem_teachingunit.items())

				elem_meetings = elem_teachingunit.find("meetings")
				if elem_meetings is not None:
					teachingunit['meetings'] = [
						dict(elem_meeting.items()) for elem_meeting in elem_meetings
					]

				teachingunits.append(teachingunit)
			section['teachingunits'] = teachingunits
			yield section

	if __name__ == "__main__":
		# Script entry point: scrape every section of every course in every
		# department and dump the combined list to sections.json.
		import simplejson

		# Scrape everything before opening the output file, so that a failure
		# midway through scraping does not clobber an existing sections.json.
		allsections = []
		for dept in scrape_depts():
			for course in scrape_courses(dept['key']):
				for section in scrape_sections(dept['key'], course):
					# Tag each section with the keys of its parent course and
					# department so the flat list stays self-describing.
					section["course"] = course["key"]
					section["dept"] = dept["key"]
					allsections.append(section)

		# `with` closes the file even if serialization raises.
		with open("sections.json", "w") as f:
			simplejson.dump(allsections, f)