an-empty-string/dump_sections.py

## dump_sections.py
"""Schedule-to-JSON converter.

Usage:
    dump_sections.py <term> [-i] [-o <filename>]
    dump_sections.py (-h | --help)

Options:
    -i --include-canceled   Include canceled courses in the data.
    -o --output=<filename>  Send JSON output to a file.

Term examples:
    sprg2017                Spring 2017
    fall2017                Fall 2017
    sum2017a                Summer 2017 academic term
    sum2017b                Summer 2017 session 1
    sum2017c                Summer 2017 session 2
    sum2017mm               Summer 2017 Maymester session
"""

import collections
import docopt
import json
import pprint
import requests
import sys

from typing import Dict, List, Tuple

def decode(b: bytes) -> str:
    """Turn a raw field into text, treating the bytes as UTF-8."""
    text = b.decode(encoding="utf-8")
    return text

def avail_state(b: bytes) -> bool:
    """Report availability: False only when the field reads ``Canceled``."""
    is_canceled = b == b"Canceled"
    return not is_canceled

def int_or_na(b: bytes) -> int:
    """Parse an integer field, mapping the ``####`` placeholder to 0."""
    return 0 if b == b"####" else int(b)

# Form-feed byte; get_sections() splits the raw page on it.
SEP = b"\x0c"

# Fixed-width column layout of one course line, in left-to-right order.
# Each entry is (field name, conversion applied to the stripped bytes, width);
# parse_course() reads one extra byte per field for the space separator.
COURSE_FIELDS = [
    ("kind", decode, 4),            # online? honors? etc
    ("crn", decode, 6),             # Course Reference Number
    ("num", decode, 3),             # course code
    ("sec", decode, 6),             # section code
    ("title", decode, 30),          # course name
    ("credit", float, 6),           # credit-hours granted
    ("cap", int_or_na, 4),          # enrollment cap
    ("enrolled", int, 4),           # current enrollment
    ("available", avail_state, 8),  # canceled or no?
    ("waitlist", int, 4),           # students on waitlist
    ("days", decode, 7),            # days of week course meets
    ("start", decode, 7),           # start time on days
    ("end", decode, 7),             # end time on days
    ("building", decode, 5),        # building column
    ("room", decode, 10),           # room column
    ("instructor", decode, 35),     # instructor column
]

# (attribute name, flag character) pairs. parse_attributes() marks an
# attribute true when its character occurs in a course's "kind" field.
COURSE_ATTRIBUTES = [
    ("summer_session_1", "1"),
    ("summer_session_2", "2"),
    ("summer_academic_term", "A"),
    ("distance_learning", "D"),
    ("honors", "H"),
    ("term_j", "J"),
    ("term_k", "K"),
    ("term_l", "L"),
    ("term_m", "M"),
    ("term_n", "N"),
    ("online", "O"),
    ("pass", "P"),
    ("special_offering", "S"),
    ("visiting_student", "V"),
    ("womens_studies", "W"),
    ("hybrid", "Y"),
    # these attributes are set by our code specially
    ("canceled", "="),
]

# ParsedCourse fields that interpret_department() copies unchanged from the
# first line of a CRN into the merged CourseSection.
COPY_ATTRIBUTES = [
    "crn",
    "num",
    "sec",
    "title",
    "credit",
    "cap",
    "enrolled",
    "waitlist",
    "building",
    "room",
    "instructor",
]

# One line of the schedule listing as produced by parse_course(); the field
# names mirror COURSE_FIELDS.
ParsedCourse = collections.namedtuple(
    "ParsedCourse",
    ["kind", "crn", "num", "sec", "title", "credit", "cap", "enrolled",
     "available", "waitlist", "days", "start", "end", "building", "room",
     "instructor"],
)

# One meeting slot of a section.
MeetingTime = collections.namedtuple("MeetingTime", ["days", "start", "end"])

# One section after interpret_department() has merged every line that shares
# a CRN.
CourseSection = collections.namedtuple(
    "CourseSection",
    ["crn", "dept", "num", "sec", "title", "credit", "cap", "enrolled",
     "waitlist", "meetings", "building", "room", "instructor", "attrs",
     "short"],
)

# Maps each COURSE_ATTRIBUTES name to whether it applies.
CourseAttributes = Dict[str, bool]
# (department code, parsed course lines) as returned by parse_section().
# The second element is a list of rows, not a single row.
DepartmentData = Tuple[str, List[ParsedCourse]]

def get_raw(term, timeout: float = 30.0) -> bytes:
    """Download the raw schedule page for *term*.

    Args:
        term: Term code substituted into the schedule URL (e.g. ``fall2017``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The undecoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    url = "http://www.uah.edu/schedules/{}.html".format(term)
    response = requests.get(url, timeout=timeout)
    # Fail here instead of handing an error page to the parser, which would
    # otherwise die later with an unrelated IndexError/ValueError.
    response.raise_for_status()
    return response.content

def get_sections(data: bytes) -> List[bytes]:
    """Split the raw page on ``SEP`` and discard the leading chunk."""
    chunks = data.split(SEP)
    del chunks[0]  # first section is blank
    return chunks

def get_section_code(section: bytes) -> str:
    """Return the text between the first pair of double quotes.

    For a section starting with ``<a name="ACC">`` this yields ``ACC``.
    """
    pieces = section.split(b'"', 2)
    quoted = pieces[1]
    return quoted.decode()

def get_course_data(section: bytes) -> List[bytes]:
    """Extract the course lines from one department section.

    Keeps the text between the first ``<pre>`` and the following ``<HR>``,
    strips surrounding whitespace, splits it on newlines and drops the three
    header lines.

    Args:
        section: Raw bytes of one department section.

    Returns:
        One bytes object per course line (a list, not a single bytes value).

    Raises:
        IndexError: If the section contains no ``<pre>`` marker.
    """
    listing = section.split(b"<pre>", maxsplit=1)[1]
    listing = listing.split(b"<HR>", maxsplit=1)[0]
    lines = listing.strip().split(b"\n")
    return lines[3:]  # three lines of headers

def parse_course(course_line: bytes) -> ParsedCourse:
    """Slice one fixed-width course line into a ``ParsedCourse``.

    Walks ``COURSE_FIELDS`` left to right; each column spans its declared
    width plus one byte for the space separator, and the stripped slice is
    passed through the field's conversion function.
    """
    values = {}
    offset = 0

    for name, convert, width in COURSE_FIELDS:
        stop = offset + width + 1   # include the space separator
        raw_field = course_line[offset:stop].strip()
        values[name] = convert(raw_field)
        offset = stop

    return ParsedCourse(**values)

def parse_attributes(course: ParsedCourse) -> CourseAttributes:
    """Map every ``COURSE_ATTRIBUTES`` name to whether its flag character
    appears in ``course.kind``."""
    kind = course.kind
    return {name: flag in kind for name, flag in COURSE_ATTRIBUTES}

def parse_section(section: bytes) -> DepartmentData:
    """Parse one department section into ``(department code, course rows)``."""
    rows = get_course_data(section)
    courses = list(map(parse_course, rows))
    code = get_section_code(section)
    return code, courses

def interpret_department(data: DepartmentData) -> List[CourseSection]:
    """Merge a department's parsed lines into one ``CourseSection`` per CRN.

    Lines sharing a CRN are treated as the same section listed once per
    meeting time: shared fields come from the first line, and every line whose
    ``days`` is not ``"TBA"`` contributes one ``MeetingTime``.
    """
    dept_code, parsed_courses = data

    # Group the rows by CRN; dict ordering keeps first-seen CRN order.
    rows_by_crn = collections.defaultdict(list)
    for row in parsed_courses:
        rows_by_crn[row.crn].append(row)

    sections = []
    for rows in rows_by_crn.values():
        first = rows[0]

        # Fields copied verbatim from the first row.
        copied = {key: getattr(first, key) for key in COPY_ATTRIBUTES}

        # Flags derived from "kind"; canceled-ness is the one special case
        # and comes from the availability column instead.
        attrs = parse_attributes(first)
        if not first.available:
            attrs["canceled"] = True

        # Duplicate rows differ only in their meeting times.
        meetings = [
            MeetingTime(days=row.days, start=row.start, end=row.end)
            for row in rows
            if row.days != "TBA"
        ]

        sections.append(CourseSection(
            dept=dept_code,
            attrs=attrs,
            short="{}{}-{}".format(dept_code, first.num, first.sec),
            meetings=meetings,
            **copied
        ))

    return sections

def interpret_all(data: List[bytes]) -> List[CourseSection]:
    """Parse and merge every department section into one flat list."""
    return [
        course_section
        for raw_section in data
        for course_section in interpret_department(parse_section(raw_section))
    ]

def drop_canceled(data: List[CourseSection]) -> List[CourseSection]:
    """Return only the sections whose ``canceled`` attribute is false."""
    # The CourseSection field is "attrs"; reading "attr" raised AttributeError.
    return [course for course in data if not course.attrs["canceled"]]

def to_jsonable(data: List[CourseSection]):
    """Convert sections into plain dicts that ``json.dump`` can serialize.

    Each section becomes a dict of its fields; its ``meetings`` list is
    converted to a list of dicts as well.
    """
    result = []

    for course in data:
        course_json = dict(course._asdict())
        course_json["meetings"] = [dict(meeting._asdict())
                                   for meeting in course_json["meetings"]]
        result.append(course_json)

    return result

def dump_data(term, out, include_canceled=False):
    """Fetch, parse and write one term's schedule as JSON.

    Args:
        term: Term code passed to ``get_raw`` (e.g. ``fall2017``).
        out: Writable text file object that receives the JSON.
        include_canceled: Keep canceled sections in the output when true;
            they are dropped by default.
    """
    data = get_raw(term)
    sections = get_sections(data)[1:]  # we do not care about the index section
    interpreted = interpret_all(sections)
    if not include_canceled:
        # Previously the flag was accepted but never acted upon.
        interpreted = drop_canceled(interpreted)
    jsonable = to_jsonable(interpreted)
    json.dump(jsonable, out)

def main():
    """Parse the command line and write the requested term's JSON.

    Output goes to the file named by ``--output`` when given, otherwise to
    standard output.
    """
    arguments = docopt.docopt(__doc__, version="Schedule-to-JSON converter 0.0.1")
    term = arguments["<term>"]
    include_canceled = arguments["--include-canceled"]
    output_path = arguments["--output"]

    if output_path:
        # The context manager flushes and closes the file even if dumping
        # fails; the handle used to be left open.
        with open(output_path, "w") as out:
            dump_data(term, out, include_canceled)
    else:
        dump_data(term, sys.stdout, include_canceled)

# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	"""Schedule-to-JSON converter.

	Usage:
	dump_sections.py <term> [-i] [-o <filename>]
	dump_sections.py (-h | --help)

	Options:
	-i --include-canceled Include canceled courses in the data.
	-o --output=<filename> Send JSON output to a file.

	Term examples:
	sprg2017 Spring 2017
	fall2017 Fall 2017
	sum2017a Summer 2017 academic term
	sum2017b Summer 2017 session 1
	sum2017c Summer 2017 session 2
	sum2017mm Summer 2017 Maymester session
	"""

	import collections
	import docopt
	import json
	import pprint
	import requests
	import sys

	from typing import Dict, List, Tuple

	def decode(b: bytes) -> str:
		"""Turn a raw field into text, treating the bytes as UTF-8."""
		return b.decode("utf-8")

	def avail_state(b: bytes) -> bool:
		"""Report availability: False only when the field reads ``Canceled``."""
		return b != b"Canceled"

	def int_or_na(b: bytes) -> int:
		"""Parse an integer field, mapping the ``####`` placeholder to 0."""
		if b == b"####":
			return 0
		return int(b)

	# Form-feed byte; get_sections() splits the raw page on it.
	SEP = b"\x0c"

	# Fixed-width column layout of one course line, in left-to-right order.
	# Each entry is (field name, conversion applied to the stripped bytes, width);
	# parse_course() reads one extra byte per field for the space separator.
	COURSE_FIELDS = [
		("kind", decode, 4),            # online? honors? etc
		("crn", decode, 6),             # Course Reference Number
		("num", decode, 3),             # course code
		("sec", decode, 6),             # section code
		("title", decode, 30),          # course name
		("credit", float, 6),           # credit-hours granted
		("cap", int_or_na, 4),          # enrollment cap
		("enrolled", int, 4),           # current enrollment
		("available", avail_state, 8),  # canceled or no?
		("waitlist", int, 4),           # students on waitlist
		("days", decode, 7),            # days of week course meets
		("start", decode, 7),           # start time on days
		("end", decode, 7),             # end time on days
		("building", decode, 5),        # building column
		("room", decode, 10),           # room column
		("instructor", decode, 35),     # instructor column
	]

	# (attribute name, flag character) pairs. parse_attributes() marks an
	# attribute true when its character occurs in a course's "kind" field.
	COURSE_ATTRIBUTES = [
		("summer_session_1", "1"),
		("summer_session_2", "2"),
		("summer_academic_term", "A"),
		("distance_learning", "D"),
		("honors", "H"),
		("term_j", "J"),
		("term_k", "K"),
		("term_l", "L"),
		("term_m", "M"),
		("term_n", "N"),
		("online", "O"),
		("pass", "P"),
		("special_offering", "S"),
		("visiting_student", "V"),
		("womens_studies", "W"),
		("hybrid", "Y"),
		# these attributes are set by our code specially
		("canceled", "="),
	]

	# ParsedCourse fields that interpret_department() copies unchanged from the
	# first line of a CRN into the merged CourseSection.
	COPY_ATTRIBUTES = [
		"crn",
		"num",
		"sec",
		"title",
		"credit",
		"cap",
		"enrolled",
		"waitlist",
		"building",
		"room",
		"instructor",
	]

	# One line of the schedule listing as produced by parse_course(); the field
	# names mirror COURSE_FIELDS.
	ParsedCourse = collections.namedtuple(
		"ParsedCourse",
		["kind", "crn", "num", "sec", "title", "credit", "cap", "enrolled",
		 "available", "waitlist", "days", "start", "end", "building", "room",
		 "instructor"],
	)

	# One meeting slot of a section.
	MeetingTime = collections.namedtuple("MeetingTime", ["days", "start", "end"])

	# One section after interpret_department() has merged every line that shares
	# a CRN.
	CourseSection = collections.namedtuple(
		"CourseSection",
		["crn", "dept", "num", "sec", "title", "credit", "cap", "enrolled",
		 "waitlist", "meetings", "building", "room", "instructor", "attrs",
		 "short"],
	)

	# Maps each COURSE_ATTRIBUTES name to whether it applies.
	CourseAttributes = Dict[str, bool]
	# (department code, parsed course lines) as returned by parse_section().
	# The second element is a list of rows, not a single row.
	DepartmentData = Tuple[str, List[ParsedCourse]]

	def get_raw(term, timeout: float = 30.0) -> bytes:
		"""Download the raw schedule page for *term*.

		Args:
			term: Term code substituted into the schedule URL (e.g. ``fall2017``).
			timeout: Seconds to wait for the server before giving up.

		Returns:
			The undecoded response body.

		Raises:
			requests.HTTPError: If the server answers with an error status.
			requests.RequestException: On connection problems or timeout.
		"""
		url = "http://www.uah.edu/schedules/{}.html".format(term)
		response = requests.get(url, timeout=timeout)
		# Fail here instead of handing an error page to the parser.
		response.raise_for_status()
		return response.content

	def get_sections(data: bytes) -> List[bytes]:
		"""Split the raw page on ``SEP`` and discard the leading chunk."""
		return data.split(SEP)[1:]  # first section is blank

	def get_section_code(section: bytes) -> str:
		"""Return the text between the first pair of double quotes.

		For a section starting with ``<a name="ACC">`` this yields ``ACC``.
		"""
		return section.split(b'"', maxsplit=2)[1].decode()

	def get_course_data(section: bytes) -> List[bytes]:
		"""Extract the course lines from one department section.

		Keeps the text between the first ``<pre>`` and the following ``<HR>``,
		strips surrounding whitespace, splits it on newlines and drops the three
		header lines.

		Returns:
			One bytes object per course line (a list, not a single bytes value).

		Raises:
			IndexError: If the section contains no ``<pre>`` marker.
		"""
		section = section.split(b"<pre>", maxsplit=1)[1]
		section = section.split(b"<HR>", maxsplit=1)[0]
		return section.strip().split(b"\n")[3:]  # three lines of headers

	def parse_course(course_line: bytes) -> ParsedCourse:
		"""Slice one fixed-width course line into a ``ParsedCourse``.

		Walks ``COURSE_FIELDS`` left to right; each column spans its declared
		width plus one byte for the space separator, and the stripped slice is
		passed through the field's conversion function.
		"""
		collected_fields = {}
		current_index = 0

		for field_name, conversion_func, field_len in COURSE_FIELDS:
			next_index = current_index + field_len + 1  # include the space separator
			collected_fields[field_name] = conversion_func(
				course_line[current_index:next_index].strip())
			current_index = next_index

		return ParsedCourse(**collected_fields)

	def parse_attributes(course: ParsedCourse) -> CourseAttributes:
		"""Map every ``COURSE_ATTRIBUTES`` name to whether its flag character
		appears in ``course.kind``."""
		return {attr: char in course.kind for attr, char in COURSE_ATTRIBUTES}

	def parse_section(section: bytes) -> DepartmentData:
		"""Parse one department section into ``(department code, course rows)``."""
		course_data = get_course_data(section)
		parsed_courses = [parse_course(line) for line in course_data]
		return get_section_code(section), parsed_courses

	def interpret_department(data: DepartmentData) -> List[CourseSection]:
		"""Merge a department's parsed lines into one ``CourseSection`` per CRN.

		Lines sharing a CRN are treated as the same section listed once per
		meeting time: shared fields come from the first line, and every line
		whose ``days`` is not ``"TBA"`` contributes one ``MeetingTime``.
		"""
		dept_code, parsed_courses = data

		result = []

		# Group the rows by CRN; dict ordering keeps first-seen CRN order.
		crn_courses = collections.defaultdict(list)
		for course in parsed_courses:
			crn_courses[course.crn].append(course)

		for courses in crn_courses.values():
			first = courses[0]

			# we can copy a lot of data from the first course
			new_attributes = {key: getattr(first, key) for key in COPY_ATTRIBUTES}

			# we cannot copy: dept, meetings, attrs
			new_attributes["dept"] = dept_code
			new_attributes["attrs"] = parse_attributes(first)

			# we also add: short
			new_attributes["short"] = "{}{}-{}".format(dept_code, first.num, first.sec)

			# our only attribute special case: canceled-ness
			if not first.available:
				new_attributes["attrs"]["canceled"] = True

			# the only difference between duplicate courses is meeting times
			new_attributes["meetings"] = [
				MeetingTime(days=course.days, start=course.start, end=course.end)
				for course in courses
				if course.days != "TBA"  # nothing to record for TBA rows
			]

			result.append(CourseSection(**new_attributes))

		return result

	def interpret_all(data: List[bytes]) -> List[CourseSection]:
		"""Parse and merge every department section into one flat list."""
		result = []

		for section in data:
			department_data = parse_section(section)
			result.extend(interpret_department(department_data))

		return result

	def drop_canceled(data: List[CourseSection]) -> List[CourseSection]:
		"""Return only the sections whose ``canceled`` attribute is false."""
		# The CourseSection field is "attrs"; reading "attr" raised AttributeError.
		return [course for course in data if not course.attrs["canceled"]]

	def to_jsonable(data: List[CourseSection]):
		"""Convert sections into plain dicts that ``json.dump`` can serialize.

		Each section becomes a dict of its fields; its ``meetings`` list is
		converted to a list of dicts as well.
		"""
		result = []

		for course in data:
			course_json = dict(course._asdict())
			course_json["meetings"] = [dict(meeting._asdict())
			                           for meeting in course_json["meetings"]]
			result.append(course_json)

		return result

	def dump_data(term, out, include_canceled=False):
		"""Fetch, parse and write one term's schedule as JSON.

		Args:
			term: Term code passed to ``get_raw`` (e.g. ``fall2017``).
			out: Writable text file object that receives the JSON.
			include_canceled: Keep canceled sections in the output when true;
				they are dropped by default.
		"""
		data = get_raw(term)
		sections = get_sections(data)[1:]  # we do not care about the index section
		interpreted = interpret_all(sections)
		if not include_canceled:
			# Previously the flag was accepted but never acted upon.
			interpreted = drop_canceled(interpreted)
		jsonable = to_jsonable(interpreted)
		json.dump(jsonable, out)

	def main():
		"""Parse the command line and write the requested term's JSON.

		Output goes to the file named by ``--output`` when given, otherwise to
		standard output.
		"""
		arguments = docopt.docopt(__doc__, version="Schedule-to-JSON converter 0.0.1")
		term = arguments["<term>"]
		include_canceled = arguments["--include-canceled"]
		output_path = arguments["--output"]

		if output_path:
			# The context manager flushes and closes the file even if dumping
			# fails; the handle used to be left open.
			with open(output_path, "w") as out:
				dump_data(term, out, include_canceled)
		else:
			dump_data(term, sys.stdout, include_canceled)

	# Run the converter only when executed as a script, not when imported.
	if __name__ == '__main__':
		main()