Skip to content

Instantly share code, notes, and snippets.

@an-empty-string
Created June 29, 2017 07:03
Show Gist options
  • Save an-empty-string/a09850b83c253ca7c0cc8d369ba263c2 to your computer and use it in GitHub Desktop.
Save an-empty-string/a09850b83c253ca7c0cc8d369ba263c2 to your computer and use it in GitHub Desktop.
schedule data converter
"""Schedule-to-JSON converter.

Usage:
    dump_sections.py <term> [-i] [-o <filename>]
    dump_sections.py (-h | --help)

Options:
    -i --include-canceled   Include canceled courses in the data.
    -o --output=<filename>  Send JSON output to a file.

Term examples:
    sprg2017   Spring 2017
    fall2017   Fall 2017
    sum2017a   Summer 2017 academic term
    sum2017b   Summer 2017 session 1
    sum2017c   Summer 2017 session 2
    sum2017mm  Summer 2017 Maymester session
"""
# NOTE: the docstring above is parsed by docopt at runtime.  Usage patterns
# must be indented under "Usage:" and every option must be separated from its
# description by at least two spaces, otherwise docopt misreads the interface.
import collections
import docopt
import json
import pprint
import requests
import sys
from typing import Dict, List, Tuple
def decode(b: bytes) -> str:
    """Convert a raw UTF-8 field value to text."""
    return str(b, "utf-8")
def avail_state(b: bytes) -> bool:
    """Return False when the field reads exactly ``Canceled``, True otherwise."""
    is_canceled = (b == b"Canceled")
    return not is_canceled
def int_or_na(b: bytes):
    """Parse an integer field, mapping the ``####`` placeholder to 0."""
    return 0 if b == b"####" else int(b)
# Form-feed byte; get_sections splits the downloaded page on it.
SEP = b"\x0c"

# Column layout of one course line, in left-to-right order.
# Each entry is (name, conversion, length); parse_course consumes
# ``length + 1`` bytes per field to swallow the separating space.
COURSE_FIELDS = [
    # online? honors? etc
    ("kind", decode, 4),
    # Course Reference Number
    ("crn", decode, 6),
    # course code
    ("num", decode, 3),
    # section code
    ("sec", decode, 6),
    # course name
    ("title", decode, 30),
    # credit-hours granted
    ("credit", float, 6),
    # enrollment cap
    ("cap", int_or_na, 4),
    # current enrollment
    ("enrolled", int, 4),
    # canceled or no?
    ("available", avail_state, 8),
    # students on waitlist
    ("waitlist", int, 4),
    # days of week course meets
    ("days", decode, 7),
    # start time on days
    ("start", decode, 7),
    # end time on days
    ("end", decode, 7),
    ("building", decode, 5),
    ("room", decode, 10),
    ("instructor", decode, 35),
]
# (attribute name, flag character) pairs.  parse_attributes marks an
# attribute True when its flag character occurs in a course's "kind" field.
COURSE_ATTRIBUTES = [
    ("summer_session_1", "1"),
    ("summer_session_2", "2"),
    ("summer_academic_term", "A"),
    ("distance_learning", "D"),
    ("honors", "H"),
    ("term_j", "J"),
    ("term_k", "K"),
    ("term_l", "L"),
    ("term_m", "M"),
    ("term_n", "N"),
    ("online", "O"),
    ("pass", "P"),
    ("special_offering", "S"),
    ("visiting_student", "V"),
    ("womens_studies", "W"),
    ("hybrid", "Y"),

    # these attributes are set by our code specially
    ("canceled", "="),
]

# ParsedCourse fields copied unchanged onto the merged CourseSection.
COPY_ATTRIBUTES = [
    "crn",
    "num",
    "sec",
    "title",
    "credit",
    "cap",
    "enrolled",
    "waitlist",
    "building",
    "room",
    "instructor",
]
# One fixed-width course line, field for field as listed in COURSE_FIELDS.
ParsedCourse = collections.namedtuple(
    "ParsedCourse",
    [
        "kind", "crn", "num", "sec", "title", "credit", "cap", "enrolled",
        "available", "waitlist", "days", "start", "end", "building", "room",
        "instructor",
    ],
)

# A single meeting slot of a section.
MeetingTime = collections.namedtuple("MeetingTime", ["days", "start", "end"])

# All lines sharing one CRN merged into a single record.
CourseSection = collections.namedtuple(
    "CourseSection",
    [
        "crn", "dept", "num", "sec", "title", "credit", "cap", "enrolled",
        "waitlist", "meetings", "building", "room", "instructor", "attrs",
        "short",
    ],
)

# Attribute name -> whether the attribute applies to the course.
CourseAttributes = Dict[str, bool]

# (department code, every parsed course line of that department).
# parse_section returns a *list* of ParsedCourse, so the alias must say so.
DepartmentData = Tuple[str, List[ParsedCourse]]
def get_raw(term: str, timeout: float = 30.0) -> bytes:
    """Download the raw schedule page for *term*.

    Args:
        term: Term identifier inserted into the schedule URL (e.g. ``fall2017``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The undecoded response body.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never handed to the parser as schedule data.
        requests.RequestException: On connection problems or timeout.
    """
    url = "http://www.uah.edu/schedules/{}.html".format(term)
    # Without an explicit timeout requests may block forever on a stalled server.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content
def get_sections(data: bytes) -> List[bytes]:
    """Split the page on SEP and return every chunk after the first."""
    chunks = data.split(SEP)
    # first section is blank
    del chunks[:1]
    return chunks
def get_section_code(section: bytes) -> str:
    """Return the text following the first double quote in *section*.

    For a section that starts with ``<a name="ACC">`` this is ``"ACC"``.
    """
    pieces = section.split(b'"', 2)
    code = pieces[1]
    return code.decode()
def get_course_data(section: bytes) -> List[bytes]:
    """Extract the course lines from one department section.

    The listing is the text between the first ``<pre>`` and the following
    ``<HR>``; its first three lines are column headers and are dropped.

    Args:
        section: Raw bytes of a single department section.

    Returns:
        One bytes object per course line.  Blank lines are skipped because
        they carry no fields and cannot be parsed as a course.

    Raises:
        IndexError: If the section contains no ``<pre>`` marker.
    """
    listing = section.split(b"<pre>", maxsplit=1)[1]
    listing = listing.split(b"<HR>", maxsplit=1)[0]
    rows = listing.strip().split(b"\n")[3:]  # three lines of headers
    return [row for row in rows if row.strip()]
def parse_course(course_line: bytes) -> ParsedCourse:
    """Slice one fixed-width course line into a ParsedCourse.

    Fields are read left to right in COURSE_FIELDS order; each slice is
    stripped of surrounding whitespace and passed to the field's converter.
    """
    values = {}
    offset = 0
    for name, convert, width in COURSE_FIELDS:
        # each column is followed by a single space separator
        stop = offset + width + 1
        raw_value = course_line[offset:stop].strip()
        values[name] = convert(raw_value)
        offset = stop
    return ParsedCourse(**values)
def parse_attributes(course: ParsedCourse) -> CourseAttributes:
    """Map every known attribute name to whether its flag is in ``course.kind``."""
    kind = course.kind
    return {name: flag in kind for name, flag in COURSE_ATTRIBUTES}
def parse_section(section: bytes) -> DepartmentData:
    """Parse one department section into (department code, parsed course lines)."""
    courses = list(map(parse_course, get_course_data(section)))
    code = get_section_code(section)
    return code, courses
def interpret_department(data: DepartmentData) -> List[CourseSection]:
    """Merge a department's course lines into one CourseSection per CRN.

    Lines sharing a CRN are treated as the same section listed once per
    meeting time: descriptive fields come from the first line, and every
    line whose days are not ``TBA`` contributes one MeetingTime.
    """
    dept_code, parsed_courses = data

    # bucket the lines by CRN
    by_crn = {}
    for parsed in parsed_courses:
        by_crn.setdefault(parsed.crn, []).append(parsed)

    merged_sections = []
    for lines in by_crn.values():
        first = lines[0]

        # our only attribute special case: canceled-ness
        attrs = parse_attributes(first)
        if not first.available:
            attrs["canceled"] = True

        # most fields are copied straight from the first line ...
        fields = {name: getattr(first, name) for name in COPY_ATTRIBUTES}
        # ... while dept, attrs, short and meetings are computed here
        fields.update(
            dept=dept_code,
            attrs=attrs,
            short="{}{}-{}".format(dept_code, first.num, first.sec),
            # duplicate lines differ only in their meeting times
            meetings=[
                MeetingTime(days=line.days, start=line.start, end=line.end)
                for line in lines
                if line.days != "TBA"
            ],
        )
        merged_sections.append(CourseSection(**fields))
    return merged_sections
def interpret_all(data: List[bytes]) -> List[CourseSection]:
    """Parse and merge every department section into one flat list."""
    return [
        course_section
        for raw_section in data
        for course_section in interpret_department(parse_section(raw_section))
    ]
def drop_canceled(data: List[CourseSection]) -> List[CourseSection]:
    """Return only the sections whose ``canceled`` attribute is false."""
    # The CourseSection field is named "attrs" (plural).
    return [course for course in data if not course.attrs["canceled"]]


def to_jsonable(data: List[CourseSection]):
    """Convert sections, and their nested meeting times, to plain dicts.

    Returns:
        A list with one dict per section; each ``meetings`` value is a list
        of dicts, so the result can be passed to ``json.dump``.
    """
    result = []
    for course in data:
        course_json = dict(course._asdict())
        course_json["meetings"] = [
            dict(meeting._asdict()) for meeting in course_json["meetings"]
        ]
        result.append(course_json)
    return result


def dump_data(term, out, include_canceled=False):
    """Fetch the schedule for *term* and write it to *out* as JSON.

    Args:
        term: Term identifier understood by get_raw (e.g. ``fall2017``).
        out: Writable text file object receiving the JSON document.
        include_canceled: When false (the default) canceled sections are
            left out of the output.
    """
    data = get_raw(term)
    sections = get_sections(data)[1:]  # we do not care about the index section
    interpreted = interpret_all(sections)
    if not include_canceled:
        interpreted = drop_canceled(interpreted)
    json.dump(to_jsonable(interpreted), out)
def main():
    """Command-line entry point: parse arguments and dump the requested term."""
    arguments = docopt.docopt(__doc__, version="Schedule-to-JSON converter 0.0.1")
    term = arguments["<term>"]
    include_canceled = arguments["--include-canceled"]
    output_path = arguments["--output"]

    if output_path:
        # The with-block guarantees the file is flushed and closed, even if
        # fetching or parsing the schedule raises.
        with open(output_path, "w") as out:
            dump_data(term, out, include_canceled)
    else:
        dump_data(term, sys.stdout, include_canceled)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment