Created
August 25, 2013 14:18
-
-
Save joostrijneveld/6334082 to your computer and use it in GitHub Desktop.
Downloads the course index of a Radboud prospectus and groups courses per quarter / semester.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python
import argparse
import re

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved urlopen() to urllib.request; alias it so the existing
    # urllib2.urlopen(...) calls keep working unchanged.
    import urllib.request as urllib2

from bs4 import BeautifulSoup
def setup_args(argv=None):
    """Parse the command-line arguments of the script.

    Args:
        argv: Optional list of argument strings to parse. The default
            ``None`` makes argparse read ``sys.argv[1:]``, which is what the
            script does when run from the command line.

    Returns:
        An ``argparse.Namespace`` whose ``courselistURL`` attribute holds the
        URL of the course list, or the built-in default URL when the
        positional argument is omitted.
    """
    parser = argparse.ArgumentParser(
        description='Downloads the course index of a Radboud prospectus and groups courses per quarter / semester.')
    parser.add_argument(
        'courselistURL',
        help='URL of the course list',
        nargs='?',  # the URL is optional; the default below is used without it
        default='http://www.studiegids.science.ru.nl/2013/science/prospectus/computing_science_master/courses/')
    return parser.parse_args(argv)
def yn_choice(message, default='y'):
    """Ask a yes/no question on the terminal and return the answer as a bool.

    Adapted from http://stackoverflow.com/a/4741730/1711232

    Args:
        message: Question shown to the user.
        default: Answer assumed when the user just presses enter. 'y' or
            'yes' (in any case) means yes; any other value means no.

    Returns:
        True when the user answers 'y' or 'yes' (case-insensitive, surrounding
        whitespace ignored), or gives an empty answer while the default is
        yes. False for every other answer.
    """
    default_is_yes = default.lower() in ('y', 'yes')
    choices = 'Y/n' if default_is_yes else 'y/N'
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3 renamed raw_input() to input()
    choice = read_line("%s (%s) " % (message, choices))
    # Accept the empty answer based on the same flag that produced the
    # prompt, so a prompt showing "Y/n" always treats plain enter as yes.
    values = ('y', 'yes', '') if default_is_yes else ('y', 'yes')
    return choice.strip().lower() in values
def fetch_courses(url):
    """Download the course index page and list the courses linked from it.

    The courses are the links inside the first <table> sibling that follows
    the element containing the text "Gesorteerd op vaknaam".

    Args:
        url: URL of the course list page.

    Returns:
        A list of ``(link text, course URL)`` tuples in page order. URLs that
        start with '/' are prefixed with the studiegids domain.

    Raises:
        ValueError: If the page lacks the "Gesorteerd op vaknaam" text or no
            table follows it.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()  # release the connection even if read() fails
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # consider pinning one after checking the result against the live page.
    soup = BeautifulSoup(html)
    thead = soup.find(text=re.compile("Gesorteerd op vaknaam"))
    if thead is None:
        raise ValueError("No 'Gesorteerd op vaknaam' text found at %s" % url)
    table = thead.parent.find_next_sibling('table')
    if table is None:
        raise ValueError("No course table found at %s" % url)
    courses = []
    for link in table.find_all('a'):
        href = link.get('href')
        if not href:
            continue  # an <a> without a target cannot point at a course page
        if href.startswith('/'):  # urls *sometimes* lack the domain
            href = "http://www.studiegids.science.ru.nl" + href
        courses.append((link.string, href))
    return courses
def get_q_by_url(url):
    """Download one course page and return the period it is scheduled in.

    Finds the first text node matching "Scheduled" or "Periode" and returns
    the stripped last child of that node's grandparent element.

    Args:
        url: URL of the course page.

    Returns:
        The period text, or "Unknown" when the page has neither label.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()  # release the connection even if read() fails
    # NOTE(review): no parser is named, so bs4 picks the best one installed;
    # consider pinning one after checking the result against the live page.
    soup = BeautifulSoup(html)
    scheduled = soup.find(text=re.compile("Scheduled|Periode"))
    if scheduled is None:
        return "Unknown"
    # Presumably the label sits in an inline element inside a table cell and
    # the value is the trailing text of that cell -- verify against the page.
    td = scheduled.parent.parent
    return list(td.children)[-1].strip()
def main(args):
    """Fetch every course page and print the courses grouped per period.

    Asks for confirmation before downloading the individual course pages;
    nothing is printed when the user declines.

    Args:
        args: Parsed arguments; only ``args.courselistURL`` is read.
    """
    courses = fetch_courses(args.courselistURL)
    if not yn_choice("About to fetch %d course pages. Continue?" % len(courses)):
        return
    result = dict()
    for course, url in courses:
        result.setdefault(get_q_by_url(url), []).append(course)
    # Sort the groups so the output order is deterministic rather than
    # following dict iteration order.
    for timeslot in sorted(result):
        # Single-argument print() behaves the same on Python 2 and 3.
        print("===== %s =====" % (timeslot,))
        for course in result[timeslot]:
            print(course)
if __name__ == "__main__":
    # Script entry point: parse the command line and run the scraper.
    main(setup_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment