Skip to content

Instantly share code, notes, and snippets.

@joostrijneveld
Created August 25, 2013 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joostrijneveld/6334082 to your computer and use it in GitHub Desktop.
Save joostrijneveld/6334082 to your computer and use it in GitHub Desktop.
Downloads the course index of a Radboud prospectus and groups courses per quarter / semester.
#! /usr/bin/env python
import argparse
import urllib2
import re
from bs4 import BeautifulSoup
def setup_args():
    """Parse the command line and return the resulting argparse namespace.

    The script takes a single optional positional argument,
    ``courselistURL``. When it is left out, the default URL below (a 2013
    computing science master course list) is used instead.
    """
    default_url = 'http://www.studiegids.science.ru.nl/2013/science/prospectus/computing_science_master/courses/'
    cli = argparse.ArgumentParser(
        description='Downloads the course index of a Radboud prospectus and groups courses per quarter / semester.')
    # nargs='?' makes the positional optional so the default can kick in.
    cli.add_argument(
        'courselistURL',
        nargs='?',
        default=default_url,
        help='URL of the course list')
    namespace = cli.parse_args()
    return namespace
def yn_choice(message, default='y'):
    """Ask a yes/no question on the console and return the answer as a bool.

    Adapted from http://stackoverflow.com/a/4741730/1711232

    Arguments:
    message -- text shown to the user in front of the choice hint.
    default -- answer assumed when the user just presses Enter. 'y' or
               'yes' (in any letter case) means yes; anything else means no.

    Returns True when the reply is 'y' or 'yes' (case-insensitive,
    surrounding whitespace ignored), or when the reply is empty and the
    default is yes. Returns False otherwise.
    """
    # Decide once whether the default is "yes", so the hint shown to the user
    # ('Y/n' vs 'y/N') and the handling of an empty reply can never disagree.
    default_yes = default.lower() in ('y', 'yes')
    choices = 'Y/n' if default_yes else 'y/N'
    choice = raw_input("%s (%s) " % (message, choices))
    values = ('y', 'yes', '') if default_yes else ('y', 'yes')
    return choice.strip().lower() in values
def fetch_courses(url):
    """Download the course index page and collect the course links on it.

    Arguments:
    url -- address of the course list page to download.

    Returns a list of (title, link) tuples. The links are taken from the
    first <table> sibling that follows the element containing the text
    "Gesorteerd op vaknaam". ``title`` is the anchor's ``.string`` value
    (NOTE(review): bs4 returns None for anchors with nested markup - confirm
    whether that occurs on these pages).

    Raises AttributeError when the heading text or the following table
    cannot be found on the page.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    soup = BeautifulSoup(html)
    thead = soup.find(text=re.compile("Gesorteerd op vaknaam"))
    table = thead.parent
    courses = []
    for course in table.find_next_sibling('table').find_all('a'):
        href = course.get('href')
        if not href:
            # Anchors without a (non-empty) href carry no link to follow;
            # indexing href[0] on them would raise.
            continue
        if href.startswith('/'):  # urls *sometimes* lack the domain
            href = "http://www.studiegids.science.ru.nl" + href
        courses.append((course.string, href))
    return courses
def get_q_by_url(url):
    """Download a course page and return the period text found on it.

    Arguments:
    url -- address of the course page to download.

    Searches the page for the first text node matching "Scheduled" or
    "Periode". Returns the string "Unknown" when there is no such node.
    Otherwise returns the last child of that node's grandparent element,
    with surrounding whitespace stripped.
    NOTE(review): presumably the grandparent is a table cell whose last
    child is plain text - confirm against the live pages.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()
    soup = BeautifulSoup(html)
    scheduled = soup.find(text=re.compile("Scheduled|Periode"))
    if scheduled is None:
        return "Unknown"
    td = scheduled.parent.parent
    return list(td.children)[-1].strip()
def main(args):
result = dict()
courses = fetch_courses(args.courselistURL)
if yn_choice("About to fetch "+str(len(courses))+" course pages. Continue?"):
for course, url in courses:
time = get_q_by_url(url)
if time not in result:
result[time] = [course]
else:
result[time].append(course)
for timeslot in result:
print '=====', timeslot, '====='
for course in result[timeslot]:
print course
if __name__ == "__main__":
    # Only run when executed as a script: parse the command line, then hand
    # the parsed options straight to main().
    main(setup_args())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment