Skip to content

Instantly share code, notes, and snippets.

@kennydude
Created June 9, 2012 14:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kennydude/2901124 to your computer and use it in GitHub Desktop.
Save kennydude/2901124 to your computer and use it in GitHub Desktop.
University course pullers

Usage:

Run `python uni.py` on the command line and it will ask which university to use. Then follow the on-screen instructions.

Supported:

  • Newcastle University (ncl). Should work for all courses, only tested on Computing
  • Birmingham University Computing School. Only this department because of the way it's organized

The script outputs plain, unstyled HTML containing each module's Aims and Syllabus, so you do not have to collect them manually.

Requirements

  • Python 2.7
  • BeautifulSoup 3
  • urllib2 (included in default py distro)
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
import urllib2, copy
# Startup banner, printed once before any prompt is shown.
for banner_line in (
    "University Module Scraper",
    "------",
    "@kennydude 2012",
    "All scripts correct as of date written",
    "------",
):
    print(banner_line)
def fetch_page(page):
    """Download a URL and return it parsed as a BeautifulSoup document.

    page -- the URL to open; it is passed straight to urllib2.urlopen.

    Returns the BeautifulSoup object built from the response body.
    Any error raised by urllib2.urlopen or while reading the body
    propagates to the caller.
    """
    response = urllib2.urlopen(page)
    try:
        # Read the body up front so the handle can be closed right away
        # instead of staying open until it is garbage collected.
        markup = response.read()
    finally:
        response.close()
    return BeautifulSoup(markup)
def save_out(contents, default=""):
    """Ask for a filename on the console and write *contents* to it.

    contents -- the text to write.
    default  -- filename used when the user just presses enter.

    The file is opened in "w" mode, so an existing file is overwritten.
    Errors from open() or write() (for example an empty filename when no
    default was given) propagate to the caller.
    """
    print("Saving. Press enter to use '%s'" % default)
    # An empty answer means "use the default".
    filename = raw_input("Filename: ") or default
    # The with-block closes the file even if write() raises.
    with open(filename, "w") as out_file:
        out_file.write(contents)
    print("Saved to %s" % filename)
def newcastle():
    """Scrape the module list of a Newcastle University degree.

    Prompts for the degree code, fetches that degree's modules page and
    takes its "contentArea" div as the output document. Every link in
    that div is replaced by a <div> holding an <h3> with the link text
    followed by the "Aims" and "Outline Of Syllabus" sections taken from
    the linked module page. The result is passed to save_out().

    Missing pieces (no content area, no menu/skip link, a link without
    an href, a module page without one of the two sections) are reported
    or skipped instead of aborting the whole run with an AttributeError
    or KeyError.
    """
    print("Newcastle Plugin")
    print("Degree number is found like so:")
    print("http://www.ncl.ac.uk/undergraduate/degrees/g600/")
    print(" ^ this part")
    degree = raw_input("Degree number: ")
    print("Thank you. Please wait this could take some time....")

    modules = fetch_page("http://www.ncl.ac.uk/undergraduate/degrees/%s/modules/" % degree)
    page = modules.find("div", {"id": "contentArea"})
    if page is None:
        # Without the content area there is nothing to scrape.
        print("Could not find the module list for degree '%s'. Exiting" % degree)
        return
    print("> Basic page found for degree")

    # Remove page furniture that should not end up in the output.
    menu = page.find("ul", {"class": "contentMenu"})
    if menu is not None:
        menu.extract()
    for script in page.findAll("script"):
        script.extract()
    skip_link = page.find("a", {"class": "skip"})
    if skip_link is not None:
        skip_link.extract()

    for module in page.findAll("a"):
        url = module.get("href")
        if not url:
            # An anchor without a target has no module page to fetch.
            continue
        print("> Fetching '%s' details..." % module.string)
        if "ncl.ac.uk" not in url:
            url = "http://www.ncl.ac.uk/%s" % url
        modpage = fetch_page(url)

        # New tags are created against the document they will live in.
        moddetail = Tag(modules, "div")
        title = Tag(modules, "h3")
        title.insert(0, NavigableString(module.string))
        moddetail.insert(0, title)

        for heading in ("Aims", "Outline Of Syllabus"):
            label = modpage.find("h4", text=heading)
            if label is None:
                print("> No '%s' section found, skipping it" % heading)
                continue
            # The section container is two levels above the matched node.
            moddetail.insert(len(moddetail.contents), label.parent.parent)

        module.replaceWith(moddetail)

    save_out(page.prettify(), "ncl-%s.html" % degree)
# Birmingham University plugin (School of Computer Science pages only).
# Prompts for a course page URL, walks the elements that follow the
# "Course Modules" text, replaces every module link found in a <ul> with
# that module's syllabus and aims, and saves the collected elements via
# save_out().
# The function name is misspelled ("brimingham") but is referenced under
# that spelling by the `unis` registry, so rename both together or not at all.
# NOTE(review): this copy of the file has lost its indentation; the block
# nesting described in the comments below is inferred from the statements
# themselves and should be confirmed against the original script.
def brimingham():
print "Birmingham University"
print "School of Computer Science"
print "You need the URL Like so: http://www.cs.bham.ac.uk/admissions/undergraduate/se.php"
url = raw_input("URL: ")
print "Thank you. This may take some time..."
course = fetch_page(url)
# Empty document that collects the output elements.
page = BeautifulSoup()
# First text node equal to "Course Modules"; presumably the section heading.
h = course.find(text="Course Modules")
# Counts the siblings visited; also used as the insert position into `page`.
i = 0
print "> Basic page found..."
# Walk every element that follows the heading's parent at the same level.
for el in h.parent.findNextSiblings():
i+=1
if el.name == "ul":
# A <ul> is treated as a list of module links; each link is expanded.
for module in el.findAll("a"):
print "> Loading module '%s' " % str(module.string)
url = module['href']
# Links without 'http' in them are prefixed with the CS site root.
if 'http' not in url:
url = "http://www.cs.bham.ac.uk/%s" % url
modpage = fetch_page(url)
# Container for this module: an <h2> title followed by the scraped parts.
moddetail = Tag(page, "div")
title = Tag(page, "h2")
title.insert(0, NavigableString(module.string))
moddetail.insert(0, title)
# Parent of the anchor named "DetailedSyllabus" plus the node right after it.
syl = modpage.find("a", {"name":"DetailedSyllabus"}).parent
sylco = syl.nextSibling
moddetail.insert(1, syl)
moddetail.insert(2, sylco)
# Parent of the anchor named "Aims"; its following siblings are taken
# until the next <h2> is reached.
aims = modpage.find("a", {"name":"Aims"}).parent
# Start at 4 so that index 3 stays free for the aims heading inserted below.
# NOTE(review): relies on how insert() treats an index past the end - confirm.
k = 4
for aim in aims.findNextSiblings():
if aim.name == "h2":
break
moddetail.insert(k, aim)
k+=1
# A copy of the aims element is inserted at index 3, ahead of its content.
moddetail.insert(3, copy.copy(aims))
# Swap the original link for the expanded module block.
module.replaceWith(moddetail)
# NOTE(review): nesting unclear - presumably runs once per sibling `el`.
page.insert(i, el)
# NOTE(review): nesting unclear - presumably runs once, after the sibling loop.
page.insert(i, h.parent)
save_out(page.prettify(), "bham-modules.html")
# Registry mapping the short code typed by the user to its scraper function.
unis = {"ncl": newcastle, "bham": brimingham}

# Joining with " " keeps the two spaces the original two-argument print
# produced between the label and the list of codes.
print(" ".join(["Universities available: ", str(unis.keys())]))
uni = raw_input("University: ")

# Look the scraper up once, then either run it or report the unknown code.
handler = unis.get(uni)
if handler is None:
    print("Not found. Exiting")
else:
    handler()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment