|
from BeautifulSoup import BeautifulSoup, Tag, NavigableString |
|
import urllib2, copy |
|
|
|
# Start-up banner, written once when the script is launched.
# A parenthesised single-argument print emits the same text as the print
# statement on Python 2.
for _banner_line in (
    "University Module Scraper",
    "------",
    "@kennydude 2012",
    "All scripts correct as of date written",
    "------",
):
    print(_banner_line)
|
|
|
def fetch_page(page):
    """Download a URL and parse the response with BeautifulSoup.

    page -- the URL to fetch; it is passed unchanged to urllib2.urlopen.

    Returns the BeautifulSoup document built from the response.  Any error
    raised by urllib2.urlopen propagates to the caller.
    """
    response = urllib2.urlopen(page)
    try:
        # Assumes the parser consumes the stream inside its constructor
        # (BeautifulSoup 3 behaviour) -- TODO confirm if the parser is changed.
        return BeautifulSoup(response)
    finally:
        # Release the connection even when parsing fails.
        response.close()
|
|
|
def save_out(contents, default=""):
    """Ask the user for a filename and write `contents` to that file.

    contents -- the string to write.
    default  -- filename used when the user just presses enter.

    The target file is opened in "w" mode, so an existing file is replaced.
    Errors from open()/write() propagate to the caller.
    """
    print("Saving. Press enter to use '%s'" % default)
    # raw_input returns a string, so an empty answer falls back to the default.
    filename = raw_input("Filename: ") or default
    # The with-block closes the file even if the write raises.
    with open(filename, "w") as out:
        out.write(contents)
    print("Saved to %s" % filename)
|
|
|
def newcastle():
    """Scrape a Newcastle University degree's module details into one page.

    Prompts for the degree code, downloads that degree's modules page and
    takes its "contentArea" div.  Navigation boilerplate is removed, then
    every link left in the div is followed and replaced by a <div> holding
    the link text as an <h3> plus the "Aims" and "Outline Of Syllabus"
    sections of the linked page.  The result is passed to save_out().

    Links without an href, and links whose target has neither section, are
    left untouched.  Network errors from fetch_page() propagate.
    """
    # Python 2 standard library; imported locally as only the scrapers use it.
    from urlparse import urljoin

    site_root = "http://www.ncl.ac.uk/"

    print("Newcastle Plugin")
    print("Degree number is found like so:")
    print("http://www.ncl.ac.uk/undergraduate/degrees/g600/")
    print(" ^ this part")
    degree = raw_input("Degree number: ")
    print("Thank you. Please wait this could take some time....")

    modules = fetch_page("http://www.ncl.ac.uk/undergraduate/degrees/%s/modules/" % degree)
    page = modules.find("div", { "id" : "contentArea" })
    if page is None:
        print("> No content area found for degree '%s'. Nothing saved." % degree)
        return
    print("> Basic page found for degree")

    # Remove page furniture; each piece is optional, so only extract what exists.
    menu = page.find("ul", { "class": "contentMenu" })
    if menu is not None:
        menu.extract()
    for script in page.findAll("script"):
        script.extract()
    skip_link = page.find("a", {"class" : "skip"})
    if skip_link is not None:
        skip_link.extract()

    for module in page.findAll("a"):
        href = module.get("href")
        if not href:
            # Nothing to follow (e.g. a named anchor).
            continue

        # .string is None when the link wraps nested markup; fall back to
        # the concatenated text nodes so the title is never None.
        label = module.string
        if label is None:
            label = u"".join(module.findAll(text=True))
        print("> Fetching '%s' details..." % label)

        # Resolve against the site root: absolute links are kept as they
        # are, and root-relative links do not gain a doubled slash.
        target = urljoin(site_root, href)
        modpage = fetch_page(target)

        moddetail = Tag(modules, "div")
        title = Tag(modules, "h3")
        title.insert(0, NavigableString(label))
        moddetail.append(title)

        # With BeautifulSoup 3 a text= search returns the matching string
        # node, so .parent is the heading tag and .parent.parent is the
        # element containing that heading.  Each section is looked up only
        # after the previous one has been moved, preserving lookup order.
        sections_found = 0
        for heading_text in ("Aims", "Outline Of Syllabus"):
            heading = modpage.find("h4", text=heading_text)
            if heading is None:
                continue
            moddetail.append(heading.parent.parent)
            sections_found += 1

        if not sections_found:
            print("> No module details found at '%s'. Link left as it is." % target)
            continue
        module.replaceWith(moddetail)

    save_out(page.prettify(), "ncl-%s.html" % degree)
|
|
|
def brimingham():
    """Scrape a Birmingham School of Computer Science course page.

    Prompts for the course page URL, finds the "Course Modules" text on it
    and walks the elements that follow its parent.  Inside every <ul>, each
    link is followed and replaced by a <div> holding the link text as an
    <h2>, the "DetailedSyllabus" section and the "Aims" section of the
    linked page.  All walked elements, then the heading element itself, are
    collected into a fresh document that is passed to save_out().

    Links without an href, and links whose target has neither section, are
    left untouched.  Network errors from fetch_page() propagate.
    """
    # Python 2 standard library; imported locally as only the scrapers use it.
    from urlparse import urljoin

    site_root = "http://www.cs.bham.ac.uk/"

    print("Birmingham University")
    print("School of Computer Science")
    print("You need the URL Like so: http://www.cs.bham.ac.uk/admissions/undergraduate/se.php")
    url = raw_input("URL: ")
    print("Thank you. This may take some time...")

    course = fetch_page(url)
    page = BeautifulSoup()
    h = course.find(text="Course Modules")
    if h is None:
        print("> No 'Course Modules' heading found. Nothing saved.")
        return
    print("> Basic page found...")

    heading = h.parent
    for el in heading.findNextSiblings():
        if el.name == "ul":
            # List of modules
            for module in el.findAll("a"):
                href = module.get("href")
                if not href:
                    # Nothing to follow (e.g. a named anchor).
                    continue

                # .string is None when the link wraps nested markup.  The
                # label is printed as unicode: str() would raise on
                # non-ASCII module titles.
                label = module.string
                if label is None:
                    label = u"".join(module.findAll(text=True))
                print("> Loading module '%s' " % label)

                # Resolve against the site root: absolute links are kept
                # as they are, relative ones are joined without a doubled
                # slash.
                target = urljoin(site_root, href)
                modpage = fetch_page(target)

                moddetail = Tag(page, "div")
                title = Tag(page, "h2")
                title.insert(0, NavigableString(label))
                moddetail.append(title)

                syl_anchor = modpage.find("a", {"name":"DetailedSyllabus"})
                if syl_anchor is not None:
                    syl = syl_anchor.parent
                    # Read the following node before syl is moved out of
                    # the module page.
                    sylco = syl.nextSibling
                    moddetail.append(syl)
                    if sylco is not None:
                        moddetail.append(sylco)

                aims_anchor = modpage.find("a", {"name":"Aims"})
                if aims_anchor is not None:
                    aims = aims_anchor.parent
                    # Remember where the aims heading belongs: directly
                    # before the elements collected below.
                    aims_slot = len(moddetail.contents)
                    for aim in aims.findNextSiblings():
                        if aim.name == "h2":
                            # The next <h2> starts a different section.
                            break
                        moddetail.append(aim)
                    # NOTE(review): a shallow copy of the heading is
                    # inserted rather than the heading itself; the reason
                    # is not evident here -- confirm before simplifying.
                    moddetail.insert(aims_slot, copy.copy(aims))

                if len(moddetail.contents) == 1:
                    # Only the title was built: not a module page.
                    print("> No module details found at '%s'. Link left as it is." % target)
                    continue
                module.replaceWith(moddetail)

        # Every element after the heading is kept, in document order.
        page.append(el)

    # NOTE(review): the heading is added after everything collected above,
    # so it ends up last in the output -- confirm whether it should be first.
    page.append(heading)
    save_out(page.prettify(), "bham-modules.html")
|
|
|
# Registry: short name typed at the prompt -> scraper function to run.
unis = {"ncl": newcastle, "bham": brimingham}

# Two items joined by a single space, exactly as a comma-separated print
# statement would write them.
print(" ".join(["Universities available: ", str(unis.keys())]))

uni = raw_input("University: ")
if uni not in unis:
    print("Not found. Exiting")
else:
    # Run the scraper registered under the chosen name.
    unis[uni]()