kennydude/README.markdown

## README.markdown

      
    Raw
  

              README.markdown
            
          
    Usage:
Run `python uni.py` on the command line and it will ask which university to scrape. Then follow the on-screen instructions.
Supported:

Newcastle University (ncl). Should work for all courses, only tested on Computing
Birmingham University Computing School. Only this department because of the way it's organized

The script outputs plain, unstyled HTML containing each module's Aims and Syllabus, so you do not have to collect them by hand.
Requirements

Python 2.7
BeautifulSoup 3
urllib2 (included in the standard Python 2 distribution)


## uni.py
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
import urllib2, copy

print "University Module Scraper"
print "------"
print "@kennydude 2012"
print "All scripts correct as of date written"
print "------"

def fetch_page(page):
	"""Download a URL and parse the response with BeautifulSoup.

	page -- URL string, passed straight to urllib2.urlopen().

	Returns the BeautifulSoup document built from the response body.
	Errors raised by urllib2.urlopen() propagate to the caller.
	"""
	# Imported here so the function does not rely on the file header.
	from contextlib import closing

	# The response object is not a context manager in Python 2, so wrap it
	# in closing() to make sure the connection is released even when the
	# parse fails. The document is built before the block exits.
	with closing(urllib2.urlopen(page)) as response:
		return BeautifulSoup(response)

def save_out(contents, default=""):
	print "Saving. Press enter to use '%s'" % default
	filename = raw_input("Filename: ")
	if filename == "":
		filename = default
	f = open(filename, "w")
	f.write(contents)
	f.close()
	print "Saved to %s" % filename

def newcastle():
	print "Newcastle Plugin"
	print "Degree number is found like so:"
	print "http://www.ncl.ac.uk/undergraduate/degrees/g600/"
	print "                                           ^ this part"
	degree = raw_input("Degree number: ")
	print "Thank you. Please wait this could take some time...."

	modules = fetch_page("http://www.ncl.ac.uk/undergraduate/degrees/%s/modules/" % degree)
	page = modules.find("div", { "id" : "contentArea" })
	print "> Basic page found for degree"

	# Remove shit
	page.find("ul", { "class": "contentMenu" }).extract()
	for script in page.findAll("script"): script.extract()
	page.find("a", {"class" : "skip"}).extract()

	for module in page.findAll("a"):
		print "> Fetching '%s' details..." % module.string
		url = module['href']
		if "ncl.ac.uk" not in url:
			url = "http://www.ncl.ac.uk/%s" % url
		modpage = fetch_page(url)
		moddetail = Tag(modules, "div")

		title = Tag(modules, "h3")
		title.insert(0, NavigableString(module.string))
		moddetail.insert(0, title)

		aims = modpage.find("h4", text="Aims")
		moddetail.insert(1, aims.parent.parent)

		syl = modpage.find("h4", text="Outline Of Syllabus")
		moddetail.insert(2, syl.parent.parent)
		module.replaceWith(moddetail)

	save_out(page.prettify(), "ncl-%s.html" % degree)

def brimingham():
	print "Birmingham University"
	print "School of Computer Science"
	print "You need the URL Like so: http://www.cs.bham.ac.uk/admissions/undergraduate/se.php"
	url = raw_input("URL: ")
	print "Thank you. This may take some time..."

	course = fetch_page(url)
	page = BeautifulSoup()
	h = course.find(text="Course Modules")
	i = 0
	print "> Basic page found..."

	for el in h.parent.findNextSiblings():
		i+=1
		if el.name == "ul":
			# List of modules
			for module in el.findAll("a"):
				print "> Loading module '%s' " % str(module.string)
				url = module['href']
				if 'http' not in url:
					url = "http://www.cs.bham.ac.uk/%s" % url
				modpage = fetch_page(url)
				moddetail = Tag(page, "div")

				title = Tag(page, "h2")
				title.insert(0, NavigableString(module.string))
				moddetail.insert(0, title)

				syl = modpage.find("a", {"name":"DetailedSyllabus"}).parent
				sylco = syl.nextSibling
				moddetail.insert(1, syl)
				moddetail.insert(2, sylco)

				aims = modpage.find("a", {"name":"Aims"}).parent
				k = 4
				for aim in aims.findNextSiblings():
					if aim.name == "h2":
						break
					moddetail.insert(k, aim)
					k+=1
				moddetail.insert(3, copy.copy(aims))
				module.replaceWith(moddetail)

		page.insert(i, el)
	page.insert(i, h.parent)
	save_out(page.prettify(), "bham-modules.html")

unis = {
	"ncl" : newcastle,
	"bham" : brimingham
}

print "Universities available: ", unis.keys()

uni = raw_input("University: ")
if uni in unis:
	unis[uni]()
else:
	print "Not found. Exiting"
	from BeautifulSoup import BeautifulSoup, Tag, NavigableString
	import urllib2, copy

	print "University Module Scraper"
	print "------"
	print "@kennydude 2012"
	print "All scripts correct as of date written"
	print "------"

	def fetch_page(page):
		"""Download a URL and parse the response with BeautifulSoup.

		page -- URL string, passed straight to urllib2.urlopen().

		Returns the BeautifulSoup document built from the response body.
		Errors raised by urllib2.urlopen() propagate to the caller.
		"""
		# Imported here so the function does not rely on the file header.
		from contextlib import closing

		# The response object is not a context manager in Python 2, so wrap
		# it in closing() to make sure the connection is released even when
		# the parse fails. The document is built before the block exits.
		with closing(urllib2.urlopen(page)) as response:
			return BeautifulSoup(response)

	def save_out(contents, default=""):
	print "Saving. Press enter to use '%s'" % default
	filename = raw_input("Filename: ")
	if filename == "":
	filename = default
	f = open(filename, "w")
	f.write(contents)
	f.close()
	print "Saved to %s" % filename

	def newcastle():
	print "Newcastle Plugin"
	print "Degree number is found like so:"
	print "http://www.ncl.ac.uk/undergraduate/degrees/g600/"
	print " ^ this part"
	degree = raw_input("Degree number: ")
	print "Thank you. Please wait this could take some time...."

	modules = fetch_page("http://www.ncl.ac.uk/undergraduate/degrees/%s/modules/" % degree)
	page = modules.find("div", { "id" : "contentArea" })
	print "> Basic page found for degree"

	# Remove shit
	page.find("ul", { "class": "contentMenu" }).extract()
	for script in page.findAll("script"): script.extract()
	page.find("a", {"class" : "skip"}).extract()

	for module in page.findAll("a"):
	print "> Fetching '%s' details..." % module.string
	url = module['href']
	if "ncl.ac.uk" not in url:
	url = "http://www.ncl.ac.uk/%s" % url
	modpage = fetch_page(url)
	moddetail = Tag(modules, "div")

	title = Tag(modules, "h3")
	title.insert(0, NavigableString(module.string))
	moddetail.insert(0, title)

	aims = modpage.find("h4", text="Aims")
	moddetail.insert(1, aims.parent.parent)

	syl = modpage.find("h4", text="Outline Of Syllabus")
	moddetail.insert(2, syl.parent.parent)
	module.replaceWith(moddetail)

	save_out(page.prettify(), "ncl-%s.html" % degree)

	def brimingham():
	print "Birmingham University"
	print "School of Computer Science"
	print "You need the URL Like so: http://www.cs.bham.ac.uk/admissions/undergraduate/se.php"
	url = raw_input("URL: ")
	print "Thank you. This may take some time..."

	course = fetch_page(url)
	page = BeautifulSoup()
	h = course.find(text="Course Modules")
	i = 0
	print "> Basic page found..."

	for el in h.parent.findNextSiblings():
	i+=1
	if el.name == "ul":
	# List of modules
	for module in el.findAll("a"):
	print "> Loading module '%s' " % str(module.string)
	url = module['href']
	if 'http' not in url:
	url = "http://www.cs.bham.ac.uk/%s" % url
	modpage = fetch_page(url)
	moddetail = Tag(page, "div")

	title = Tag(page, "h2")
	title.insert(0, NavigableString(module.string))
	moddetail.insert(0, title)

	syl = modpage.find("a", {"name":"DetailedSyllabus"}).parent
	sylco = syl.nextSibling
	moddetail.insert(1, syl)
	moddetail.insert(2, sylco)

	aims = modpage.find("a", {"name":"Aims"}).parent
	k = 4
	for aim in aims.findNextSiblings():
	if aim.name == "h2":
	break
	moddetail.insert(k, aim)
	k+=1
	moddetail.insert(3, copy.copy(aims))
	module.replaceWith(moddetail)

	page.insert(i, el)
	page.insert(i, h.parent)
	save_out(page.prettify(), "bham-modules.html")

	unis = {
	"ncl" : newcastle,
	"bham" : brimingham
	}

	print "Universities available: ", unis.keys()

	uni = raw_input("University: ")
	if uni in unis:
	unis[uni]()
	else:
	print "Not found. Exiting"