nyu.OpenCourseSearch.org Scraper
import cookielib, urllib2, urllib, BeautifulSoup, re, time, pickle
#from courses.helpers import *
"""
This class scrapes NYU's course registration listings on Albert,
and writes the pickled data out to a specified file directory.
Shouldn't require a working NYU Net ID, but adding one is worth a shot
if something seems broken.
A command-line version is easy to whip up; a minimal sketch is at the bottom of this file.
>>> from scraper import Scraper
>>> s = Scraper('/opt/storage/')
>>> s.run()
>>> s = Scraper('/opt/storage/', '$username', '$password')
>>> s.run()
# (optional) scrape only a slice of the subjects
>>> s = Scraper('/opt/storage/', '$username', '$password', 0, 10)
>>> s.run()
Data is pickled per course category as follows:
classes = {
'$course_id': {
'class_name': 'Animal Minds',
'classification': 'ANST-UA',
'college': 'College of Arts and Science',
'component': 'Lecture',
'course_name': 'Topics in AS',
'description': 'This course analyzes the ways that...',
'grading': 'CAS Graded',
'is_open': 'Open',
'level': 'Undergraduate',
'loc_code': 'WS',
'meet_data': '09/06/2011 - 12/23/2011 Mon,Wed 11.00 AM - 12.15 PM with Sebo, Jeffrey',
'notes': 'Open only to ANST minors during the first...',
'number': '600',
'section': '001',
'session': '09/06/2011 - 12/16/2011',
},
...
}
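To load a dump back in (a minimal sketch; the per-category filenames are
'<section link id>.txt' under the storage dir):
>>> import pickle
>>> f = open('/opt/storage/college-classifications.txt')
>>> data = pickle.load(f)
>>> f.close()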
"""
class Scraper:
def __init__(self, storage_dir, username=None, password=None, start=0, end=None):
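        # storage_dir: directory the pickled dumps get written to
        # username/password: optional NYU Net ID credentials
        # start/end: scrape sections[start:start + end] -- end is a count, not an absolute index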
self.ICSID = ''
self.sections = None
self.section_data = {}
self.classes = {}
self.hold = None
self.DUMP_DIR = storage_dir
self.username = username
self.password = password
self.start_index = start
if end:
self.end_index = start + end
else:
self.end_index = None
self.login_url = 'https://admin.portal.nyu.edu/psp/paprod/EMPLOYEE/EMPL/?cmd=login'
self.scrape_url = 'https://sis.nyu.edu/psc/csprod/EMPLOYEE/CSSS/c/NYU_SR.NYU_CLS_SRCH.GBL'
self.last_section = ''
self.longs = []
self.cj = cookielib.CookieJar()
self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cj))
def _make_connection(self):
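        # Log in if credentials were given, pull the ICSID browsing token
        # off the search page, then load the full list of course sections.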
try:
if self.username and self.password:
self.log_in()
print "touching home..."
self.access_home()
except:
print "Server is not responding. Try quitting and running again..."
print "retrieving course sections..."
self.get_all_sections()
def run(self, forever=False):
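        # Scrape sections[start:end]; with forever=True, loop endlessly so
        # the dumps stay fresh.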
self._make_connection()
if self.end_index:
end = self.end_index
else:
end = len(self.sections)
while True:
for i in range(self.start_index, end):
print "Section %s." % i
self.process_section(i)
if not forever:
return
def log_in(self):
params = {
'timezoneOffset': '240',
'httpPort': '',
'userid': self.username,
'pwd': self.password,
'Submit': 'Sign In'
}
data = urllib.urlencode(params)
# open the log in page
r = self.opener.open(self.login_url)
# log in
r = self.opener.open(self.login_url, data)
page = r.read()
print "logged in."
def access_home(self):
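        # Albert runs on PeopleSoft: every POST must echo back the ICSID
        # session token, so grab it off the search page's hidden input first.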
r = self.opener.open(self.scrape_url)
page = r.read()
soup = BeautifulSoup.BeautifulSoup(page)
self.ICSID = soup.find('input', {'type': 'hidden', 'name': 'ICSID'})['value']
print "Browsing key found: %s" % self.ICSID
def get_home(self):
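        # POST what appears to be the "back to browse" action to land on the
        # A-Z subject listing for the current (fall) term.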
params = {
'ICAction': 'NYU_CLS_DERIVED_BACK',
'ICChanged': '-1',
'ICElementNum': '0',
'ICFocus': '',
'ICResubmit': '0',
'ICSID': self.ICSID,
'ICSaveWarningFilter': '0',
'ICStateNum': '1',
'ICType': 'Panel',
'ICXPos': '0',
'ICYPos': '0',
'NYU_CLS_DERIVED_DESCR100': '',
'NYU_CLS_DERIVED_DESCR100_JOB_POST1': '',
'NYU_CLS_WRK_NYU_FALL': 'Y',
'NYU_CLS_WRK_NYU_FALL$chk': 'Y',
}
data = urllib.urlencode(params)
r = self.opener.open(self.scrape_url, data)
page = r.read()
return BeautifulSoup.BeautifulSoup(page)
def get_all_sections(self):
"""
        Gathers the list of all colleges and their corresponding class sections.
        Writes the full list out to 'college-classifications.txt':
data = [
["College of Arts and Science", "Biology", "BIOL-UA"],
...
]
"""
soup = self.get_home()
divs = soup.findAll('a', {'class': 'SSSAZLINK'})
ret = []
for i in divs:
ret.append([i['id'], i.contents[0]])
self.sections = ret[1:]
print "%s sections loaded" % (len(self.sections))
# get colleges, classifications
data = []
ret = []
colleges = soup.findAll('td', {'class': 'SSSGROUPBOXLEFTLABEL'})
for c in colleges:
college = c.contents[1]
college = college.replace(' ', '')
table = c.findNext('table', {'class': 'SSSGROUPBOXLEFT'})
classifications = table.findAll('a', {'class': 'SSSAZLINK'})
for cls in classifications:
name = " ".join(["%s" % i for i in cls.contents])
name = name.replace('<br />', '')
name = name.replace('&amp;', '&')
try:
course_name, course_code = re.search('(.*)\s\((.+)\)', name).groups()
data.append([college, course_name, course_code])
                except AttributeError:
                    # didn't match the "Name (CODE)" pattern; log it and move on
                    print name
f = open('%scollege-classifications.txt' % (self.DUMP_DIR), 'w')
pickle.dump(data, f)
f.close()
def _confirm_long_listing(self, ICStateNum):
# basically, press "yes" to confirm showing more than 100 results
params = {
'ICAction': "#ICYes",
'ICChanged': '-1',
'ICElementNum': '0',
'ICFocus': '',
'ICResubmit': '0',
'ICSID': self.ICSID,
'ICSaveWarningFilter': '0',
'ICStateNum': ICStateNum,
'ICType': 'Panel',
'ICXPos': '0',
'ICYPos': '0',
'NYU_CLS_DERIVED_DESCR100': '',
'NYU_CLS_DERIVED_DESCR100_JOB_POST1': '',
'NYU_CLS_WRK_NYU_FALL': 'Y',
'NYU_CLS_WRK_NYU_FALL$chk': 'Y',
}
data = urllib.urlencode(params)
r = self.opener.open(self.scrape_url, data)
page = r.read()
return BeautifulSoup.BeautifulSoup(page)
def get_section_listing(self, id, current_only='Y'):
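        # "Click" one subject link (by its element id) to pull up that
        # subject's class listing, answering the >100-results warning if
        # the server throws one.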
params = {
'ICAction': id,
'ICChanged': '-1',
'ICElementNum': '0',
'ICFocus': '',
'ICResubmit': '0',
'ICSID': self.ICSID,
'ICSaveWarningFilter': '0',
'ICStateNum': '1',
'ICType': 'Panel',
'ICXPos': '0',
'ICYPos': '0',
'NYU_CLS_WRK_NYU_FALL': current_only,
'NYU_CLS_WRK_NYU_FALL$chk': current_only,
}
data = urllib.urlencode(params)
r = self.opener.open(self.scrape_url, data)
page = r.read()
soup = BeautifulSoup.BeautifulSoup(page)
ICStateNum = 1 # default
        if "This search will return more than 100 results and may take a while to process." in page:
print "confirming long listing..."
#time.sleep(4)
self.longs.append(id)
ICStateNum = int(soup.find('input', {'type': 'hidden', 'name': 'ICStateNum'})['value'])
print 'ICStateNum found: %s' % ICStateNum
soup = self._confirm_long_listing(ICStateNum)
print "long listing retrieved."
        if id not in self.section_data:
self.section_data[id] = {}
self._list_courses_in_section(id, soup, current_only)
print "%s classes gathered from %s" % (len(self.section_data[id].keys()), id)
self.last_section = id
return ICStateNum
def _list_courses_in_section(self, id, soup, current_only):
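        # Parse each course span on the listing page: the bold header gives
        # classification/number/name, and the blue metadata line gives
        # college | units | level.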
ret = {}
classes = soup.findAll('span',
{'style': "background-color: white; font-family: arial; color: black; font-size: 16px; font-weight: normal"})
for c in classes:
try:
bits = [i.strip() for i in re.split(' (\d+)', c.find('b').contents[0])]
classification = bits[0]
number = bits[1]
name = " ".join(bits[2:])
except:
classification, number, name = None, None, None
description, meta, college, units, level = '', '', '', '', ''
try:
description = c.find('div', {'class': 'courseDescription'}).find('p').contents[0]
except: pass
try:
meta = c.findNext('span', {'class': 'SSSTEXTBLUE'}).contents[0].split('|')
except: pass
try:
college = meta[0].strip()
except: pass
try:
units = meta[1].split(':')[1].strip()
except: pass
try:
level = meta[2].split(':')[1].strip()
except: pass
try:
index = c.findNext('td', {'class': 'SSSGROUPBOXRIGHTLABEL'}).findNext('a')['name'].split('$')[1]
except:
# when there is a "more description" link
index = None
self.section_data[id]["%s-%s" % (classification, number)] = {
'classification': classification,
'name': name,
'description': description,
'number': number,
'college': college,
'units': units,
'level': level,
'index': index}
def process_section(self, id):
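        # Scrape detail for every course in sections[id], then pickle the
        # batch out to DUMP_DIR/<section id>.txt and reset self.classes.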
try:
id = self.sections[id][0]
except:
return
hold = len(self.classes.keys())
ICStateNum = self.get_section_listing(id) + 1
        # section_data[id] is a dict, so its keys are already unique
        keys = sorted(self.section_data[id].keys())
        total = len(keys)
        for i, key in enumerate(keys):
ind = self.section_data[id][key]['index']
if ind:
print " grabbing course %s of %s..." % (i + 1, total)
self.get_detail_for_course_in_section(id, ind, ICStateNum)
print "%s classes processed" % (len(self.classes.keys()) - hold)
print "%s total classes scraped" % len(self.classes.keys())
f = open('%s%s.txt' % (self.DUMP_DIR, id), 'w')
pickle.dump(self.classes, f)
f.close()
self.classes = {}
# django helper method, loads to db
#load_listing(id)
def get_detail_for_course_in_section(self, id, ind, ICStateNum):
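        # Expand one course row (via its NYU_CLS_DERIVED_TERM$<ind> link) to
        # load the per-section detail: class number, meeting times, notes.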
params = {
'ICAction': 'NYU_CLS_DERIVED_TERM$' + ind,
'ICChanged': '-1',
'ICElementNum': '0',
'ICFocus': '',
'ICResubmit': '0',
'ICSID': self.ICSID,
'ICSaveWarningFilter': '0',
'ICStateNum': "%s" % ICStateNum,
'ICType': 'Panel',
'ICXPos': '0',
'ICYPos': '1332',
'NYU_CLS_DERIVED_DESCR100': '',
'NYU_CLS_DERIVED_DESCR100_JOB_POST1': '',
'NYU_CLS_WRK_NYU_FALL$chk': 'N',
}
data = urllib.urlencode(params)
r = self.opener.open(self.scrape_url, data)
page = r.read()
soup = BeautifulSoup.BeautifulSoup(page)
        try:
            self.summarize_detail(id, soup)
        except Exception:
            print "failed to parse detail for course index %s in %s" % (ind, id)
def summarize_detail(self, id, soup):
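        # Each PSGROUPBOX table on the detail page is one class section;
        # regex the white data cell apart into the fields shown in the
        # module docstring.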
tables = soup.find('table', {'class': 'PSLEVEL3SCROLLAREABODY'}).findAll('table', {'class': 'PSGROUPBOX', 'width': '531'})
for table in tables:
            notes = ''
            # default the regexed-out fields so one missing label can't raise
            # a NameError (or silently reuse a value from the previous table)
            classnum = session = section = is_open = grading = loccode = component = meet_data = ''
datacell = table.find('td', {'style': 'background-color: white; font-family: arial; font-size: 12px;'})
self.datacell = datacell
if ("%s" % datacell).find("<b>Topic: </b>") > 0:
# Topics class. Special case.
classification, number = re.split('\s+',
re.search('([-\w]+\s+\d+)', "%s" % datacell.find('p')).groups()[0])
class_name = re.search('<b>Topic: </b>(.*)<p>', "%s" % datacell).groups()[0].strip()
else:
# Normal case.
bits = re.split('\s+', re.search('([-\w\s.]+\s+\d+)', "%s" % datacell).groups()[0])
number = bits[-1]
classification = " ".join(bits[:-1])
class_name = ''
try:
units = re.search('([\w]+( - )?[\w]+ units)', "%s" % datacell).groups()[0]
            except AttributeError:
                try:
                    # leftover units from a previous table of the same class; reuse them
                    units
                except NameError:
                    # nothing left over; fall back to the listing-page value
                    units = self.section_data[id]["%s-%s" % (classification, number)]['units']
for i in datacell.findAll('span'):
if i.contents:
if i.contents[0].__unicode__() == "Class#:":
classnum = i.nextSibling.strip(' |')
if i.contents[0].__unicode__() == "Session:":
session = i.nextSibling.strip(' |')
if i.contents[0].__unicode__() == "Section:":
section = i.nextSibling.strip(' |\r\n')
if i.contents[0].__unicode__() == "Class Status:":
is_open = i.nextSibling.nextSibling.contents[0]
elif i.contents[0].__unicode__() == "Grading:":
grading = i.nextSibling.strip()
elif i.contents[0].__unicode__() == "<b>Course Location Code: </b>":
loccode = i.nextSibling.strip(' |')
component = i.nextSibling.nextSibling.nextSibling.strip()
elif i.contents[0].__unicode__() == "<b>Notes: </b>":
notes = i.nextSibling.strip()
            try:
                meet_data = re.search('(\d{2}/\d{2}/\d{4}[^<]*(at[^<]*)?(with[^<]*)?)\r', "%s" % datacell).groups()[0].strip(' \r\n')
            except AttributeError:
                # no meeting-data line matched; stash the cell for debugging
                self.save = datacell
self.classes[classnum] = {
'classification': "%s" % classification,
'number': "%s" % number,
'is_open': "%s" % is_open,
'session': "%s" % session,
'section': "%s" % section,
'grading': "%s" % grading,
'loc_code': "%s" % loccode,
'component': "%s" % component,
'meet_data': "%s" % meet_data,
'notes': "%s" % notes,
'level': "%s" % self.section_data[id]["%s-%s" % (classification, number)]['level'],
'course_name': "%s" % self.section_data[id]["%s-%s" % (classification, number)]['name'],
'college': "%s" % self.section_data[id]["%s-%s" % (classification, number)]['college'],
'description': "%s" % self.section_data[id]["%s-%s" % (classification, number)]['description'],
'class_name': "%s" % class_name,
'units': "%s" % units,
}
return True
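
# A minimal command-line sketch, as offered in the docstring (hypothetical,
# not part of the scraper itself): storage dir is required, Net ID
# credentials optional.
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 2:
        print "usage: python scraper.py STORAGE_DIR [USERNAME PASSWORD]"
        sys.exit(1)
    s = Scraper(*sys.argv[1:4])
    s.run()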