Skip to content

Instantly share code, notes, and snippets.

@cjauvin
Last active December 12, 2015 07:28
Show Gist options
  • Save cjauvin/4736441 to your computer and use it in GitHub Desktop.
Save cjauvin/4736441 to your computer and use it in GitHub Desktop.
import cookielib
import os
import re
import subprocess
import sys
import time

import requests
# Upper bound on how many applicants are processed in one run.
n = 10
# Site root; the relative links scraped from the pages are appended to it.
nimbus_url = 'https://nimbus.mcgill.ca'

# Load the cookies saved in cookies.txt, including ones that have already
# expired (ignore_expires=True).
cj = cookielib.MozillaCookieJar()
cj.load('cookies.txt', ignore_expires=True)

# Move every cookie's expiry 14 days into the future -- presumably so that
# none of them is treated as expired when attached to the requests.
for jar_cookie in cj:
    jar_cookie.expires = time.time() + 14 * 24 * 3600
# Create the directories the merged PDFs are saved under.  os.makedirs
# creates missing parents like `mkdir -p` but needs no shell, so the space in
# "Public Health" needs no escaping, and a real failure raises OSError instead
# of being silently ignored (os.system never raised, so the old bare
# `except: pass` could not catch anything).
# NOTE(review): no "Public Health/PhD" directory is created -- confirm that
# combination cannot occur.
for out_dir in (
    'results/Epidemiology/MSc',
    'results/Epidemiology/PhD',
    'results/Biostatistics/MSc',
    'results/Biostatistics/PhD',
    'results/Public Health/MSc',
):
    # Guard so re-runs do not fail on directories that already exist
    # (os.makedirs has no exist_ok option on Python 2).
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
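# One-off step, kept for reference: the commented-out code below posts an
# `onPaging` request per page index and writes each raw response to
# master.txt, the file this script reads next.  Uncomment it to regenerate
# master.txt.
# master = open('master.txt', 'w')
# for i in range(1):
#     print i
#     payload = {'dtid': 'z_42m', 'cmd_0': 'onPaging', 'uuid_0': 'gAAPr1', 'data_0': '{"": %d}' % i}
#     r = requests.post(nimbus_url + '/gapR0/zkau', cookies=cj, data=payload)
#     #print r.headers['content-type']
#     master.write(r.content + '\n<master_page>\n')
# master.close()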
# Read the saved applicant listing in one go; the `with` block closes the
# file handle as soon as the text is in memory.
with open('master.txt') as master_file:
    master = master_file.read()

# Each applicant entry follows an `href:` marker.  The text before the first
# marker belongs to no applicant, hence the [1:].
app_blocks = master.split('href:')[1:]
print('found %d applicants' % len(app_blocks))
# Scratch directory holding the individual documents of the applicant that is
# currently being processed; created on demand so a first run does not fail.
tmp_dir = 'pdf_merge_tmp'
if not os.path.isdir(tmp_dir):
    os.makedirs(tmp_dir)

# For each of the first `n` applicants: parse the listing entry, download all
# of the applicant's documents, merge them into a single PDF with pdftk and
# file it under results/<major>/<program>/.
for app_block in app_blocks[:n]:
    # An entry begins right after an `href:` marker, i.e. with the quoted
    # relative URL of the applicant's page.
    app_url = re.match(r"'(.*?)'", app_block).group(1)
    mcgill_id = re.search(r'mcGillId=(\d+)', app_url).group(1)

    # All `{...}` / `{value:'...'}` cells of the entry, in order; the fields
    # used below are picked by position.
    vals = re.findall(r"{(value:)?'?(.*?)'?}", app_block)
    lastname = vals[0][1]
    application_no = vals[3][1]
    program = vals[6][1][:3]
    assert program in ['MSc', 'PhD'], program
    # The major is whatever precedes the " -T" / " -N" suffix.  It is kept
    # unescaped because no shell is involved any more.
    major = re.match(r'(.*) -[TN]', vals[7][1]).group(1)
    assert major in ['Epidemiology', 'Biostatistics', 'Public Health'], major

    r = requests.get(nimbus_url + app_url, cookies=cj)
    app_doc_urls = re.findall(
        r"href:'(/gapR0/bannerApplication/streamDocumentum/[?]documentId=\w+?)'",
        r.content)
    sys.stdout.write('downloading %d docs for %s' % (len(app_doc_urls), mcgill_id))
    sys.stdout.flush()

    # Remember the files in download order.  Passing this list to pdftk keeps
    # the documents in order even with 10+ of them (a `*.pdf` shell glob
    # sorts "10.pdf" before "2.pdf") and ignores stale files from older runs.
    doc_paths = []
    for j, app_doc_url in enumerate(app_doc_urls):
        r = requests.get(nimbus_url + app_doc_url, cookies=cj)
        sys.stdout.write('.')
        sys.stdout.flush()
        doc_path = os.path.join(tmp_dir, '%d.pdf' % j)
        with open(doc_path, 'wb') as doc_file:
            doc_file.write(r.content)
        doc_paths.append(doc_path)

    if not doc_paths:
        # Nothing to merge; pdftk would only fail on an empty input list.
        print(' -- no documents found, skipping')
        continue

    fn = 'results/%s/%s/%s_%s_%s.pdf' % (major, program, lastname, mcgill_id, application_no)
    print('merging and saving %s' % fn)
    # Argument list instead of a shell string: last names containing spaces,
    # apostrophes or other shell metacharacters can no longer break (or
    # inject into) the command.
    status = subprocess.call(['pdftk'] + doc_paths + ['cat', 'output', fn])
    if status != 0:
        sys.stderr.write('pdftk exited with status %d for %s\n' % (status, fn))

    # Clear the scratch files so the next applicant starts clean.
    for doc_path in doc_paths:
        os.remove(doc_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment