Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@nhanb
Created April 2, 2014 07:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nhanb/9929279 to your computer and use it in GitHub Desktop.
Scrape RMIT courses
from bs4 import BeautifulSoup as bs
import re
import json
import requests
# the "Cookie" header when requesting start_page
start_page_cookie = 'rmit_logout=1....'
# "Cookie" header when requesting sth like rmit.edu.au/courses/039986
detail_page_cookie = 'BIGipServer~Misc~webpub.rmit.....'
# Where you wanna store the result
full_file = '/tmp/full.json'
start_page = 'https://online.rmit.edu.vn/enrolmentcopy'
full_data = {}
def get(url, cookie):
    """Fetch *url* with the given raw Cookie header; return the response body as bytes."""
    # timeout so one stalled server can't hang the entire scrape indefinitely
    r = requests.get(url, headers={'Cookie': cookie}, timeout=30)
    return r.content
# Get course codes and urls to basic info pages.
# Explicit 'html.parser' keeps parsing deterministic across environments
# (bs4 otherwise picks whichever parser happens to be installed).
start_html = get(start_page, start_page_cookie)
soup = bs(start_html, 'html.parser')
# The first table on the page is not a course listing; skip it.
tables = soup.find_all('table')[1:]
for table in tables:
    for tag in table.find_all('a', href=True):
        # Link text is the course code; href points at the basic-info page.
        full_data[tag.text] = {'basicUrl': tag['href']}
# Matches offerings like "Viet2 2014": group 1 = whole string,
# group 2 = semester digit, group 3 = four-digit year.
viet_regex = re.compile(r'(Viet(\d) (\d{4}))')

for course_code, data in full_data.items():
    basicUrl = data['basicUrl']
    print(basicUrl)

    # Request basic info page and collect every Vietnam-campus offering
    # mentioned in its first table.
    html = get(basicUrl, detail_page_cookie)
    courses = bs(html, 'html.parser').find('table')
    viet_courses = viet_regex.findall(str(courses))
    if not viet_courses:
        # No "VietN YYYY" offering listed; skip instead of crashing on [0].
        print('No Viet offering found, skipping')
        continue

    # Pick the most recent offering by comparing (year, semester).
    # Both are fixed-width digit strings, so lexicographic order is correct.
    latest = max((match[2], match[1]) for match in viet_courses)
    latest_course_str = 'Viet%s %s' % (latest[1], latest[0])
    print(latest_course_str)

    detailUrl = courses.find('a', href=True, text=latest_course_str)['href']
    # The href is percent-encoded ("%2F" == "/"); rebuild an absolute URL.
    pieces = ['http://www.rmit.edu.au'] + detailUrl.split('%2F')[1:]
    detailUrl = '/'.join(pieces)
    full_data[course_code]['detailUrl'] = detailUrl

    # Request detailed info page and keep its main content block as HTML.
    html = get(full_data[course_code]['detailUrl'], detail_page_cookie)
    print(full_data[course_code]['detailUrl'])
    soup = bs(html, 'html.parser')
    full_data[course_code]['description'] = str(soup.find('div', role='main'))
# Persist everything scraped as one JSON document.
with open(full_file, 'w') as f:
    json.dump(full_data, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment