Created April 2, 2014 07:10
Scrape RMIT courses
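The script below requests RMIT's enrolment listing, collects a course code and basic-info URL for every course it finds, picks the most recent Vietnam offering ("VietN YYYY") on each basic-info page, then fetches that offering's detail page and stores the URLs plus the description HTML in a JSON file. The two cookie values are redacted placeholders; copy the real "Cookie" headers from an authenticated browser session before running.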
from bs4 import BeautifulSoup as bs
import re
import json
import requests

# The "Cookie" header to send when requesting start_page
start_page_cookie = 'rmit_logout=1....'
# The "Cookie" header to send when requesting something like rmit.edu.au/courses/039986
detail_page_cookie = 'BIGipServer~Misc~webpub.rmit.....'
# Where to store the result
full_file = '/tmp/full.json'

start_page = 'https://online.rmit.edu.vn/enrolmentcopy'

full_data = {}


def get(url, cookie):
    # Fetch a page with an explicit Cookie header (these pages need a logged-in session)
    r = requests.get(url, headers={'Cookie': cookie})
    return r.content
# Get course codes and URLs to the basic info pages
start_html = get(start_page, start_page_cookie)
soup = bs(start_html, 'html.parser')
tables = soup.find_all('table')[1:]
for table in tables:
    for tag in table.find_all('a', href=True):
        full_data[tag.text] = {'basicUrl': tag['href']}

# Matches offerings like "Viet3 2014" -> groups: (full string, semester, year)
viet_regex = re.compile(r'(Viet(\d) (\d{4}))')
for course_code, data in full_data.items():
    basicUrl = data['basicUrl']
    print(basicUrl)

    # Request the basic info page and list every Vietnam offering on it
    html = get(basicUrl, detail_page_cookie)
    courses = bs(html, 'html.parser').find('table')
    viet_courses = viet_regex.findall(str(courses))

    # Pick the latest Viet offering, comparing (year, semester)
    latest = (viet_courses[0][2], viet_courses[0][1])
    for match in viet_courses[1:]:
        to_compare = (match[2], match[1])
        if to_compare > latest:
            latest = to_compare
    latest_course_str = 'Viet%s %s' % (latest[1], latest[0])
    print(latest_course_str)

    # The link whose text is the latest offering points to its detail page;
    # rebuild the URL-encoded href into a plain rmit.edu.au URL
    detailUrl = courses.find('a', href=True, text=latest_course_str)['href']
    pieces = ['http://www.rmit.edu.au'] + detailUrl.split('%2F')[1:]
    detailUrl = '/'.join(pieces)
    full_data[course_code]['detailUrl'] = detailUrl

    # Request the detailed info page and keep its main content as the description
    html = get(detailUrl, detail_page_cookie)
    print(detailUrl)
    soup = bs(html, 'html.parser')
    full_data[course_code]['description'] = str(soup.find('div', role='main'))

with open(full_file, 'w') as f:
    f.write(json.dumps(full_data))
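Once the run finishes, the output is a plain JSON map keyed by course code. A minimal sketch for inspecting it afterwards, assuming the default /tmp/full.json path used above:

import json

# Load the file written by the scraper above.
with open('/tmp/full.json') as f:
    full_data = json.load(f)

# Each entry holds the basic-info URL, the detail URL, and the scraped
# description HTML for one course.
for course_code, info in sorted(full_data.items())[:5]:
    print(course_code, info.get('detailUrl'))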