Created April 2, 2014 07:10
Scrape RMIT courses
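The script below requests RMIT's enrolment listing, collects a course code and basic-info URL for every course it finds, picks the most recent Vietnam offering ("VietN YYYY") on each basic-info page, then fetches that offering's detail page and stores the URLs plus the description HTML in a JSON file. The two cookie values are redacted placeholders; copy the real "Cookie" headers from an authenticated browser session before running.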
from bs4 import BeautifulSoup as bs
import re
import json
import requests

# The "Cookie" header to send when requesting start_page
start_page_cookie = 'rmit_logout=1....'
# The "Cookie" header to send when requesting something like rmit.edu.au/courses/039986
detail_page_cookie = 'BIGipServer~Misc~webpub.rmit.....'
# Where to store the result
full_file = '/tmp/full.json'

start_page = 'https://online.rmit.edu.vn/enrolmentcopy'

full_data = {}


def get(url, cookie):
    # Fetch a page with an explicit Cookie header (these pages need a logged-in session)
    r = requests.get(url, headers={'Cookie': cookie})
    return r.content
# Get course codes and URLs to the basic info pages
start_html = get(start_page, start_page_cookie)
soup = bs(start_html, 'html.parser')
tables = soup.find_all('table')[1:]
for table in tables:
    for tag in table.find_all('a', href=True):
        full_data[tag.text] = {'basicUrl': tag['href']}

# Matches offerings like "Viet3 2014" -> groups: (full string, semester, year)
viet_regex = re.compile(r'(Viet(\d) (\d{4}))')
for course_code, data in full_data.items():
    basicUrl = data['basicUrl']
    print(basicUrl)

    # Request the basic info page and list every Vietnam offering on it
    html = get(basicUrl, detail_page_cookie)
    courses = bs(html, 'html.parser').find('table')
    viet_courses = viet_regex.findall(str(courses))

    # Pick the latest Viet offering, comparing (year, semester)
    latest = (viet_courses[0][2], viet_courses[0][1])
    for match in viet_courses[1:]:
        to_compare = (match[2], match[1])
        if to_compare > latest:
            latest = to_compare
    latest_course_str = 'Viet%s %s' % (latest[1], latest[0])
    print(latest_course_str)

    # The link whose text is the latest offering points to its detail page;
    # rebuild the URL-encoded href into a plain rmit.edu.au URL
    detailUrl = courses.find('a', href=True, text=latest_course_str)['href']
    pieces = ['http://www.rmit.edu.au'] + detailUrl.split('%2F')[1:]
    detailUrl = '/'.join(pieces)
    full_data[course_code]['detailUrl'] = detailUrl

    # Request the detailed info page and keep its main content as the description
    html = get(detailUrl, detail_page_cookie)
    print(detailUrl)
    soup = bs(html, 'html.parser')
    full_data[course_code]['description'] = str(soup.find('div', role='main'))

with open(full_file, 'w') as f:
    f.write(json.dumps(full_data))
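Once the run finishes, the output is a plain JSON map keyed by course code. A minimal sketch for inspecting it afterwards, assuming the default /tmp/full.json path used above:

import json

# Load the file written by the scraper above.
with open('/tmp/full.json') as f:
    full_data = json.load(f)

# Each entry holds the basic-info URL, the detail URL, and the scraped
# description HTML for one course.
for course_code, info in sorted(full_data.items())[:5]:
    print(course_code, info.get('detailUrl'))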