Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
from bs4 import BeautifulSoup
import requests
import json
# Caching: load the on-disk cache (cache key -> page HTML) once at import time.
CACHE_FNAME = 'cache.json'
try:
    # `with` closes the handle even when reading or parsing fails.
    with open(CACHE_FNAME, 'r') as cache_file:
        CACHE_DICTION = json.load(cache_file)
except (OSError, ValueError):
    # OSError: file missing or unreadable. ValueError: corrupt JSON
    # (json.JSONDecodeError) or undecodable bytes (UnicodeDecodeError).
    # Either way, start over with an empty cache.
    CACHE_DICTION = {}
else:
    # Valid JSON that is not an object cannot serve as the key/value cache.
    if not isinstance(CACHE_DICTION, dict):
        CACHE_DICTION = {}
def get_unique_key(url):
    """Return the cache key for *url*; the URL itself is used as the key."""
    cache_key = url
    return cache_key
def make_request_using_cache(url):
    """Return the page text for *url*, hitting the network only on a cache miss.

    On a hit, the text stored in CACHE_DICTION is returned. On a miss, the
    page is requested with the module-level ``header`` dict as HTTP headers
    (requests' defaults are used when no ``header`` global is defined), the
    response text is stored in CACHE_DICTION, and the whole cache is
    rewritten to CACHE_FNAME as JSON.
    """
    unique_ident = get_unique_key(url)
    if unique_ident in CACHE_DICTION:
        print("Getting cached data...")
        return CACHE_DICTION[unique_ident]

    print("Making a request for new data...")
    # Look the global up lazily so that calling this function before
    # `header` is assigned does not raise NameError; None means "no extra
    # headers" to requests.
    request_headers = globals().get('header')
    resp = requests.get(url, headers=request_headers)
    # NOTE(review): the HTTP status is not checked, so an error page would be
    # cached like any other response -- confirm whether that is intended.
    CACHE_DICTION[unique_ident] = resp.text  # Only store the html

    # The cached values are HTML strings; JSON is just the container format.
    with open(CACHE_FNAME, "w") as fw:
        fw.write(json.dumps(CACHE_DICTION))
    return CACHE_DICTION[unique_ident]
# Course class
class CourseListing:
    """One course from the catalog: number, name and description.

    The description starts out empty and is filled in by
    init_from_details_url().
    """

    def __init__(self, course_num, course_name):
        """Store the course number and name; the description starts empty."""
        self.num = course_num
        self.name = course_name
        # Defined up front so __str__ works before the details page is read.
        self.description = ''

    def __str__(self):
        """Return '<num> <name>' followed by the description on a tabbed line."""
        return self.num + ' ' + self.name + '\n\t' + self.description

    def init_from_details_url(self, details_url):
        """Fetch *details_url* (through the cache) and set self.description.

        The description is the text of the first element with class
        'course2desc'; it is set to '' when the page has no such element.
        """
        page_text = make_request_using_cache(details_url)
        page_soup = BeautifulSoup(page_text, 'html.parser')
        desc_node = page_soup.find(class_='course2desc')
        # find() returns None when nothing matches; avoid None.text.
        self.description = desc_node.text if desc_node is not None else ''
# Crawl the course catalog: read the listing table, follow each course's
# details link, then print every course that was collected.
baseurl = 'https://www.si.umich.edu'
catalog_url = baseurl + '/programs/courses/catalog'
header = {'User-Agent': 'SI_CLASS'}
MAX_ROWS = 20  # only the first 20 table rows are examined

page_text = make_request_using_cache(catalog_url)
page_soup = BeautifulSoup(page_text, 'html.parser')
content_div = page_soup.find(class_='view-content')
# find() returns None when the container is missing; treat that as no rows.
table_rows = content_div.find_all('tr') if content_div is not None else []

course_listings = []
# Slicing (instead of indexing 0..19) copes with tables shorter than MAX_ROWS.
for table_row in table_rows[:MAX_ROWS]:
    table_cells = table_row.find_all('td')
    if len(table_cells) != 2:
        continue  # not a (number, name) row
    course_number = table_cells[0].text.strip()
    course_name = table_cells[1].text.strip()

    # crawl over to the details page
    details_link = table_cells[0].find('a')
    if details_link is None or not details_link.get('href'):
        continue  # no details link to follow for this row
    details_url = baseurl + details_link['href']

    course_listing = CourseListing(course_number, course_name)
    course_listing.init_from_details_url(details_url)
    course_listings.append(course_listing)

for cl in course_listings:
    print(cl)
    print("-" * 20)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.