Created
January 10, 2018 02:19
-
-
Save danthedaniel/2e2b7a0b3a470323ce786ea9816beb89 to your computer and use it in GitHub Desktop.
Drexel Term Master Schedule scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 as bs | |
import sqlite3 | |
import requests | |
import datetime | |
def db_setup(conn):
    """Create the ``classes`` table and its indexes if they do not exist yet.

    Every statement uses ``IF NOT EXISTS``, so calling this on a database
    that has already been set up leaves the existing schema in place.

    Args:
        conn: Database connection providing ``cursor()`` and ``commit()``
            (the script passes a ``sqlite3`` connection).
    """
    c = conn.cursor()
    c.execute("""CREATE TABLE IF NOT EXISTS classes (
        subject_code TEXT,
        course_num TEXT,
        instr_type TEXT,
        instr_method TEXT,
        section TEXT,
        crn INTEGER,
        full BOOLEAN,
        title TEXT,
        days TEXT,
        start_time INTEGER,
        end_time INTEGER,
        instructor TEXT
    )""")
    # Single-column indexes, each named after the column it covers.
    c.execute("CREATE INDEX IF NOT EXISTS crn ON classes (crn)")
    c.execute("CREATE INDEX IF NOT EXISTS start_time ON classes (start_time)")
    c.execute("CREATE INDEX IF NOT EXISTS end_time ON classes (end_time)")
    conn.commit()
def get_college(href, conn):
    """Scrape every page linked from one college page.

    Downloads ``root + href``, looks up the element with class
    ``collegePanel`` and hands each link found inside it to ``get_field``.

    Args:
        href: URL of the college page, relative to ``root``.
        conn: Database connection, forwarded unchanged to ``get_field``.
    """
    # Without a timeout a stalled server would block the scrape forever.
    content = requests.get(root + href, timeout=30).content
    bullets = bs.BeautifulSoup(content, "lxml").find(class_="collegePanel")
    if bullets is None:
        # find() yields None when nothing matches; there is nothing to follow.
        return
    for link in bullets.find_all("a"):
        target = link.get("href")
        # Anchors without an href cannot be joined onto ``root``.
        if target:
            get_field(target, conn)
def get_field(href, conn):
    """Scrape all rows listed on one page of class sections.

    Downloads ``root + href``, finds the element with class ``tableHeader``
    and passes every tag that follows it as a sibling to ``get_section``.

    Args:
        href: URL of the page, relative to ``root``.
        conn: Database connection, forwarded unchanged to ``get_section``.
    """
    # Without a timeout a stalled server would block the scrape forever.
    content = requests.get(root + href, timeout=30).content
    header = bs.BeautifulSoup(content, "lxml").find(class_="tableHeader")
    if header is None:
        # find() yields None when nothing matches; no rows to read.
        return
    # Filter out stray strings
    sections = [x for x in header.next_siblings if isinstance(x, bs.element.Tag)]
    for section in sections:
        get_section(section, conn)
def get_section(section, conn):
    """Insert one table row into ``classes`` and print a progress line.

    Rows that do not contain exactly 11 ``<td>`` cells are ignored. The row
    is committed immediately after the insert.

    Args:
        section: Parsed tag for one table row; must provide ``find_all``.
        conn: Database connection used for the insert and commit.

    Raises:
        ValueError: If the text of the sixth cell is not an integer.
    """
    cells = section.find_all("td")
    if len(cells) != 11:
        return

    # Read each cell's text once instead of calling get_text() repeatedly.
    text = [cell.get_text() for cell in cells]

    # A missing <p> or a <p> without a title counts as "not full" instead of
    # raising AttributeError/KeyError and aborting the whole scrape.
    marker = cells[5].find("p")
    is_full = marker is not None and marker.get("title") == "FULL"
    time_start, time_end = get_time_range(text[9])

    c = conn.cursor()
    c.execute(
        "INSERT INTO classes VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
        [
            text[0],       # subject_code
            text[1],       # course_num
            text[2],       # instr_type
            text[3],       # instr_method
            text[4],       # section
            int(text[5]),  # crn
            is_full,       # full
            text[6],       # title
            text[8],       # days (cell 7 is not stored)
            time_start,    # start_time
            time_end,      # end_time
            text[10],      # instructor
        ]
    )
    conn.commit()

    print("{}{} - {}".format(text[0], text[1], text[4]))
def get_time_range(times):
    """Parse a ``"HH:MM am - HH:MM pm"`` range into two minute counts.

    Args:
        times: Text holding two 12-hour clock times separated by ``" - "``.
            Whitespace around the text and around each time is ignored.

    Returns:
        A ``(start, end)`` tuple with each time converted by
        ``time_as_minutes``, or ``(None, None)`` when the text does not
        contain two parseable times.
    """
    time_range = times.strip().split(" - ")
    if len(time_range) < 2:
        # No separator: previously a lone valid time raised IndexError here.
        return (None, None)
    try:
        time_start = datetime.datetime.strptime(
            time_range[0].strip(), "%I:%M %p"
        ).time()
        time_end = datetime.datetime.strptime(
            time_range[1].strip(), "%I:%M %p"
        ).time()
    except ValueError:
        # Placeholder text that is not a clock time.
        return (None, None)
    return (time_as_minutes(time_start), time_as_minutes(time_end))
def time_as_minutes(time):
    """Return *time* as a number of minutes counted from 00:00.

    Args:
        time: Object with ``hour`` and ``minute`` attributes, such as a
            ``datetime.time``. Any finer-grained fields are not used.

    Returns:
        ``time.hour * 60 + time.minute``.
    """
    return time.hour * 60 + time.minute
# Base URL that every scraped, site-relative href is appended to.
root = "https://duapp2.drexel.edu"
# Entry page whose element with id "sideLeft" holds the links that are crawled.
start_url = "/webtms_du/app?component=quarterTermDetails&page=Home&service=direct&sp=ZH4sIAAAAAAAAAFvzloG1uIhBPjWlVC%2BlKLUiNUcvs6hErzw1qSS3WC8lsSRRLyS1KJcBAhiZGJh9GNgTk0tCMnNTSxhEfLISyxL1iwtz9EECxSWJuQXWPgwcJUAtzvkpQBVCEBU5iXnp%2BsElRZl56TB5l9Ti5EKGOgamioKCEgY2IwNDcyNToJHhmXlAaYXA0sQiEG1ormtoAQB4K9nSpgAAAA%3D%3D"

# Without a timeout a stalled server would block the script forever.
start_content = requests.get(root + start_url, timeout=30).content
sidebar = bs.BeautifulSoup(start_content, "lxml").find(id="sideLeft")
if sidebar is None:
    # find() yields None when nothing matches; fail with a readable message
    # instead of an AttributeError on None further down.
    raise RuntimeError("start page has no element with id 'sideLeft'")

# Connect to db
conn = sqlite3.connect('tms.sqlite')
try:
    db_setup(conn)

    for link in sidebar.find_all("a"):
        college_href = link.get("href")
        # Anchors without an href cannot be joined onto ``root``.
        if college_href:
            get_college(college_href, conn)
finally:
    # Release the database file even when a request or parse step fails.
    conn.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment