# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @danthedaniel
# Created January 10, 2018 02:19
# Show Gist options
#   • Save danthedaniel/2e2b7a0b3a470323ce786ea9816beb89 to your computer and use it in GitHub Desktop.
# Drexel Term Master Schedule scraper
import bs4 as bs
import sqlite3
import requests
import datetime
def db_setup(conn):
    """Create the ``classes`` table and its lookup indexes if they are missing.

    Every statement uses ``IF NOT EXISTS``, so calling this on a database
    that has already been set up is a no-op.

    Args:
        conn: An open ``sqlite3`` connection. The schema changes are
            committed on it before returning.
    """
    cursor = conn.cursor()
    cursor.execute(
        """CREATE TABLE IF NOT EXISTS classes (
            subject_code TEXT,
            course_num TEXT,
            instr_type TEXT,
            instr_method TEXT,
            section TEXT,
            crn INTEGER,
            full BOOLEAN,
            title TEXT,
            days TEXT,
            start_time INTEGER,
            end_time INTEGER,
            instructor TEXT
        )"""
    )
    # Each index is named after the single column it covers.
    for column in ("crn", "start_time", "end_time"):
        cursor.execute(
            "CREATE INDEX IF NOT EXISTS {0} ON classes ({0})".format(column)
        )
    conn.commit()
def get_college(href, conn):
    """Fetch one page from the sidebar and scrape every link in its panel.

    The page at ``root + href`` is downloaded and each anchor inside the
    element with class ``collegePanel`` is handed to ``get_field``.

    Args:
        href: URL path relative to the module-level ``root``.
        conn: Open ``sqlite3`` connection, passed through to ``get_field``.
    """
    content = requests.get(root + href).content
    panel = bs.BeautifulSoup(content, "lxml").find(class_="collegePanel")
    if panel is None:
        # find() returns None when the page has no such element; report it
        # and move on rather than failing with AttributeError on None.
        print("Skipping {}: no collegePanel found".format(href))
        return
    # href=True keeps only anchors that carry an href attribute, so
    # get_field never has to build a URL from None.
    for link in panel.find_all("a", href=True):
        get_field(link["href"], conn)
def get_field(href, conn):
    """Fetch one listing page and store each row that follows its header.

    The page at ``root + href`` is downloaded, the element with class
    ``tableHeader`` is located, and every tag that follows it as a sibling
    is passed to ``get_section``.

    Args:
        href: URL path relative to the module-level ``root``.
        conn: Open ``sqlite3`` connection, passed through to ``get_section``.
    """
    content = requests.get(root + href).content
    header = bs.BeautifulSoup(content, "lxml").find(class_="tableHeader")
    if header is None:
        # find() returns None when the page has no such element; report it
        # and move on rather than failing with AttributeError on None.
        print("Skipping {}: no tableHeader found".format(href))
        return
    for sibling in header.next_siblings:
        # Siblings also include stray strings between tags (whitespace
        # text nodes); only real tags can be table rows.
        if isinstance(sibling, bs.element.Tag):
            get_section(sibling, conn)
def get_section(section, conn):
    """Insert one table row into the ``classes`` table.

    Rows that do not have exactly 11 ``<td>`` cells are ignored. For the
    rest, the cell text is mapped onto the table columns as follows:
    0 subject_code, 1 course_num, 2 instr_type, 3 instr_method, 4 section,
    5 crn, 6 title, 8 days, 9 meeting times (split into start_time and
    end_time by ``get_time_range``), 10 instructor. Cell 7 is not stored.

    Args:
        section: A BeautifulSoup tag for the row.
        conn: Open ``sqlite3`` connection; the insert is committed on it.

    Raises:
        ValueError: If the text of cell 5 is not an integer.
    """
    cells = section.find_all("td")
    if len(cells) != 11:
        return

    text = [cell.get_text() for cell in cells]

    # The "full" flag is the title attribute of the <p> inside the CRN
    # cell. A missing <p> or title is treated as "not full" instead of
    # raising AttributeError/KeyError.
    status = cells[5].find("p")
    is_full = status is not None and status.get("title") == "FULL"

    time_start, time_end = get_time_range(text[9])

    # Columns are named explicitly so the insert does not depend on the
    # declaration order of the table.
    conn.execute(
        "INSERT INTO classes ("
        "subject_code, course_num, instr_type, instr_method, section, "
        "crn, full, title, days, start_time, end_time, instructor"
        ") VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
        [
            text[0],
            text[1],
            text[2],
            text[3],
            text[4],
            int(text[5]),
            is_full,
            text[6],
            text[8],
            time_start,
            time_end,
            text[10],
        ],
    )
    conn.commit()

    # Progress output: subject code + course number, then the section.
    print("{}{} - {}".format(text[0], text[1], text[4]))
def get_time_range(times):
    """Parse a ``"HH:MM am - HH:MM pm"`` range into minutes since midnight.

    Args:
        times: Text holding a start and an end time separated by ``" - "``,
            each in 12-hour ``%I:%M %p`` format.

    Returns:
        A ``(start, end)`` tuple of minutes since midnight, or
        ``(None, None)`` when the text is not a parseable time range.
    """
    parts = times.split(" - ")
    if len(parts) < 2:
        # No separator at all. Without this check a lone valid time such as
        # "09:00 am" would raise an uncaught IndexError.
        return (None, None)
    try:
        # strptime rejects surrounding whitespace, so strip it first.
        time_start = datetime.datetime.strptime(parts[0].strip(), "%I:%M %p").time()
        time_end = datetime.datetime.strptime(parts[1].strip(), "%I:%M %p").time()
    except ValueError:
        return (None, None)
    return (time_as_minutes(time_start), time_as_minutes(time_end))
def time_as_minutes(time):
    """Return the number of whole minutes between midnight and *time*.

    Args:
        time: Any object with ``hour`` and ``minute`` attributes, such as
            a ``datetime.time``.

    Returns:
        ``hour * 60 + minute`` as an integer.
    """
    return time.hour * 60 + time.minute
# Base URL of the site; the scraping functions prepend it to every
# relative href they follow.
root = "https://duapp2.drexel.edu"
# Path of the page the crawl starts from (a quarterTermDetails page,
# per its "component" query parameter).
start_url = "/webtms_du/app?component=quarterTermDetails&page=Home&service=direct&sp=ZH4sIAAAAAAAAAFvzloG1uIhBPjWlVC%2BlKLUiNUcvs6hErzw1qSS3WC8lsSRRLyS1KJcBAhiZGJh9GNgTk0tCMnNTSxhEfLISyxL1iwtz9EECxSWJuQXWPgwcJUAtzvkpQBVCEBU5iXnp%2BsElRZl56TB5l9Ti5EKGOgamioKCEgY2IwNDcyNToJHhmXlAaYXA0sQiEG1ormtoAQB4K9nSpgAAAA%3D%3D"
start_content = requests.get(root + start_url).content
sidebar = bs.BeautifulSoup(start_content, "lxml").find(id="sideLeft")
if sidebar is None:
    # find() returns None when the element is absent; fail with a clear
    # message instead of an AttributeError on None further down.
    raise RuntimeError("Could not find the 'sideLeft' element on the start page")
# Connect to db
conn = sqlite3.connect('tms.sqlite')
db_setup(conn)
# href=True skips anchors without an href, which would otherwise make
# get_college build a URL from None.
for link in sidebar.find_all("a", href=True):
    get_college(link["href"], conn)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment