Last active
March 8, 2024 18:07
-
-
Save laramiel/38648acf2a0f8a0383280e6f645d154a to your computer and use it in GitHub Desktop.
A python script to fetch available class slots for WWU schedule generation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# | |
''' | |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
WWU Schedule Generator | |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
This requires python; it's probably easiest to run on linux or WSL. | |
First install the prerequisites: | |
$ pip3 install BeautifulSoup4 pandas html5lib ortools lxml | |
Then execute the script using python3: | |
$ python3 wwu_schedule.py 'CSCI 145' 'ENG 101' 'PHYS 161' | |
The script will attempt to generate all possible schedules for each of | |
the listed classes. | |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
Options include: | |
--term indicates the term. This should be year followed by the semester code: | |
Winter 10 | |
Spring 20 | |
Summer 30 | |
Fall 40 | |
--year <academic year> Specify the academic year. | |
This should be the last two digits of the two consecutive years in which | |
the term falls, such as '2324'. | |
--limit <number> Number of schedules printed.
By default prints all possible schedules. | |
--include (-i) <section> | |
Includes the specific class section (course number). | |
--exclude (-e) <section> | |
Excludes the specific class section (course number). | |
--preferred-time <time spec> | |
Sets a preferred time range used to score schedules. | |
Classes starting before or ending after this range are penalized.
Example: "9:00-3:30 pm" | |
-a, -b, -c, -d <course> | |
Out of courses specified by -a, only one section will be scheduled. | |
Likewise for -b/-c/-d. | |
The simplest schedule generation might be for two classes, like:
$ python wwu_schedule.py ENG_101 MATH_125 --limit 0 | |
If more class selection is useful, increase the limit: | |
$ python wwu_schedule.py ENG_101 MATH_125 --limit 5 | |
An even more complex invocation, which prefers 10:00 am classes | |
and needs to exclude some sections, might look like this: | |
$ python wwu_schedule.py PHYS_162 MATH_125 ENG_201 \ | |
--preferred-time "10:00-2:15 pm" \ | |
-i 10655 -e 10232 --limit 3 | |
To generate a class list with the following combination of classes: | |
PHYS_163 | |
HIST_112 | |
Any of CSCI 241, 247, or 301 | |
Any of COMM 224 or 235 | |
$ python wwu_schedule.py PHYS_163 HIST_112 \
-a CSCI_241 CSCI_301 CSCI_247 \ | |
-b COMM_235 -b COMM_224 | |
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
''' | |
import requests | |
import re | |
import pandas as pd | |
import lxml | |
import html5lib | |
import datetime as dt | |
import argparse | |
import sys | |
import io | |
from ortools.sat.python import cp_model | |
from bs4 import BeautifulSoup | |
from requests.adapters import HTTPAdapter | |
from requests.packages.urllib3.util.retry import Retry | |
# Endpoint of the WWU timetable page; course_list() POSTs its query here.
URI = 'https://web4u.banner.wwu.edu/pls/wwis/wwskcfnd.TimeTable'
# WWU courses are identified by a short department prefix (2 to 4 characters,
# which may include '/' or a digit, e.g. 'CD', 'A/HI', 'C2C') followed by
# whitespace and a 3 digit catalog number.
#
# Group 1 captures the department prefix and group 2 the catalog number. The
# number must be followed by whitespace or the end of the string. No flags are
# set, so matching is case-sensitive (upper-case prefixes only).
COURSE_RE = re.compile(
    r'('
    r'ACCT|AHE|AMST|ASLC|ANTH|ARAB|ART|A/HI|AECI|AUAP|ASTR|BNS|BIOL|BUS|C/AM|CHEM|CHIN|'
    r'CLST|CD|CSEC|CSD|COMM|C2C|CSCI|CISS|DISA|DNC|DATA|DSCI|DSGN|DIAD|ECE|EAST|ECON|EDUC|'
    r'EDAD|ESJ|EECE|ELED|ENRG|ENGR|ENG|ENTR|ESCI|ENVS|EUS|EXCE|FAIR|FIN|FREN|GEOL|GERM|GRAD|'
    r'GREK|HLED|HIST|HGST|HNRS|HRM|HSP|HUMA|ID|IT|IEP|IBUS|INTL|ITAL|JAPN|JOUR|KIN|LAT|LDST|'
    r'LIBR|LING|MGMT|MIS|MFGE|MACS|MKTG|MBA|MPAC|MSCI|MATH|M/CS|LANG|MDS|MLE|MUS|NURS|OPS|'
    r'PHIL|PA|PE|PEH|PHYS|PLSC|PME|PORT|PSY|RECR|RC|REL|RUSS|SALI|SCED|SEC|SMNR|SOC|SPAN|SPED|'
    r'SUST|TEOP|TESL|THTR|UEPP|WGSS'
    r')\s+(\d\d\d)(\s|$)')
# Column headers of one timetable row, in column order. The list position is
# the integer column label used when indexing the parsed DataFrames elsewhere
# in this file (the trailing numbers below call out the columns read by index).
HDRS = [
    'Term',
    'Crn',  # 1 (section)
    'Days',  # 2
    'Time',  # 3
    'Instructor',
    'Room',
    'Addl Fees',
    'Cap',  # 7
    'Enrl',  # 8
    'Avail',  # 9
    'Waitlist',
    'Restrictions',
    'Attributes'
]
def _get_session():
    """Build a requests session that retries failed connections and reads.

    Returns:
        A ``requests.Session`` with one shared adapter mounted for both the
        ``http://`` and ``https://`` prefixes. The adapter allows up to 10
        connect retries and 10 read retries with a backoff factor of 0.2.
    """
    session = requests.Session()
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(connect=10, read=10, backoff_factor=0.2))
    for prefix in ('http://', 'https://'):
        session.mount(prefix, retrying_adapter)
    return session
def _to_str(x):
    """Return *x* as a string, rendering NaN as the empty string.

    Strings pass through untouched. Any other value is converted with
    ``str()`` and every occurrence of the text 'nan' is removed from the
    result, so a float NaN becomes ''.
    """
    return x if isinstance(x, str) else str(x).replace('nan', '')
# TODO: Use a time interval parser. | |
def class_time_to_minute_intervals(input):
    """Convert a time range such as '11:30-12:50 pm' to minutes after midnight.

    The range carries a single am/pm suffix which applies to the END time.
    The start time is inferred: it is treated as pm only when doing so still
    places it before the end time (so '9:00-3:30 pm' starts at 9 am, while
    '04:00-05:50 pm' starts at 4 pm).

    Args:
        input: A string of the form 'H:MM-H:MM am' or 'H:MM-H:MM pm'.
            Surrounding whitespace and letter case are ignored.

    Returns:
        A ``(start, end)`` tuple of integers, in minutes after midnight.

    Raises:
        ValueError: If the string does not have the expected shape.
    """
    text = input.strip().lower()
    start, end = text.split('-')
    start_h, start_m = (int(part) for part in start.strip().split(':'))
    # The end time is followed by the am/pm marker; keep only the 'H:MM' part.
    end_h, end_m = (int(part) for part in end.strip().split(' ')[0].split(':'))
    if text.endswith('pm'):
        # The suffix belongs to the end time, so any hour other than 12 is
        # shifted to the afternoon/evening. (Previously hours 8-11 were left
        # alone, which made e.g. '06:00-08:50 pm' end before it started.)
        if end_h < 12:
            end_h += 12
        # The start is pm only if that keeps it strictly before the end.
        if start_h < 12 and (start_h + 12) * 60 + start_m < end_h * 60 + end_m:
            start_h += 12
    return (start_h * 60 + start_m, end_h * 60 + end_m)
# start_h | |
class SchedulePrinter(cp_model.CpSolverSolutionCallback):
    """Collect the schedules reported by the solver and print them by score.

    Each solution is rendered to text as it arrives and stored with its
    objective value; print_all() then prints them in ascending score order.
    """

    def __init__(self, courses, course_vars, objective, limit):
        """Store the model pieces needed to render a solution.

        Args:
            courses: dict[course][section][dayspec] = timespec.
            course_vars: dict[(course, section)] = boolean model variable that
                is true when that section is selected.
            objective: The score expression evaluated for every solution.
            limit: Maximum number of schedules printed by print_all(); a falsy
                value (None or 0) prints all of them.
        """
        cp_model.CpSolverSolutionCallback.__init__(self)
        self.__courses = courses
        self.__course_vars = course_vars
        self.__solution_count = 0
        self.__objective = objective
        self.__solutions = []
        self.__limit = limit

    def solution_count(self):
        """Return the number of solutions received so far."""
        return self.__solution_count

    def on_solution_callback(self):
        """Render the current solution and remember it with its score."""
        self.__solution_count += 1
        score = self.Value(self.__objective)
        parts = []
        for (course, section), var in self.__course_vars.items():
            if not self.Value(var):
                continue
            parts.append(f'\n{course} section {section}\n')
            for days, times in self.__courses[course][section].items():
                # Right-align the day letters in a 6 character column. The
                # previous pad-then-slice approach cut characters off short
                # day specs (and off specs longer than 6 characters).
                parts.append(f'{days:>6} {times}\n')
        self.__solutions.append((score, ''.join(parts)))

    def print_all(self):
        """Print the collected schedules, lowest score first, up to the limit."""
        print('~' * 40)
        print(f'Possible schedules ({self.__solution_count})')
        ranked = sorted(self.__solutions, key=lambda entry: entry[0])
        for shown, (score, txt) in enumerate(ranked, start=1):
            print('~' * 40)
            print(f'Score: {score}\n{txt}')
            if self.__limit and shown >= self.__limit:
                break
def generate_schedule(courses, course_groups, preferred_time, limit):
    """Build a CP-SAT model of the requested courses and print the schedules.

    Uses the Google ortools constraint solver. See:
      https://developers.google.com/optimization/reference/python/index_python
      http://www.hakank.org/google_or_tools/

    Args:
        courses: dict[course][section][dayspec] = timespec, like:
            {
              'CSCI 145': {
                '44138': {'MWF': '11:00-11:50 am', 'T': '12:00-01:50 pm'},
                '41690': {'MWF': '10:00-10:50 am', 'W': '12:00-01:50 pm'}},
              'PHYS 161': {
                '41569': {'MWRF': '11:30-12:50 pm', 'T': '04:00-05:50 pm'}},
            }
        course_groups: dict[group key] = collection of course names; exactly
            one section is chosen among all courses of a group that appear in
            ``courses``.
        preferred_time: Optional time range string (e.g. '9:00-3:30 pm') used
            for scoring; falsy selects the default 9:30 am - 3:15 pm window.
        limit: Passed to the printer to cap the printed schedules. A value of
            exactly 0 additionally asks the solver to minimize the score.

    Returns:
        The number of solutions the solver reported.
    """
    # Time-preference window in minutes after midnight. Sessions starting
    # before or ending after it make a schedule score worse (higher).
    p_time = (570, 915)  # 9:30 - 3:15
    if preferred_time:
        p_time = class_time_to_minute_intervals(preferred_time)
    preferred_start, preferred_end = p_time

    model = cp_model.CpModel()
    course_vars = {}
    course_group_vars = {}
    interval_vars = {day: [] for day in 'MTWRF'}
    objective = 0

    for course, sections in courses.items():
        c = course.lower().replace(' ', '_')
        section_vars = []
        for section, times in sections.items():
            # A boolean variable for each (class, section), indicating the
            # section that is chosen.
            var = model.NewBoolVar(f'selected_{c}_{section}')
            course_vars[(course, section)] = var
            section_vars.append(var)
            # An optional interval variable for each (class, section, day)
            # covering the class time and controlled by the section boolean.
            for dayspec, timespec in times.items():
                start, end = class_time_to_minute_intervals(timespec)
                for d in dayspec:
                    if d.isspace():
                        # Separators inside a day spec are not days.
                        continue
                    intvar = model.NewOptionalIntervalVar(
                        start, end - start, end, var,
                        f'time_{c}_{section}_{d}')
                    # setdefault: a day letter outside MTWRF (e.g. a weekend
                    # day) gets its own bucket instead of raising KeyError.
                    interval_vars.setdefault(d, []).append(intvar)
                    # Scoring: each class session counts as 1, plus 1 per
                    # minute outside the preferred window (at least 2).
                    objective += var
                    if start < preferred_start:
                        objective += max(2, preferred_start - start) * var
                    if end > preferred_end:
                        objective += max(2, end - preferred_end) * var
        # Pool this course's sections into every group that lists the course.
        for opt_k, opt_s in course_groups.items():
            if course in opt_s:
                course_group_vars.setdefault(opt_k, []).extend(section_vars)

    # Exactly one section is selected from each group (sum of booleans = 1).
    for group_vars in course_group_vars.values():
        model.Add(sum(group_vars) == 1)

    # A NoOverlap constraint per day over all the optional intervals.
    for day_intervals in interval_vars.values():
        if day_intervals:
            model.AddNoOverlap(day_intervals)

    if limit == 0:
        model.Minimize(objective)

    # Now solve...
    printer = SchedulePrinter(courses, course_vars, objective, limit)
    solver = cp_model.CpSolver()
    solver.parameters.enumerate_all_solutions = True
    solver.Solve(model, printer)
    printer.print_all()
    return printer.solution_count()
def course_list(curr_yr, term, subject):
    """Fetch the timetable page for one subject and group its rows by course.

    The page is requested with a POST to URI using the form fields 'term'
    (format <YEAR><TERM>), 'curr_yr' (e.g. '2324') and 'subj'.

    Like most HTML parsing, this is a hack, as the webpage uses tables for
    formatting. Each top-level <table> is parsed separately with pandas and
    classified by its column count: a table with len(HDRS) or len(HDRS) - 1
    columns is taken to be a schedule row, anything else is checked for a
    course title (matched with COURSE_RE) that starts a new course.

    Args:
        curr_yr: Academic year form value.
        term: Term form value.
        subject: Subject (department prefix) form value; only course titles
            starting with it are accepted.

    Returns:
        dict mapping 'PREFIX NNN' to a DataFrame of that course's rows, with
        integer column labels and every cell converted to a string.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = _get_session().post(URI, data={
        'term': term,
        'curr_yr': curr_yr,
        'subj': subject,
    })
    r.raise_for_status()

    result = {}
    soup = BeautifulSoup(r.text, features='lxml')
    key = ''  # Course the following data rows belong to; '' means none.
    for table in soup.find_all('table'):
        if table.find_parent('table') is not None:
            continue  # Nested tables are handled through their parent.
        try:
            frames = pd.read_html(io.StringIO(str(table)), thousands=None)
        except Exception:
            # Best effort: tables pandas cannot parse are skipped. (Not a
            # bare except, so Ctrl-C and SystemExit still propagate.)
            continue
        if len(frames) != 1:
            continue
        df = frames[0]
        if df.empty:
            continue

        if len(HDRS) - 1 <= len(df.columns) <= len(HDRS):
            # This 'table' is likely a row in the course schedule (the site
            # emits each row as a separate table element). Lab/additional
            # sections may have fewer html columns.
            if df[0][0] == HDRS[0]:
                continue  # header row
            is_data = True
        else:
            # Likely a formatting construct. It may be a class header row
            # with the class name and description; detect that by regex.
            is_data = False
            title = str(df[0][0]).strip()
            m = COURSE_RE.match(title)
            if m and title.startswith(subject):
                key = f'{m.group(1).strip()} {m.group(2)}'
            else:
                key = ''

        if not (is_data and key):
            continue

        # Convert every cell to a string (NaN becomes the empty string).
        for col in df.columns:
            df[col] = df[col].apply(_to_str)

        if key not in result:
            result[key] = df
            continue

        if len(df.columns) < len(HDRS):
            # Something like a lab section or an alternate meeting time.
            # Fill the missing columns with empty strings rather than NaN.
            previous = result[key].iloc[-1]
            while len(df.columns) < len(HDRS):
                df[len(df.columns)] = ''
            # Copy the course number (1) and the capacity / enrollment /
            # availability columns (7, 8, 9) from the preceding row.
            for col in (1, 7, 8, 9):
                df[col] = previous[col]
        result[key] = pd.concat([result[key], df], ignore_index=True)
    return result
def filter_to_requested(courses, requested_classes, missing):
    """Pick the requested courses out of *courses*.

    Args:
        courses: Mapping of course name to its data.
        requested_classes: Iterable of course names to keep.
        missing: List that receives (via append) each requested name which is
            not present in *courses*.

    Returns:
        A new dict holding only the requested courses that were found.
    """
    selected = {}
    for name in requested_classes:
        if name in courses:
            selected[name] = courses[name]
        else:
            missing.append(name)
    return selected
def filter_to_available(courses, include, exclude, missing):
    """Reduce each course to the sections that can actually be scheduled.

    Rows whose days column (2) is 'TBD' and rows of excluded sections are
    dropped. Unless 'all' is in *include*, the remaining rows are limited to
    sections with open seats, plus any explicitly included sections.

    Args:
        courses: dict of course name -> DataFrame with integer column labels
            in HDRS order (1 = Crn, 2 = Days, 9 = Avail), cells as strings.
        include: Section numbers to keep even when full; the special value
            'all' disables the open-seat filter entirely.
        exclude: Section numbers to drop.
        missing: List that receives (via append) each course left without a
            section by the open-seat filter.

    Returns:
        dict of course name -> filtered DataFrame.
    """
    available = {}
    include_all = 'all' in include
    for name, df in courses.items():
        known_sections = set(df[1].unique())
        # Remove 'TBD' sections
        df = df[df[2] != 'TBD']
        # exclude some sections
        for section in exclude:
            if section in known_sections:
                df = df[df[1] != section]
        if df.empty:
            # NOTE(review): a course that loses every row here is dropped
            # without being reported in `missing` -- confirm this is intended.
            continue
        # include some sections
        if include_all:
            available[name] = df
            continue
        # Filter by available slots. The cells are strings, so compare the
        # 'Avail' column numerically; comparing the raw strings orders them
        # lexicographically ('9' > '21') and discards sections with seats.
        # Cells that are not numbers count as "no seats".
        seats = pd.to_numeric(df[9], errors='coerce')
        z = df[seats > 0]
        for section in include:
            if section in known_sections and section not in z[1].unique():
                z = pd.concat([z, df[df[1] == section]], ignore_index=True)
        if z.empty:
            missing.append(name)
            continue
        available[name] = z
    return available
def run_class_scheduler(args, course_groups, required):
    """Download the course data, filter it and print the possible schedules.

    Args:
        args: Parsed command line options; reads .year, .term, .include,
            .exclude, .preferred_time and .limit.
        course_groups: dict of group key -> set of course names; one course
            is scheduled from each group.
        required: Course names that must be available. If any of them ends up
            in the missing list the function returns before scheduling.
    """
    rule = '-'*40
    print(rule)
    print(f'Academic year {args.year} quarter {args.term}')
    for members in course_groups.values():
        print('One course from : ' + ' '.join(sorted(members)))
    print(rule)

    requested_classes = [
        name for members in course_groups.values() for name in members]

    # Reads the courses from WWU for the requested_classes and term, one
    # request per distinct subject prefix.
    matches = (COURSE_RE.match(name) for name in requested_classes)
    subjects = {found.group(1) for found in matches if found}
    courses = {}
    for subject in subjects:
        listing = course_list(args.year, args.term, subject)
        print(f'Course list for {subject} has {len(listing)} courses')
        courses.update(listing)

    missing = []
    courses = filter_to_requested(courses, requested_classes, missing)
    for name, frame in courses.items():
        print(rule)
        print(name)
        print(frame)

    available = filter_to_available(courses, args.include, args.exclude, missing)
    if missing:
        print(rule)
        print(
            f'Courses {",".join(missing)} have no available sections; schedule may be incomplete')
        # If any of the "required" classes are unavailable, exit early.
        if any(name in required for name in missing):
            return

    # Transform the dataframes to the nested dict used by generate_schedule:
    # transformed[course][section][days] = time, read from columns 2 and 3.
    transformed = {}
    for name, frame in available.items():
        for section in frame[1].unique():
            meetings = frame[frame[1] == section]
            for days, hours in zip(meetings[2], meetings[3]):
                by_section = transformed.setdefault(name, {})
                slots = by_section.setdefault(section, {})
                slots[str(days).strip()] = str(hours).strip()

    schedules = generate_schedule(transformed, course_groups, args.preferred_time, args.limit)
    print('~'*40)
    print( f'Generated {schedules} possible schedules for selected courses.')
    print()
def main(argv):
    """Parse the command line and run the scheduler.

    Known options are handled by argparse. Everything argparse does not
    recognise is interpreted here: a bare word is a course (underscores become
    spaces, upper-cased) and a word starting with '-' switches the grouping
    mode for the courses that follow it.

    Args:
        argv: Full argument vector; argv[0] is used as the program name.
    """
    today = dt.date.today()

    # Determine the academic year for the scheduler, which is e.g. 2324.
    # Before May the academic year is the one that began the previous year.
    academic_year = today.year - 1 if today.month < 5 else today.year
    following_year = academic_year + 1
    default_year = f'{academic_year - 2000:02d}{academic_year - 1999:02d}'

    # Determine the term (quarter) for the scheduler.
    # Term is composed of 'YYYY' + Suffix:
    #   May 1:  Beginning of registration for FALL: 40
    #   Nov 1:  Beginning of registration for WINTER: 10
    #   Feb 15: Beginning of registration for SPRING: 20
    # Summer (30) is never auto-selected. The latest date already reached
    # decides the term.
    if today >= dt.date(following_year, 2, 15):
        default_term = f'{following_year:04d}20'
    elif today >= dt.date(academic_year, 11, 1):
        default_term = f'{following_year:04d}10'
    elif today >= dt.date(academic_year, 5, 1):
        default_term = f'{academic_year:04d}40'

    parser = argparse.ArgumentParser(
        prog=argv[0], description='Attempt to generate WWU schedule.')
    parser.add_argument(
        '-t', '--term', type=str, nargs='?', default=default_term,
        help="YYYYTT, where YYYY is the year, and TT is the term (Winter=10, Fall=40).")
    parser.add_argument(
        '-y', '--year', type=str, nargs='?', default=default_year,
        help="School year. For example, --year=2324.")
    parser.add_argument(
        '-i', '--include', action='append', default=[],
        help="Include these sections (course numbers) even they have no space.")
    parser.add_argument(
        '-e', '--exclude', action='append', default=[],
        help="Exclude these sections (course numbers).")
    parser.add_argument(
        '--preferred-time', type=str, default=None,
        help="Prefer classes which fall within this time range. Example: '10:00-3:30 pm'")
    parser.add_argument(
        '--limit', default=None, type=int,
        help="Print the N best scores.")
    args, unknown = parser.parse_known_args(argv[1:])

    # Long options typed with a single dash end up among the unknown words.
    single_dash_mistakes = (
        '-limit', '-term', '-year', '-include', '-exclude', '-preferred-time')
    failed = False
    course_groups = {}
    required = set()
    mode = '-required'
    groupnum = 0
    for word in unknown:
        if not word.startswith('-'):
            course = word.upper().replace('_', ' ')
            if mode.startswith('-r'):
                # Every required course forms a group of its own.
                groupnum += 1
                required.add(course)
            course_groups.setdefault(f'{mode}-{groupnum}', set()).add(course)
            continue
        flag = word.lower()
        if flag in single_dash_mistakes:
            print(f'Argument should use --; -{flag}', file=sys.stderr)
            failed = True
        # Repeating the current flag (or using -oneof) keeps the group number.
        if flag not in (mode, '-oneof'):
            groupnum += 1
        mode = flag

    if failed or not course_groups:
        print('Error specifying courses.', file=sys.stderr)
        print('Example: ', file=sys.stderr)
        print(f' python3 {argv[0]} ENG_101 PHYS_161 HIST_112', file=sys.stderr)
        print('', file=sys.stderr)
        return
    run_class_scheduler(args, course_groups, required)
# Run the scheduler only when executed as a script. The full argument vector
# is passed because main() uses argv[0] as the program name.
if __name__ == '__main__':
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment