Skip to content

Instantly share code, notes, and snippets.

@laramiel
Last active March 8, 2024 18:07
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save laramiel/38648acf2a0f8a0383280e6f645d154a to your computer and use it in GitHub Desktop.
Save laramiel/38648acf2a0f8a0383280e6f645d154a to your computer and use it in GitHub Desktop.
A python script to fetch available class slots for WWU schedule generation
#!/usr/bin/env python3
#
'''
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
WWU Schedule Generator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This requires python; it's probably easiest to run on linux or WSL.
First install the prerequisites:
$ pip3 install BeautifulSoup4 pandas html5lib ortools lxml
Then execute the script using python3:
$ python3 wwu_schedule.py 'CSCI 145' 'ENG 101' 'PHYS 161'
The script will attempt to generate all possible schedules for each of
the listed classes.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Options include:
--term indicates the term. This should be year followed by the semester code:
Winter 10
Spring 20
Summer 30
Fall 40
--year <academic year> Specify the academic year.
This should be the last two digits of the two consecutive years in which
the term falls, such as '2324'.
--limit <number> Number of schedules printed.
By default prints all possible schedules.
--include (-i) <section>
Includes the specific class section (course number).
--exclude (-e) <section>
Excludes the specific class section (course number).
--preferred-time <time spec>
Sets a preferred time range used to score schedules.
Classes starting before or ending after this range are penalized.
Example: "9:00-3:30 pm"
-a, -b, -c, -d <course>
Out of courses specified by -a, only one section will be scheduled.
Likewise for -b/-c/-d.
The simplest schedule generator invocation might be for two classes, like:
$ python wwu_schedule.py ENG_101 MATH_125 --limit 0
If more class selection is useful, increase the limit:
$ python wwu_schedule.py ENG_101 MATH_125 --limit 5
An even more complex invocation, which prefers 10:00 am classes
and needs to exclude some sections, might look like this:
$ python wwu_schedule.py PHYS_162 MATH_125 ENG_201 \
--preferred-time "10:00-2:15 pm" \
-i 10655 -e 10232 --limit 3
To generate a class list with the following combination of classes:
PHYS_163
HIST_112
Any of CSCI 241, 247, or 301
Any of COMM 224 or 235
$ python wwu_schedule.py PHYS_163 HIST_112 \
-a CSCI_241 CSCI_301 CSCI_247 \
-b COMM_235 -b COMM_224
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
'''
import requests
import re
import pandas as pd
import lxml
import html5lib
import datetime as dt
import argparse
import sys
import io
from ortools.sat.python import cp_model
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Time-table endpoint; course_list() POSTs the term/year/subject query here.
URI = 'https://web4u.banner.wwu.edu/pls/wwis/wwskcfnd.TimeTable'
# WWU courses are identified by a department prefix (two to four characters,
# sometimes containing a '/', e.g. 'A/HI') and a 3 digit catalog number.
#
# Group 1 is the department prefix, group 2 is the catalog number, and group 3
# is the whitespace (or empty end-of-string match) that must follow the number.
COURSE_RE = re.compile(
    r'('
    r'ACCT|AHE|AMST|ASLC|ANTH|ARAB|ART|A/HI|AECI|AUAP|ASTR|BNS|BIOL|BUS|C/AM|CHEM|CHIN|'
    r'CLST|CD|CSEC|CSD|COMM|C2C|CSCI|CISS|DISA|DNC|DATA|DSCI|DSGN|DIAD|ECE|EAST|ECON|EDUC|'
    r'EDAD|ESJ|EECE|ELED|ENRG|ENGR|ENG|ENTR|ESCI|ENVS|EUS|EXCE|FAIR|FIN|FREN|GEOL|GERM|GRAD|'
    r'GREK|HLED|HIST|HGST|HNRS|HRM|HSP|HUMA|ID|IT|IEP|IBUS|INTL|ITAL|JAPN|JOUR|KIN|LAT|LDST|'
    r'LIBR|LING|MGMT|MIS|MFGE|MACS|MKTG|MBA|MPAC|MSCI|MATH|M/CS|LANG|MDS|MLE|MUS|NURS|OPS|'
    r'PHIL|PA|PE|PEH|PHYS|PLSC|PME|PORT|PSY|RECR|RC|REL|RUSS|SALI|SCED|SEC|SMNR|SOC|SPAN|SPED|'
    r'SUST|TEOP|TESL|THTR|UEPP|WGSS'
    r')\s+(\d\d\d)(\s|$)')
# Column headers of one schedule row, in page order. The list position is the
# integer column label used to index the parsed DataFrames elsewhere in this
# file (e.g. df[1] is the Crn, df[9] is Avail).
HDRS = [
    'Term',          # 0
    'Crn',           # 1 (section / course reference number)
    'Days',          # 2
    'Time',          # 3
    'Instructor',    # 4
    'Room',          # 5
    'Addl Fees',     # 6
    'Cap',           # 7
    'Enrl',          # 8
    'Avail',         # 9
    'Waitlist',      # 10
    'Restrictions',  # 11
    'Attributes'     # 12
]
def _get_session():
    """Build a requests session that retries failed requests.

    The same retrying adapter is mounted for both ``http://`` and
    ``https://`` URLs; connect and read errors are each retried up to 10
    times with a backoff factor of 0.2.
    """
    session = requests.Session()
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(connect=10, read=10, backoff_factor=0.2))
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session
def _to_str(x):
if isinstance(x, str):
return x
x = str(x)
return x.replace('nan', '')
# TODO: Use a time interval parser.
def class_time_to_minute_intervals(input):
    """Convert a time range such as ``'11:30-12:50 pm'`` to minutes of the day.

    The range is ``H:MM-H:MM`` optionally followed by a space and an am/pm
    marker that applies to the whole range. Whitespace around the dash and
    upper-case markers are accepted. When the text ends in ``pm``, any hour
    below 8 is taken to be an afternoon hour and shifted by 12; hours of 8
    and above are left alone (so ``'9:00-3:30 pm'`` is 9:00 to 15:30).

    Args:
      input: the time range text. (The name shadows the builtin but is kept
        so existing keyword callers continue to work.)

    Returns:
      A ``(start, end)`` tuple of minutes since midnight.

    Raises:
      ValueError: if the text is not a recognizable time range.
    """
    spec = input.strip().lower()
    try:
        # Strip each side so '9:00 - 3:30 pm' parses like '9:00-3:30 pm'.
        start, end = (part.strip() for part in spec.split('-'))
        start_h, start_m = (int(v) for v in start.split(':'))
        # The end side may carry the trailing am/pm marker; drop it.
        end_h, end_m = (int(v) for v in end.split()[0].split(':'))
    except (ValueError, IndexError) as err:
        raise ValueError(f'Unrecognized time range: {input!r}') from err
    if spec.endswith('pm'):
        if end_h < 8:
            end_h += 12
        if start_h < 8:
            start_h += 12
    return (start_h * 60 + start_m, end_h * 60 + end_m)
# start_h
class SchedulePrinter(cp_model.CpSolverSolutionCallback):
    """Solution callback that records each schedule and prints them by score.

    Every solution reported by the solver is rendered to text and stored
    together with the value of the objective expression; ``print_all`` then
    prints the stored schedules in ascending score order.
    """

    def __init__(self, courses, course_vars, objective, limit):
        """Store the lookup tables needed to render a solution.

        Args:
          courses: dict[class][section][dayspec] = timespec.
          course_vars: dict mapping (class, section) to its boolean variable.
          objective: the score expression evaluated for each solution.
          limit: maximum number of schedules printed; a falsy value (None
            or 0) prints all of them.
        """
        cp_model.CpSolverSolutionCallback.__init__(self)
        self.__courses = courses
        self.__course_vars = course_vars
        self.__solution_count = 0
        self.__objective = objective
        self.__solutions = []
        self.__limit = limit

    def solution_count(self):
        """Return how many solutions the solver has reported so far."""
        return self.__solution_count

    def on_solution_callback(self):
        """Render the current solution and remember it with its score."""
        self.__solution_count += 1
        score = self.Value(self.__objective)
        parts = []
        for (course, section), var in self.__course_vars.items():
            if not self.Value(var):
                continue
            parts.append(f'\n{course} section {section}\n')
            for days, times in self.__courses[course][section].items():
                # Right-align the day letters in a 6 character column. The
                # previous slice-based padding used a negative start index
                # and cut the front off labels such as 'MWF'.
                parts.append(f'{days:>6} {times}\n')
        self.__solutions.append((score, ''.join(parts)))

    def print_all(self):
        """Print the recorded schedules, lowest score first, up to the limit."""
        print('~' * 40)
        print(f'Possible schedules ({self.__solution_count})')
        ranked = sorted(self.__solutions, key=lambda item: item[0])
        for shown, (score, txt) in enumerate(ranked, start=1):
            print('~' * 40)
            print(f'Score: {score}\n{txt}')
            if self.__limit and shown >= self.__limit:
                break
def generate_schedule(courses, course_groups, preferred_time, limit):
    """Enumerate conflict-free schedules with the OR-tools CP-SAT solver.

    See:
      https://developers.google.com/optimization/reference/python/index_python
      http://www.hakank.org/google_or_tools/

    Args:
      courses: dict[class][section][dayspec] = timespec, like:
          {
            'CSCI 145': {
              '44138': {'MWF': '11:00-11:50 am', 'T': '12:00-01:50 pm'},
              '41690': {'MWF': '10:00-10:50 am', 'W': '12:00-01:50 pm'}},
            'PHYS 161': {
              '41569': {'MWRF': '11:30-12:50 pm', 'T': '04:00-05:50 pm'}},
          }
      course_groups: dict mapping a group key to the collection of class
        names in that group. Exactly one section is chosen among all the
        sections of a group's classes.
      preferred_time: optional time range text used for scoring; when falsy
        the range 9:30 - 3:15 is used.
      limit: forwarded to SchedulePrinter as the print limit. A limit of 0
        additionally asks the solver to minimize the score.

    Returns:
      The number of solutions reported by the solver.
    """
    course_vars = {}
    # Optional intervals bucketed by day letter; intervals sharing a day
    # must not overlap. Weekdays are pre-seeded and any other day letter
    # found in the data gets its own bucket on demand.
    interval_vars = {
        'M': [],
        'T': [],
        'W': [],
        'R': [],
        'F': [],
    }
    objective = 0

    # Add a time-preference objective function. Classes starting before or
    # ending after the time range get worse (higher) scores.
    p_time = (570, 915)  # 9:30 - 3:15
    if preferred_time:
        p_time = class_time_to_minute_intervals(preferred_time)

    course_group_vars = {}
    model = cp_model.CpModel()
    for course, sections in courses.items():
        c = course.lower().replace(' ', '_')
        section_vars = []
        for section, times in sections.items():
            # A boolean variable for each (class, section), indicating the
            # section that is chosen.
            var = model.NewBoolVar(f'selected_{c}_{section}')
            course_vars[(course, section)] = var
            section_vars.append(var)

            # An optional interval variable for each (class, section, day)
            # covering the class time and controlled by the (class, section)
            # boolean.
            for dayspec, timespec in times.items():
                start, end = class_time_to_minute_intervals(timespec)
                for d in dayspec:
                    intvar = model.NewOptionalIntervalVar(
                        start, end - start, end, var,
                        f'time_{c}_{section}_{d}')
                    # setdefault: a day letter outside M/T/W/R/F used to
                    # raise KeyError here.
                    interval_vars.setdefault(d, []).append(intvar)

                    # Scoring: each class session counts as 1, plus the
                    # number of minutes (at least 2) that the session starts
                    # before or ends after the preferred range.
                    objective += var
                    if start < p_time[0]:
                        objective += max(2, p_time[0] - start) * var
                    if end > p_time[1]:
                        objective += max(2, end - p_time[1]) * var

        # Collect this class's section booleans into every group that
        # lists the class.
        for opt_k, opt_s in course_groups.items():
            if course in opt_s:
                course_group_vars.setdefault(opt_k, []).extend(section_vars)

    # Exactly one section is selected from each group (sum of booleans = 1).
    for opt_s in course_group_vars.values():
        model.Add(sum(opt_s) == 1)

    # A NoOverlap constraint per day over all the optional intervals.
    for v in interval_vars.values():
        if v:
            model.AddNoOverlap(v)

    if limit == 0:
        model.Minimize(objective)

    # Now solve...
    printer = SchedulePrinter(courses, course_vars, objective, limit)
    solver = cp_model.CpSolver()
    solver.parameters.enumerate_all_solutions = True
    solver.Solve(model, printer)
    printer.print_all()
    return printer.solution_count()
def course_list(curr_yr, term, subject):
    """Fetch and parse the time table of one subject.

    Reads the course list from the web by POSTing to

      https://web4u.banner.wwu.edu/pls/wwis/wwskcfnd.TimeTable

    with these form fields:

      term     <select id='term' name='term'>, format <YEAR><TERM>
               (the page default is 'All')
      curr_yr  <INPUT TYPE='hidden' NAME='curr_yr' VALUE='2324'>
      subj     <select id='subj' name='subj'>

    Like most HTML parsing, this is a hack, as the webpage uses tables for
    formatting. The strategy is to parse each top-level table separately
    using bs4 and convert it to a pandas table. A table with roughly as many
    columns as HDRS is treated as a schedule row; any other table is checked
    for a class header (class name, description, etc.) using COURSE_RE, and
    the schedule rows that follow are attached to that class.

    Args:
      curr_yr: academic year form value, e.g. '2324'.
      term: term form value.
      subject: department prefix, e.g. 'CSCI'.

    Returns:
      A dict mapping 'SUBJ NNN' to a DataFrame of that class's schedule
      rows, with every cell converted to a string. Columns are addressed by
      integer position in the same order as HDRS.

    Raises:
      requests.HTTPError: via raise_for_status() on an error response.
    """
    # Close the session (and its connection pool) once the page is fetched.
    with _get_session() as session:
        r = session.post(URI, data={
            'term': term,
            'curr_yr': curr_yr,
            'subj': subject,
        })
    r.raise_for_status()

    result = {}
    soup = BeautifulSoup(r.text, features='lxml')
    key = ''
    for table in soup.find_all('table'):
        # Only top-level tables are rows; nested tables are parsed as part
        # of their parent.
        if table.find_parent('table') is not None:
            continue
        try:
            t = pd.read_html(io.StringIO(str(table)), thousands=None)
        except Exception:
            # Best effort: skip tables pandas cannot parse, but let
            # KeyboardInterrupt / SystemExit propagate.
            continue
        if len(t) != 1:
            continue
        df = t[0]

        if len(HDRS) - 1 <= len(df.columns) <= len(HDRS):
            # This 'table' is likely a row in the course schedule.
            # (the site uses bad HTML formatting, where each row is a
            # separate table element)
            # Lab/additional sections may have fewer html columns.
            is_data = True
            if df[0][0] == HDRS[0]:
                continue  # header row
        else:
            # This 'table' is likely to be merely a formatting construct in
            # the webpage. It may be a class header row, which includes the
            # class name, description, etc.; detect this using regular
            # expressions.
            is_data = False
            tmp = str(df[0][0]).strip()
            m = COURSE_RE.match(tmp)
            if m and tmp.startswith(subject):
                # course name found
                key = f'{m.group(1).strip()} {m.group(2)}'
            else:
                key = ''

        # For data tables, append them to the existing schedule table.
        if not (is_data and key):
            continue

        # Convert float -> empty string values.
        for x in range(len(df.columns)):
            df[x] = df[x].apply(_to_str)
        if key not in result:
            result[key] = df
            continue

        existing = result[key]
        if len(df.columns) < len(HDRS):
            # This is something like a lab section, or alternate meeting
            # time on other days. In any case, fill in the remaining columns
            # with empty string rather than NaN.
            while len(df.columns) < len(HDRS):
                df[len(df.columns)] = ''
            # copy course number
            df[1] = existing.iloc[-1][1]
            # copy availability
            df[7] = existing.iloc[-1][7]
            df[8] = existing.iloc[-1][8]
            df[9] = existing.iloc[-1][9]
        result[key] = pd.concat([result[key], df], ignore_index=True)
    return result
def filter_to_requested(courses, requested_classes, missing):
    """Pick the requested classes out of *courses*.

    Args:
      courses: dict of every class that was fetched, keyed by class name.
      requested_classes: iterable of class names to keep.
      missing: list that receives each requested name not found in
        *courses* (appended in request order).

    Returns:
      A new dict holding only the requested classes that were found.
    """
    found = {}
    for name in requested_classes:
        if name in courses:
            found[name] = courses[name]
        else:
            missing.append(name)
    return found
def filter_to_available(courses, include, exclude, missing):
    """Reduce each class to the sections that can actually be scheduled.

    For every class: rows whose Days column is 'TBD' are dropped, sections
    listed in *exclude* are dropped, and then only sections with open seats
    are kept, plus any section explicitly listed in *include*.

    Args:
      courses: dict mapping class name to its schedule DataFrame, with
        integer column labels (1 = Crn, 2 = Days, 9 = Avail) holding strings.
      include: section numbers to keep even when full; the special value
        'all' keeps every remaining section regardless of seats.
      exclude: section numbers to drop.
      missing: list that receives the name of each class left without any
        open or included section.

    Returns:
      A dict mapping class name to the filtered DataFrame. A class whose
      rows are all 'TBD' or excluded is omitted without being reported in
      *missing*.
    """
    available = {}
    for name, df in courses.items():
        # Section numbers as listed before any filtering.
        sections = set(df[1].unique())

        # Remove 'TBD' sections
        df = df[df[2] != 'TBD']

        # exclude some sections
        dropped = [s for s in exclude if s in sections]
        if dropped:
            df = df[~df[1].isin(dropped)]
        if df.empty:
            continue

        # include some sections
        if 'all' in include:
            available[name] = df
            continue

        # Filter by available slots. Cells are strings, so convert the
        # Avail column to numbers first; comparing the raw strings (and
        # comparing Enrl against Avail) rejected sections with open seats.
        # Cells that are not numeric count as no availability.
        seats = pd.to_numeric(df[9], errors='coerce')
        open_rows = df[seats > 0]
        for s in include:
            if s in sections and s not in set(open_rows[1]):
                open_rows = pd.concat(
                    [open_rows, df[df[1] == s]], ignore_index=True)
        if open_rows.empty:
            missing.append(name)
            continue
        available[name] = open_rows
    return available
def run_class_scheduler(args, course_groups, required):
    """Fetch the requested classes, filter them, and print the schedules.

    Args:
      args: parsed command line options; reads year, term, include,
        exclude, preferred_time and limit.
      course_groups: dict mapping a group key to the set of class names of
        which one is to be scheduled.
      required: class names that must be available; if any of them ends up
        without an available section the function returns before scheduling.
    """
    rule = '-' * 40
    print(rule)
    print(f'Academic year {args.year} quarter {args.term}')
    for members in course_groups.values():
        print('One course from : ' + ' '.join(sorted(members)))
    print(rule)

    requested_classes = [
        name for members in course_groups.values() for name in members]

    # Reads the courses from WWU for the requested_classes and term; one
    # request is made per distinct department prefix.
    subjects = {
        found.group(1)
        for found in map(COURSE_RE.match, requested_classes)
        if found
    }
    courses = {}
    for subject in subjects:
        listing = course_list(args.year, args.term, subject)
        print(f'Course list for {subject} has {len(listing)} courses')
        courses.update(listing)

    missing = []
    courses = filter_to_requested(courses, requested_classes, missing)
    for name, frame in courses.items():
        print(rule)
        print(name)
        print(frame)

    available = filter_to_available(
        courses, args.include, args.exclude, missing)
    if missing:
        print(rule)
        print(
            f'Courses {",".join(missing)} have no available sections; schedule may be incomplete')
        # If any of the "required" classes are unavailable, exit early.
        if any(name in required for name in missing):
            return

    # Transform the dataframes to the nested dict used by generate_schedule:
    # transformed[class][section][days] = time, read from columns 2 and 3.
    transformed = {}
    for name, frame in available.items():
        for section in frame[1].unique():
            meetings = frame[frame[1] == section]
            for days, hours in zip(meetings[2], meetings[3]):
                slots = transformed.setdefault(name, {}).setdefault(section, {})
                slots[str(days).strip()] = str(hours).strip()

    schedules = generate_schedule(
        transformed, course_groups, args.preferred_time, args.limit)
    print('~' * 40)
    print(f'Generated {schedules} possible schedules for selected courses.')
    print()
def main(argv):
    """Parse the command line and run the scheduler.

    Known options are handled by argparse. Everything else is walked in
    order: a word starting with '-' switches the grouping mode, any other
    word is a class name added to the current group. Classes given before
    any mode switch are "required" and each forms a group of its own.

    Args:
      argv: the full argument vector, program name first.
    """
    today = dt.date.today()

    # Determine the academic year for the scheduler, which is e.g. 2324.
    # Before May the academic year is the one that began last calendar year.
    academic_year = today.year - 1 if today.month < 5 else today.year
    default_year = "%02d%02d" % (academic_year - 2000, academic_year - 1999)

    # Determine the term (quarter) for the scheduler.
    # Term is composed of 'YYYY' + Suffix; the latest registration opening
    # that has already passed wins. Summer (30) is never auto-selected.
    registration_openings = (
        # May 1: Beginning of registration for FALL: 40
        (dt.date(year=academic_year, month=5, day=1),
         "%04d40" % (academic_year,)),
        # Nov 1: Beginning of registration for WINTER: 10
        (dt.date(year=academic_year, month=11, day=1),
         "%04d10" % (academic_year + 1,)),
        # Feb 15: Beginning of registration for SPRING: 20
        (dt.date(year=academic_year + 1, month=2, day=15),
         "%04d20" % (academic_year + 1,)),
    )
    for opens_on, term_code in registration_openings:
        if today >= opens_on:
            default_term = term_code

    parser = argparse.ArgumentParser(
        prog=argv[0], description='Attempt to generate WWU schedule.')
    parser.add_argument(
        '-t', '--term', type=str, nargs='?', default=default_term,
        help="YYYYTT, where YYYY is the year, and TT is the term (Winter=10, Fall=40).")
    parser.add_argument(
        '-y', '--year', type=str, nargs='?', default=default_year,
        help="School year. For example, --year=2324.")
    parser.add_argument(
        '-i', '--include', action='append', default=[],
        help="Include these sections (course numbers) even they have no space.")
    parser.add_argument(
        '-e', '--exclude', action='append', default=[],
        help="Exclude these sections (course numbers).")
    parser.add_argument(
        '--preferred-time', type=str, default=None,
        help="Prefer classes which fall within this time range. Example: '10:00-3:30 pm'")
    parser.add_argument(
        '--limit', default=None, type=int,
        help="Print the N best scores.")
    args, unknown = parser.parse_known_args(argv[1:])

    # Long options mistakenly written with a single dash.
    single_dash_mistakes = (
        '-limit', '-term', '-year', '-include', '-exclude', '-preferred-time')
    failed = False
    course_groups = {}
    required = set()
    mode = '-required'
    groupnum = 0
    for word in unknown:
        if not word.startswith('-'):
            course = word.upper().replace('_', ' ')
            if mode.startswith('-r'):
                # Every required class gets a group to itself.
                groupnum += 1
                required.add(course)
            course_groups.setdefault(f'{mode}-{groupnum}', set()).add(course)
            continue

        flag = word.lower()
        if flag in single_dash_mistakes:
            print(f'Argument should use --; -{flag}', file=sys.stderr)
            failed = True
        # Repeating the current flag (or using -oneof) keeps the group
        # number; any other flag starts a new group.
        if flag not in (mode, '-oneof'):
            groupnum += 1
        mode = flag

    if failed or not course_groups:
        print('Error specifying courses.', file=sys.stderr)
        print('Example: ', file=sys.stderr)
        print(f' python3 {argv[0]} ENG_101 PHYS_161 HIST_112', file=sys.stderr)
        print('', file=sys.stderr)
        return
    run_class_scheduler(args, course_groups, required)
# Script entry point: hand the full argument vector (program name included)
# to main() only when this file is executed directly.
if __name__ == '__main__':
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment