# Last active September 11, 2016 00:25
"""Parses information about CS courses at University of Tampere."""
import requests
import re
import time
import json
from argparse import ArgumentParser
from bs4 import BeautifulSoup as bs4
from datetime import datetime as dt
from datetime import timedelta
from io import StringIO
def is_period_name(tag):
    """Tell whether *tag* is the element that names a teaching period.

    Written as a filter callable for BeautifulSoup's ``find``: a tag
    qualifies when it has an ``id`` attribute that starts with
    ``opsi_periodi_`` (compared case-insensitively).

    Args:
        tag: An object offering ``has_attr(name)`` and ``tag[name]`` access.

    Returns:
        True when the tag's id carries the period prefix, otherwise False.
    """
    return tag.has_attr("id") and tag["id"].lower().startswith("opsi_periodi_")
def get_period_info(p):
    """Extract the id, start date and end date of one teaching period.

    The period is described by the first descendant of *p* accepted by
    ``is_period_name``. Its ``id`` attribute carries the period number
    (``opsi_periodi_<digit>``) and its text carries the date range in the
    form ``(d.m.yyyy ... d.m.yyyy)``.

    Args:
        p: A BeautifulSoup tag wrapping a single period.

    Returns:
        A ``(period_id, start, end)`` tuple with an int and two datetimes,
        or ``(None, None, None)`` when any part cannot be found or parsed.
    """
    failure = (None, None, None)

    info_tag = p.find(is_period_name)
    if info_tag is None:
        return failure

    tag_id = info_tag["id"].lower()
    id_match = re.match(r"^opsi_periodi_(\d)", tag_id)
    if id_match is None:
        print("Can't parse period id from \"%s\"" % tag_id)
        return failure
    p_id = int(id_match.group(1))

    # The heading text is laid out over several lines; flatten it first.
    p_str = info_tag.text.strip().replace("\r", "").replace("\t", "").replace("\n", "")
    range_match = re.search(
        r"\((\d{1,2}\.\d{1,2}\.\d{4}).*?(\d{1,2}\.\d{1,2}\.\d{4})\)", p_str
    )
    if range_match is None:
        print("Can't parse start and end date from \"%s\"" % p_str)
        return failure

    try:
        p_start = dt.strptime(range_match.group(1), "%d.%m.%Y")
        p_end = dt.strptime(range_match.group(2), "%d.%m.%Y")
    except ValueError:
        # Looks like a date but is not a valid calendar day (e.g. 31.2.2016).
        print("Can't parse start and end date from \"%s\"" % p_str)
        return failure
    return (p_id, p_start, p_end)
def parse_points(raw):
    """Parse a course extent such as ``"5 op"`` or ``"1-3 ECTS"``.

    Matching is case-insensitive. A range is recognised when the text
    contains a hyphen or an en dash and starts with ``<min><sep><max>``;
    otherwise (or when the range form does not fit) a single leading
    amount is looked for.

    Args:
        raw: The extent text as it appears on the page.

    Returns:
        A ``(minimum, maximum, unit)`` tuple where unit is ``"op"`` or
        ``"ects"`` (minimum equals maximum for a single amount), or
        ``False`` when the text matches neither form.
    """
    raw = raw.lower()
    if "–" in raw or "-" in raw:
        # E.g. "1-3 op" / "1-3 ects"
        m = re.match(r"^(?P<min>\d+)\W(?P<max>\d+)\s(?P<type>(?:op)|(?:ects))", raw)
        if m is not None:
            return (int(m.group("min")), int(m.group("max")), m.group("type"))
        # A dash elsewhere in the text (e.g. a trailing remark) must not
        # hide a valid single amount, so fall through.

    # E.g. "5 op" / "5 ects"
    m = re.match(r"^(?P<amount>\d+)\s(?P<type>(?:op)|(?:ects))", raw)
    if m is None:
        return False
    amount = int(m.group("amount"))
    return (amount, amount, m.group("type"))
def json_serial(obj):
    """Serialise values the standard ``json`` encoder cannot handle.

    Meant to be passed as the ``default=`` callback of ``json.dump``.

    Args:
        obj: The value the encoder could not serialise.

    Returns:
        The ISO 8601 string of *obj* when it is a datetime.

    Raises:
        TypeError: If *obj* is not a datetime.
    """
    if isinstance(obj, dt):
        return obj.isoformat()
    raise TypeError("Type not serializable: %s" % type(obj).__name__)
# Teaching-language names as they appear on the page, mapped to language codes.
_LANGUAGE_CODES = {"suomi": "fi", "finnish": "fi", "englanti": "en", "english": "en"}

# Course date range, Finnish ("8.9.2016 - 4.11.2016") then English
# ("8-Sep-2016 - 4-Nov-2016") notation.
_COURSE_RANGE_PATTERNS = (
    r"^(\d{1,2}\.\d{1,2}\.\d{4}).*?\W(\d{1,2}\.\d{1,2}\.\d{4})",
    # \W (not \S) before the second date: \S cannot match the separating
    # space and would otherwise swallow the first digit of a two-digit day.
    r"^(\d{1,2}-.*?-\d{4}).*?\W(\d{1,2}-.*?-\d{4})",
)

# One-off lecture, Finnish then English notation.
_SINGLE_LECTURE_PATTERNS = (
    r"\w{2}\s(?P<date>\d{1,2}\.\d{1,2}\.\d{4})\sklo\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$",
    r"\w{3}\s(?P<date>\d{1,2}-.*?-\d{4})\sat\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$",
)

# Weekly recurring lecture, Finnish then English notation.
_WEEKLY_LECTURE_PATTERNS = (
    r"\w{2}\s(?P<startDate>\d{1,2}\.\d{1,2}\.\d{4})\s-\s(?P<endDate>\d{1,2}\.\d{1,2}\.\d{4})\sviikoittain\sklo\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$",
    r"\w{3}\s(?P<startDate>\d{1,2}-.*?-\d{4})\s-\s(?P<endDate>\d{1,2}-.*?-\d{4})\sweekly\sat\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$",
)

# Exceptional lecture time (hours are optional), Finnish then English notation.
# NOTE: English date parsing is untested.
_EXCEPTION_LECTURE_PATTERNS = (
    r"(?P<date>\d{1,2}\.\d{1,2}\.\d{4}).*?(?:klo\s(?P<startH>\d{1,2}).*?\W(?P<endH>\d{1,2}).*?)?,\s(?P<loc>.*?)$",
    # The optional hours group is closed here like in the Finnish variant;
    # left unterminated the pattern cannot be compiled at all.
    r"(?P<date>\d{1,2}-.*?-\d{4}).*?(?:at\s(?P<startH>\d{1,2}).*?\W(?P<endH>\d{1,2}).*?)?,\s(?P<loc>.*?)$",
)


def _first_match(patterns, text):
    """Return the first successful ``re.match`` of *patterns* on *text*, or None."""
    for pattern in patterns:
        m = re.match(pattern, text)
        if m is not None:
            return m
    return None


def _flatten(text):
    """Remove carriage returns, newlines and tabs from *text*."""
    return text.replace("\r", "").replace("\n", "").replace("\t", "")


def _parse_date(raw):
    """Parse ``d.m.yyyy`` or ``d-Mon-yyyy``; raise ValueError for anything else."""
    try:
        return dt.strptime(raw, "%d.%m.%Y")
    except ValueError:
        # TODO: Fails with systems that use other locale than en_US
        # Use locale.setlocale()
        return dt.strptime(raw, "%d-%b-%Y")


def _field_after_heading(soup, tag_name, headings, **filters):
    """Return the value div that follows the first heading named in *headings*.

    Headings are *tag_name* elements (optionally narrowed by *filters*) whose
    lower-cased text equals one of *headings*. None is returned when no such
    heading exists or it has no ``opsi_toteuma_kentta`` sibling.
    """
    for heading in soup.find_all(tag_name, **filters):
        if heading.get_text().lower() in headings:
            return heading.find_next_sibling("div", class_="opsi_toteuma_kentta")
    return None


def _parse_periods(soup):
    """Return the period numbers (1-4) the course is flagged for in the infobox."""
    infobox = soup.find("div", class_="marjapuuro_infobox")
    period_box = None
    if infobox is not None:
        period_box = infobox.find("div", class_="opsi_toteuma_periodit")
    if period_box is None:
        return []

    # NOTE(review): the loop body was not fully visible in the reviewed
    # source; this assumes the n-th image stands for period n and that a
    # file name containing "periodi<n>." marks the period as active - confirm.
    periods = []
    for pid, img in enumerate(period_box.find_all("img")[:4], start=1):
        if re.search(r"periodi%d\." % pid, img.get("src", "")):
            periods.append(pid)
    return periods


def _parse_language(soup):
    """Return ``"fi"``, ``"en"`` or None for the course's teaching language."""
    field = _field_after_heading(
        soup, "div", ("opetuskieli", "language of instruction"), class_="infobox_header"
    )
    if field is None:
        return None
    # Crappy conversion to language codes
    return _LANGUAGE_CODES.get(field.get_text(strip=True).lower())


def _parse_teachers(soup):
    """Return one ``{"name", "role", "email"}`` dict per teacher block."""
    teachers = []
    for block in soup.find_all("div", class_="ope"):
        strings = list(block.stripped_strings)
        if not strings:
            continue
        # First string is "Name, role"; a missing role becomes "".
        name, _, role = strings[0].partition(",")
        email = strings[1].replace("[ät]", "@") if len(strings) > 1 else None
        teachers.append({"name": name.strip(), "role": role.strip(), "email": email})
    return teachers


def _parse_homepage(soup):
    """Return the course homepage URL, or None when the page lists none.

    Expected markup::

        <h2>Opintojakson kotisivu</h2>
        <div class="opsi_toteuma_kentta">
            <a href=""></a>
        </div>
    """
    field = _field_after_heading(soup, "h2", ("opintojakson kotisivu", "homepage url"))
    link = field.find("a") if field is not None else None
    return link.get("href") if link is not None else None


def _parse_course_dates(t_data):
    """Return ``(start, end)`` datetimes read from the first string of *t_data*.

    The teaching field is expected to begin with a range such as
    ``8.9.2016 - 4.11.2016``. ``(None, None)`` is returned when the field
    is missing, empty or not in a known format.
    """
    if t_data is None:
        return (None, None)
    strings = list(t_data.stripped_strings)
    if not strings:
        return (None, None)

    raw_dates = _flatten(strings[0])
    m = _first_match(_COURSE_RANGE_PATTERNS, raw_dates)
    if m is not None:
        try:
            return (_parse_date(m.group(1)), _parse_date(m.group(2)))
        except ValueError:
            pass  # reported below together with the no-match case
    # Give up
    print("Date parsing failed, didn't match known formats: \"%s\"" % raw_dates)
    return (None, None)


def _parse_lecture_hours(t_data):
    """Return the total lecture hours stated in *t_data*, or -1 when unknown."""
    hours = -1
    for method in t_data.find_all("div", class_="opsi_toteuma_opetustapa"):
        strings = [s.lower() for s in method.stripped_strings]
        # Most likely the string right after the heading contains the hours.
        for name, following in zip(strings, strings[1:]):
            if name in ("luento-opetus", "lectures"):
                m = re.match(r"^(\d+)\s(?:(?:tuntia)|(?:hours))$", following)
                hours = int(m.group(1)) if m is not None else -1
    return hours


def _lecture(day, start_hour, end_hour, location, is_exception):
    """Build one lecture record on *day* between the given whole hours."""
    return {
        "start": day.replace(hour=int(start_hour)),
        "end": day.replace(hour=int(end_hour)),
        "location": location,
        "exception": is_exception,
    }


def _parse_regular_lecture(text):
    """Return the lecture records described by one schedule line.

    A single-date line gives one record, a weekly line one record per week
    from the start date through the end date, anything else an empty list.
    Raises ValueError when a matched date or hour is not valid.
    """
    # TODO: Parse location data more for information
    m = _first_match(_SINGLE_LECTURE_PATTERNS, text)
    if m is not None:
        day = _parse_date(m.group("date"))
        return [_lecture(day, m.group("startH"), m.group("endH"), m.group("loc"), False)]

    m = _first_match(_WEEKLY_LECTURE_PATTERNS, text)
    if m is None:
        return []
    cur_date = _parse_date(m.group("startDate"))
    last_date = _parse_date(m.group("endDate"))
    found = []
    while cur_date <= last_date:
        found.append(_lecture(cur_date, m.group("startH"), m.group("endH"), m.group("loc"), False))
        cur_date += timedelta(days=7)  # Skip to the next week
    return found


def _collect_regular_lectures(method_div, lectures):
    """Append every regular lecture listed in *method_div* to *lectures*."""
    for slot in method_div.find_all("div", class_="opsi_opetustapa_ajankohta"):
        strings = list(slot.stripped_strings)
        if not strings:
            continue
        try:
            lectures.extend(_parse_regular_lecture(strings[0]))
        except ValueError:
            # Best effort: one unparsable line must not discard the others.
            continue


def _apply_lecture_exceptions(method_div, lectures):
    """Merge the exceptional lecture times of *method_div* into *lectures*.

    An exception on a date that already has a lecture replaces that entry
    (hours missing from the exception are taken from it); otherwise it is
    appended as a new entry. Entries are marked with ``"exception": True``.
    """
    # TODO: Handle weekly exceptions?
    for slot in method_div.find_all("div", class_="opsi_opetustapa_ajankohta_poikkeus"):
        strings = list(slot.stripped_strings)
        if not strings:
            continue
        m = _first_match(_EXCEPTION_LECTURE_PATTERNS, _flatten(strings[0]))
        if m is None:
            continue

        try:
            day = _parse_date(m.group("date"))

            # Get possibly existing entry (the last one on that day wins).
            ex_id = -1
            for i, known in enumerate(lectures):
                if known["start"].date() == day.date():
                    ex_id = i
            existing = lectures[ex_id] if ex_id >= 0 else None

            start = end = day
            if m.group("startH") is not None:
                start = day.replace(hour=int(m.group("startH")))
            elif existing is not None:
                start = day.replace(hour=existing["start"].hour)
            if m.group("endH") is not None:
                end = day.replace(hour=int(m.group("endH")))
            elif existing is not None:
                end = day.replace(hour=existing["end"].hour)
        except ValueError:
            # Best effort: skip an exception with an invalid date or hour.
            continue

        entry = {"start": start, "end": end, "location": m.group("loc"), "exception": True}
        if existing is not None:
            # Replace existing lecture information
            lectures[ex_id] = entry
        else:
            lectures.append(entry)


def get_course_info(cid):
    """Fetch and parse the detail page of one course implementation.

    Args:
        cid: The course implementation id, sent as the ``id`` query parameter.

    Returns:
        A dict with the keys ``teachers`` (list of name/role/email dicts),
        ``homepage`` (URL or None), ``periods`` (list of period numbers),
        ``language`` ("fi", "en" or None), ``start`` / ``end`` (datetimes or
        None), ``lectures`` (list of start/end/location/exception dicts) and
        ``lecture_hours`` (int, -1 when unknown). Parts of the page that are
        missing or unparsable yield the empty/None/-1 value for that key.
    """
    # NOTE(review): the endpoint URL is empty in the reviewed source
    # (possibly stripped); the request cannot succeed until it is filled in.
    base_url = ""
    params = {
        "id": cid
    }
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }
    # Without a timeout a stalled server would block the whole run.
    r = requests.get(base_url, params=params, headers=head, timeout=30)
    bs = bs4(r.text, "html.parser")

    c_periods = _parse_periods(bs)
    c_lang = _parse_language(bs)
    teachers = _parse_teachers(bs)
    web_addr = _parse_homepage(bs)

    # The "teaching" field holds the course dates and all lecture details.
    t_data = _field_after_heading(bs, "h2", ("opetus", "teaching"))
    c_start, c_end = _parse_course_dates(t_data)

    hours = -1
    lectures = []
    if t_data is not None:
        hours = _parse_lecture_hours(t_data)
        # First teaching method is (always?) the lecture dates/times.
        first_method = t_data.find("div", class_="opsi_toteuma_opetustapa")
        if first_method is not None:
            _collect_regular_lectures(first_method, lectures)
            _apply_lecture_exceptions(first_method, lectures)

    # TODO: Parse exercise times
    return {"teachers": teachers, "homepage": web_addr, "periods": c_periods, "language": c_lang, "start": c_start, "end": c_end, "lectures": lectures, "lecture_hours": hours}
if __name__ == "__main__":
    # Script entry point: read the course catalogue page, collect every
    # period with its courses (optionally filtered by course code) and
    # emit the result as JSON or as a plain-text listing.
    parser = ArgumentParser(description="UTA course information parser", epilog="If no flags are specified, defaults to all courses text output to standard output.")
    parser.add_argument("-c", "--courses", type=str, dest="courses", help="Course codes (separated with a comma)")
    parser.add_argument("-j", "--json", dest="json", action="store_true", help="Output JSON")
    parser.add_argument("-i", "--ical", dest="ical", action="store_true", help="Output iCalendar")
    parser.add_argument("-d", "--dest", dest="dest", type=str, help="Output to file instead of standard output")
    args = parser.parse_args()

    # Course codes to include; an empty list means "all courses".
    what_courses = []
    if args.courses is not None:
        what_courses = [code.strip() for code in args.courses.split(",") if code.strip()]

    # Settings for fetching the live catalogue page (currently unused, see below).
    # NOTE(review): the URL is empty in the reviewed source (possibly stripped).
    base_url = ""
    params = {
        "kieli": "fi",
        "ots": 15,  # 15 CS BSc, 16 CS MSc
        "lvv": 2016,
        "ops": 142
        # To get all information in the same page, use:
        # "display_long": "true"
        # I'm not using that for now, as it makes parsing a bit tedious
    }
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }
    # r = requests.get(base_url, params=params, headers=head)
    # bs = bs4(r.text, "html.parser")

    # Use cached page for testing
    with open("uta_marjapuuro.html", "r") as f:
        bs = bs4(f.read(), "html.parser")

    # Periods
    periods = []
    for p in bs.find_all("div", class_="opsi_periodi"):
        p_id, start, end = get_period_info(p)
        if p_id is None:
            # Period information parsing failed; skip the whole period.
            continue

        # Courses
        courses = []
        for c in p.find_all("div", class_="opsi_opintojakso"):
            code = c.find("span", class_="opsi_opintojakso_koodi").text.strip()
            if not code:
                # Don't add entries that don't have a code
                continue
            if what_courses and code not in what_courses:
                continue

            name_tag = c.find("span", class_="opsi_opintojakso_nimi")
            name = name_tag.text.strip()
            # The detail link ends in "id=<number>"; that number keys the detail page.
            c_id = int(name_tag.find("a")["href"].split("id=")[1])

            points_raw = c.find("span", class_="opsi_opintojakso_laajuus").text.strip()
            points = parse_points(points_raw)
            if not points:
                # Parsing failed; mark the extent as unknown.
                points = (-1, -1, "")
            points_struct = {"min": points[0], "max": points[1], "type": points[2]}

            c_info = get_course_info(c_id)
            courses.append({"id": c_id, "code": code, "name": name, "points": points_struct, "info": c_info})
        periods.append({"id": p_id, "start": start, "end": end, "courses": courses})

    data = {"timestamp": int(time.time()), "periods": periods}

    if args.json:
        data["timestamp"] = data["timestamp"] * 1000  # JS likes timestamps as milliseconds
        json_data = json.dumps(data, default=json_serial)
        if args.dest is None:
            print(json_data)
        else:
            try:
                with open(args.dest, "w") as f:
                    f.write(json_data)
            except IOError as e:
                print("JSON file writing failed: %s" % e)
    elif args.ical:
        # NOTE(review): the body of this branch is not visible in the
        # reviewed source; iCalendar export appears to be unimplemented.
        print("iCalendar output is not implemented")
    else:
        # Stdout
        print("----- UTA course information -----")
        for p in data["periods"]:
            print("Period %d, %s - %s" % (p["id"], p["start"].strftime("%d.%m.%Y"), p["end"].strftime("%d.%m.%Y")))
            for c in p["courses"]:
                info = c["info"]
                points = c["points"]
                if points["min"] == points["max"]:
                    points_str = str(points["min"])
                else:
                    points_str = "%d-%d" % (points["min"], points["max"])
                print(" Course %s (%s), %s op/ects:" % (c["code"], c["name"], points_str))
                print(" Teacher(s):")
                for t in info["teachers"]:
                    if t["email"] is not None:
                        print(" - %s (%s)" % (t["name"], t["email"]))
                    else:
                        print(" - %s" % t["name"])
                # Course dates may be missing when the detail page could not be
                # parsed; print None (like language/homepage) rather than crash.
                c_start = info["start"].strftime("%d.%m.%Y") if info["start"] is not None else None
                c_end = info["end"].strftime("%d.%m.%Y") if info["end"] is not None else None
                print(" Start: %s" % c_start)
                print(" End: %s" % c_end)
                print(" Period(s): %s" % ", ".join([str(a) for a in info["periods"]]))
                print(" Language: %s" % info["language"])
                print(" Homepage: %s" % info["homepage"])
                print(" Lectures:")
                for l in sorted(info["lectures"], key=lambda x: x["start"]):  # Sort by dates
                    print(" %s - %s @ %s" % (l["start"].strftime("%a %d.%m.%Y %H:%M"), l["end"].strftime("%H:%M"), l["location"]))
