Skip to content

Instantly share code, notes, and snippets.

@Z1ni
Last active September 11, 2016 00:25
Show Gist options
  • Save Z1ni/50b2cfd6c22d63d14ee638ddbd0d94d8 to your computer and use it in GitHub Desktop.
Parses information about CS courses at University of Tampere
#!/usr/bin/python3
import requests
import re
import time
import json
from argparse import ArgumentParser
from bs4 import BeautifulSoup as bs4
from datetime import datetime as dt
from datetime import timedelta
from io import StringIO
def is_period_name(tag):
    """Return True when *tag* carries an id attribute beginning with "opsi_periodi_"."""
    if not tag.has_attr("id"):
        return False
    return tag["id"].lower().startswith("opsi_periodi_")
def get_period_info(p):
    """Extract period metadata from a period <div>.

    Returns a tuple (period_id, start_datetime, end_datetime), or
    (None, None, None) when the id or the date range cannot be parsed.
    """
    heading = p.find(is_period_name)
    if heading is None:
        return (None, None, None)
    lowered_id = heading["id"].lower()
    id_match = re.match(r"^opsi_periodi_(\d)", lowered_id)
    if id_match is None:
        print("Can't parse period id from \"%s\"" % lowered_id)
        return (None, None, None)
    period_id = int(id_match.group(1))
    # Strip layout whitespace so the date range is on one line.
    heading_text = re.sub(r"[\r\t\n]", "", heading.text.strip())
    date_pairs = re.findall(r"\((\d{1,2}\.\d{1,2}\.\d{4}).*?(\d{1,2}\.\d{1,2}\.\d{4})\)", heading_text)
    if not date_pairs:
        print("Can't parse start and end date from \"%s\"" % heading_text)
        return (None, None, None)
    raw_start, raw_end = date_pairs[0]
    period_start = dt.strptime(raw_start, "%d.%m.%Y")
    period_end = dt.strptime(raw_end, "%d.%m.%Y")
    return (period_id, period_start, period_end)
def parse_points(raw):
    """Parse a course credit string such as "5 op" or "1-3 ects".

    Returns (min, max, type) where type is "op" or "ects",
    or False when the text does not match a known format.
    """
    text = raw.lower()
    if "–" in text or "-" in text:
        # Range form, e.g. "1-3 op" / "1-3 ects"
        match = re.match(r"^(?P<min>\d+)\W(?P<max>\d+)\s(?P<type>(?:op)|(?:ects))", text)
        if match is None:
            return False
        return (int(match.group("min")), int(match.group("max")), match.group("type"))
    # Single-value form, e.g. "5 op" / "5 ects"
    match = re.match(r"^(?P<amount>\d+)\s(?P<type>(?:op)|(?:ects))", text)
    if match is None:
        return False
    amount = int(match.group("amount"))
    return (amount, amount, match.group("type"))
def json_serial(obj):
    """JSON serializer for objects not serializable by default json code."""
    if not isinstance(obj, dt):
        raise TypeError("Type not serializable")
    return obj.isoformat()
def get_course_info(cid):
    """Fetch and parse one course page from the UTA study guide.

    Parameters:
        cid: numeric course id, passed as the "id" query parameter.

    Returns a dict with keys "teachers", "homepage", "periods", "language",
    "start", "end", "lectures" and "lecture_hours". Fields that cannot be
    parsed are left as None / [] / -1 — most parsing is best-effort.

    Fixes over the previous revision:
    - The English exception-date regex had an unclosed "(?:" group and
      raised re.error (silently swallowed), so English-format exceptional
      lecture dates were never parsed.
    - Guard against IndexError when a teaching-method name is the last
      stripped string (no hours string after it).
    - Bare "except:" narrowed to "except Exception:" so KeyboardInterrupt /
      SystemExit are no longer swallowed; best-effort semantics unchanged.
    """
    base_url = "https://www10.uta.fi/opas/opetusohjelma/marjapuuro.htm"
    params = {
        "id": cid
    }
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }
    r = requests.get(base_url, params=params, headers=head)
    bs = bs4(r.text, "html.parser")
    # Parse period information.
    # NOTE(review): assumes one <img> per period slot in order, with the icon
    # filename "periodiN." marking period N as active — confirm against markup.
    infobox = bs.find("div", class_="marjapuuro_infobox")
    img_tags = infobox.find("div", class_="opsi_toteuma_periodit").find_all("img")
    c_periods = []
    pid = 1
    for s in img_tags:
        if pid > 4:
            break
        if len(re.findall((r"periodi%d\." % pid), s["src"])) > 0:
            c_periods.append(pid)
        pid += 1
    # Parse teaching language; normalize to a two-letter code when recognized.
    try:
        c_lang = [t for t in bs.find_all("div", class_="infobox_header") if t.get_text().lower() == "opetuskieli" or t.get_text().lower() == "language of instruction"][0].findNextSibling("div", class_="opsi_toteuma_kentta").get_text(strip=True).lower()
        # Crappy conversion to language codes
        if c_lang == "suomi" or c_lang == "finnish":
            c_lang = "fi"
        elif c_lang == "englanti" or c_lang == "english":
            c_lang = "en"
    except Exception:
        # Header missing -> IndexError; leave the language unknown.
        c_lang = None
    # Parse teacher information: each div.ope holds "Name, role" and
    # optionally an obfuscated e-mail address on the next line.
    teachers = []
    teachers_data = bs.find_all("div", class_="ope")
    for t in teachers_data:
        t_d = [s for s in t.stripped_strings]
        t_name_role = t_d[0]
        # NOTE(review): assumes exactly one comma ("Name, role"); a name or
        # role containing an extra comma would raise ValueError — confirm.
        t_name, t_role = [a.strip() for a in t_name_role.split(",")]
        try:
            t_email = t_d[1].replace("[ät]", "@")
        except IndexError:
            t_email = None
        teachers.append({"name": t_name, "role": t_role, "email": t_email})
    # Try to find course homepage address
    web_addr = None
    try:
        """
        <h2>Opintojakson kotisivu</h2>
        <div class="opsi_toteuma_kentta">
            <a href="http://example.com/">http://example.com/</a>
        </div>
        """
        web_addr = [t for t in bs.find_all("h2") if t.get_text().lower() == "opintojakson kotisivu" or t.get_text().lower() == "homepage url"][0].findNextSibling("div", class_="opsi_toteuma_kentta").find("a")["href"]
    except Exception:
        pass
    # Try to find course start and end date
    c_start = None
    c_end = None
    """
    <h2>Opetus</h2>
    <div class="opsi_toteuma_kentta">
        8.9.2016 - 4.11.2016
    </div>
    """
    t_data = None
    try:
        t_data = [t for t in bs.find_all("h2") if t.get_text().lower() == "opetus" or t.get_text().lower() == "teaching"][0].findNextSibling("div", class_="opsi_toteuma_kentta")
        raw_dates = list(t_data.stripped_strings)[0]
        raw_dates = raw_dates.replace("\r", "").replace("\n", "").replace("\t", "")
        # Finnish numeric format first ("8.9.2016 - 4.11.2016") ...
        m = re.match(r"^(\d{1,2}\.\d{1,2}\.\d{4}).*?\W(\d{1,2}\.\d{1,2}\.\d{4})", raw_dates)
        if m is None:
            # ... then English abbreviated-month format ("8-Sep-2016 - 4-Nov-2016")
            m = re.match(r"^(\d{1,2}-.*?-\d{4}).*?\S(\d{1,2}-.*?-\d{4})", raw_dates)
            if m is None:
                # Give up
                print("Date parsing failed, didn't match known formats: \"%s\"" % raw_dates)
            else:
                # TODO: Fails with systems that use other locale than en_US
                # Use locale.setlocale()
                c_start = dt.strptime(m.group(1), "%d-%b-%Y")
                c_end = dt.strptime(m.group(2), "%d-%b-%Y")
        else:
            c_start = dt.strptime(m.group(1), "%d.%m.%Y")
            c_end = dt.strptime(m.group(2), "%d.%m.%Y")
    except Exception:
        pass
    # Parse teaching info (total lecture hours).
    hours = -1
    lectures = []
    if t_data is not None:
        for i in t_data.find_all("div", class_="opsi_toteuma_opetustapa"):
            strings = list(i.stripped_strings)  # hoisted: was rebuilt on every access
            for n in range(0, len(strings)):
                possible_name = strings[n].lower()
                if possible_name == "luento-opetus" or possible_name == "lectures":
                    # Most likely the next string contains the hours.
                    if n + 1 >= len(strings):
                        # Section name was the last string; nothing to parse.
                        continue
                    possible_hours = strings[n + 1].lower()
                    m = re.match(r"^(\d+)\s(?:(?:tuntia)|(?:hours))$", possible_hours)
                    hours = -1
                    if m is not None:
                        # TODO: try-except ValueError, even though that shouldn't happen because of regex \d
                        hours = int(m.group(1))
    # Parse lecture times
    # First one is (always?) teaching dates/times
    try:
        raw_lecture_dates = [list(t.stripped_strings)[0] for t in t_data.find("div", class_="opsi_toteuma_opetustapa").find_all("div", class_="opsi_opetustapa_ajankohta")]
        for p in raw_lecture_dates:
            # Handle single dates, Finnish then English format.
            m = re.match(r"\w{2}\s(?P<date>\d{1,2}\.\d{1,2}\.\d{4})\sklo\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p)
            if m is None:
                m = re.match(r"\w{3}\s(?P<date>\d{1,2}-.*?-\d{4})\sat\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p)
            if m is not None:
                raw_date = m.group("date")
                raw_start = m.group("startH")
                raw_end = m.group("endH")
                raw_loc = m.group("loc")
                try:
                    l_date = dt.strptime(raw_date, "%d.%m.%Y")
                except ValueError:
                    l_date = dt.strptime(raw_date, "%d-%b-%Y")
                l_start = l_date.replace(hour=int(raw_start))
                l_end = l_date.replace(hour=int(raw_end))
                l_loc = raw_loc
                # TODO: Parse location data more for information
                lectures.append({"start": l_start, "end": l_end, "location": l_loc, "exception": False})
            else:
                # Handle weekly occurrences, Finnish then English format.
                m = re.match(r"\w{2}\s(?P<startDate>\d{1,2}\.\d{1,2}\.\d{4})\s-\s(?P<endDate>\d{1,2}\.\d{1,2}\.\d{4})\sviikoittain\sklo\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p)
                if m is None:
                    m = re.match(r"\w{3}\s(?P<startDate>\d{1,2}-.*?-\d{4})\s-\s(?P<endDate>\d{1,2}-.*?-\d{4})\sweekly\sat\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p)
                if m is not None:
                    raw_start_date = m.group("startDate")
                    raw_end_date = m.group("endDate")
                    raw_start_h = m.group("startH")
                    raw_end_h = m.group("endH")
                    raw_loc = m.group("loc")
                    try:
                        l_start_date = dt.strptime(raw_start_date, "%d.%m.%Y")
                    except ValueError:
                        l_start_date = dt.strptime(raw_start_date, "%d-%b-%Y")
                    try:
                        l_end_date = dt.strptime(raw_end_date, "%d.%m.%Y")
                    except ValueError:
                        l_end_date = dt.strptime(raw_end_date, "%d-%b-%Y")
                    # Expand the weekly range into one entry per week.
                    cur_date = l_start_date
                    while cur_date <= l_end_date:
                        l_start = cur_date.replace(hour=int(raw_start_h))
                        l_end = cur_date.replace(hour=int(raw_end_h))
                        lectures.append({"start": l_start, "end": l_end, "location": raw_loc, "exception": False})
                        cur_date += timedelta(days=7)  # Skip to the next week
    except Exception:
        # Best-effort: missing/odd markup leaves the lecture list as-is.
        pass
    # Parse exceptional lecture times (cancellations / moved lectures).
    # NOTE: English date parsing is untested
    try:
        raw_exception_dates = [list(t.stripped_strings)[0] for t in t_data.find("div", class_="opsi_toteuma_opetustapa").find_all("div", class_="opsi_opetustapa_ajankohta_poikkeus")]
        for p in raw_exception_dates:
            p = p.replace("\r", "").replace("\n", "").replace("\t", "")
            # Handle single dates, Finnish then English format.
            m = re.match(r"(?P<date>\d{1,2}\.\d{1,2}\.\d{4}).*?(?:klo\s(?P<startH>\d{1,2}).*?\W(?P<endH>\d{1,2}).*?)?,\s(?P<loc>.*?)$", p)
            if m is None:
                # BUGFIX: the "(?:at ..." group was never closed, so this
                # pattern raised re.error and English dates were never parsed.
                m = re.match(r"(?P<date>\d{1,2}-.*?-\d{4}).*?(?:at\s(?P<startH>\d{1,2}).*?\W(?P<endH>\d{1,2}).*?)?,\s(?P<loc>.*?)$", p)
            if m is not None:
                raw_date = m.group("date")
                raw_start = m.group("startH")
                raw_end = m.group("endH")
                raw_loc = m.group("loc")
                try:
                    l_date = dt.strptime(raw_date, "%d.%m.%Y")
                except ValueError:
                    l_date = dt.strptime(raw_date, "%d-%b-%Y")
                # Find an already-parsed lecture on the same calendar day, if any.
                ex = None
                ex_id = -1
                for i, l in enumerate(lectures):
                    if l["start"].strftime("%Y-%m-%d") == l_date.strftime("%Y-%m-%d"):
                        ex = l
                        ex_id = i
                        break
                # Start hour: explicit in the exception, else inherited from the
                # matched lecture, else midnight.
                if raw_start is not None:
                    l_start = l_date.replace(hour=int(raw_start))
                else:
                    l_start = l_date
                    if ex is not None:
                        l_start = l_date.replace(hour=ex["start"].hour)
                if raw_end is not None:
                    l_end = l_date.replace(hour=int(raw_end))
                else:
                    l_end = l_date
                    if ex is not None:
                        l_end = l_date.replace(hour=ex["end"].hour)
                l_loc = raw_loc
                if ex is not None:
                    # Replace existing lecture information
                    lectures[ex_id] = {"start": l_start, "end": l_end, "location": l_loc, "exception": True}
                else:
                    lectures.append({"start": l_start, "end": l_end, "location": l_loc, "exception": True})
        # TODO: Handle weekly exceptions?
    except Exception:
        pass
    # TODO: Parse exercise times
    return {"teachers": teachers, "homepage": web_addr, "periods": c_periods, "language": c_lang, "start": c_start, "end": c_end, "lectures": lectures, "lecture_hours": hours}
if __name__ == "__main__":
    parser = ArgumentParser(description="UTA course information parser", epilog="If no flags are specified, defaults to all courses text output to standard output.")
    parser.add_argument("-c", "--courses", type=str, dest="courses", help="Course codes (separated with a comma)")
    parser.add_argument("-j", "--json", dest="json", action="store_true", help="Output JSON")
    parser.add_argument("-i", "--ical", dest="ical", action="store_true", help="Output iCalendar")
    parser.add_argument("-d", "--dest", dest="dest", type=str, help="Output to file instead of standard output")
    args = parser.parse_args()
    # Optional course-code filter; empty list means "all courses".
    what_courses = []
    if args.courses is not None:
        what_courses = args.courses.split(",")
    base_url = "https://www10.uta.fi/opas/opetusohjelma/opetusohjelma.htm"
    params = {
        "kieli": "fi",
        "ots": 15,  # 15 CS BSc, 16 CS MSc
        "lvv": 2016,
        "ops": 142
        # To get all information in the same page, use:
        # "display_long": "true"
        # I'm not using that for now, as it makes parsing a bit tedious
    }
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }
    # r = requests.get(base_url, params=params, headers=head)
    # bs = bs4(r.text, "html.parser")
    # Use cached page for testing
    bs = None
    with open("uta_marjapuuro.html", "r") as f:
        bs = bs4(f.read(), "html.parser")
    # Periods
    periods_data = bs.find_all("div", class_="opsi_periodi")
    periods = []
    for p in periods_data:
        # Get period ID; skip periods whose header can't be parsed.
        p_id, start, end = get_period_info(p)
        if p_id is None:
            continue
        # Courses
        courses_data = p.find_all("div", class_="opsi_opintojakso")
        courses = []
        for c in courses_data:
            code = c.find("span", class_="opsi_opintojakso_koodi").text.strip()
            if len(code) == 0:
                # Don't add entries that don't have a code
                continue
            if len(what_courses) > 0 and code not in what_courses:
                continue
            name_tag = c.find("span", class_="opsi_opintojakso_nimi")
            name = name_tag.text.strip()
            c_id = int(name_tag.find("a")["href"].split("id=")[1])
            points_raw = c.find("span", class_="opsi_opintojakso_laajuus").text.strip()
            points = parse_points(points_raw)
            if not points:
                # Parsing failed; use sentinel values.
                points = (-1, -1, "")
            points_struct = {"min": points[0], "max": points[1], "type": points[2]}
            c_info = get_course_info(c_id)
            courses.append({"id": c_id, "code": code, "name": name, "points": points_struct, "info": c_info})
        periods.append({"id": p_id, "start": start, "end": end, "courses": courses})
        # NOTE(review): debug leftover — only the first parseable period is
        # processed; remove this break to handle all periods.
        break
    data = {"timestamp": int(time.time()), "periods": periods}
    if args.json:
        data["timestamp"] = data["timestamp"] * 1000  # JS likes timestamps as milliseconds
        json_io = StringIO()
        json.dump(data, json_io, default=json_serial)
        json_data = json_io.getvalue()
        if args.dest is None:
            print(json_data)
        else:
            try:
                with open(args.dest, "w") as f:
                    f.write(json_data)
            except IOError as e:
                print("JSON file writing failed: %s" % e)
    elif args.ical:
        # TODO
        pass
    else:
        # Plain-text report on stdout.
        print("----- UTA course information -----")
        for p in data["periods"]:
            print("Period %d, %s - %s" % (p["id"], p["start"].strftime("%d.%m.%Y"), p["end"].strftime("%d.%m.%Y")))
            for c in p["courses"]:
                points = c["points"]
                if points["min"] == points["max"]:
                    points_str = str(points["min"])
                else:
                    points_str = "%d-%d" % (points["min"], points["max"])
                print(" Course %s (%s), %s op/ects:" % (c["code"], c["name"], points_str))
                print(" Teacher(s):")
                for t in c["info"]["teachers"]:
                    if t["email"] is not None:
                        print(" - %s (%s)" % (t["name"], t["email"]))
                    else:
                        print(" - %s" % t["name"])
                # BUGFIX: get_course_info returns None for start/end when date
                # parsing fails; formatting them unguarded crashed with
                # AttributeError. Print "?" instead.
                c_start = c["info"]["start"]
                c_end = c["info"]["end"]
                print(" Start: %s" % (c_start.strftime("%d.%m.%Y") if c_start is not None else "?"))
                print(" End: %s" % (c_end.strftime("%d.%m.%Y") if c_end is not None else "?"))
                print(" Period(s): %s" % ", ".join([str(a) for a in c["info"]["periods"]]))
                print(" Language: %s" % c["info"]["language"])
                print(" Homepage: %s" % c["info"]["homepage"])
                print(" Lectures:")
                for l in sorted(c["info"]["lectures"], key=lambda x: x["start"]):  # Sort by dates
                    print(" %s - %s @ %s" % (l["start"].strftime("%a %d.%m.%Y %H:%M"), l["end"].strftime("%H:%M"), l["location"]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment