Parses information about CS courses at the University of Tampere
#!/usr/bin/python3
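"""UTA course information parser.

Scrapes the University of Tampere CS curriculum pages and prints course
data as text or JSON (iCalendar output is still a TODO).

Example invocations (the script file name is a placeholder; the course
code is one that appears in the 2016 listing):

    ./uta_courses.py                      # text listing of all courses
    ./uta_courses.py -c MTTTP1 -j         # JSON for a single course code
    ./uta_courses.py -j -d courses.json   # write JSON to a file
"""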
import requests
import re
import time
import json
from argparse import ArgumentParser
from bs4 import BeautifulSoup as bs4
from datetime import datetime as dt
from datetime import timedelta
from io import StringIO


def is_period_name(tag):
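    # Period heading tags carry ids like "opsi_periodi_1"; match on that prefix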
    return tag.has_attr("id") and tag["id"].lower().startswith("opsi_periodi_")


def get_period_info(p):
    i = p.find(is_period_name)
    if i is None:
        # print("Can't find period info tag")
        return (None, None, None)
    m = re.match(r"^opsi_periodi_(\d)", i["id"].lower())
    if m is None:
        print("Can't parse period id from \"%s\"" % i["id"].lower())
        return (None, None, None)
    p_id = int(m.group(1))
    p_str = i.text.strip().replace("\r", "").replace("\t", "").replace("\n", "")
    m = re.findall(r"\((\d{1,2}\.\d{1,2}\.\d{4}).*?(\d{1,2}\.\d{1,2}\.\d{4})\)", p_str)
    if len(m) == 0:
        print("Can't parse start and end date from \"%s\"" % p_str)
        return (None, None, None)
    p_start = dt.strptime(m[0][0], "%d.%m.%Y")
    p_end = dt.strptime(m[0][1], "%d.%m.%Y")
    return (p_id, p_start, p_end)


def parse_points(raw):
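    """Parse a credit string into a (min, max, type) tuple.

    Formats seen in the listing (per the examples below):
        "5 op"     -> (5, 5, "op")
        "1-3 ects" -> (1, 3, "ects")
    Returns False if the string matches neither format.
    """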
    raw = raw.lower()
    if '–' in raw or '-' in raw:
        # E.g. "1-3 op" / "1-3 ects"
        m = re.match(r"^(?P<min>\d+)\W(?P<max>\d+)\s(?P<type>(?:op)|(?:ects))", raw)
        if m is None:
            return False
        p_min = int(m.group("min"))
        p_max = int(m.group("max"))
        p_type = m.group("type")
        return (p_min, p_max, p_type)
    # E.g. "5 op" / "5 ects"
    m = re.match(r"^(?P<amount>\d+)\s(?P<type>(?:op)|(?:ects))", raw)
    if m is None:
        return False
    amount = int(m.group("amount"))
    p_type = m.group("type")
    return (amount, amount, p_type)


def json_serial(obj):
    """JSON serializer for objects not serializable by default json code"""
    if isinstance(obj, dt):
        return obj.isoformat()
    raise TypeError("Type not serializable")


def get_course_info(cid):
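    """Fetch and parse a single course page.

    cid is the numeric id from the course link (".../marjapuuro.htm?id=<cid>").
    Returns a dict with teachers, homepage, periods, language, start/end dates,
    lecture times and total lecture hours (see the return statement below).
    """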
    base_url = "https://www10.uta.fi/opas/opetusohjelma/marjapuuro.htm"
    params = {
        "id": cid
    }
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }
    r = requests.get(base_url, params=params, headers=head)
    bs = bs4(r.text, "html.parser")
    # Parse period information
    infobox = bs.find("div", class_="marjapuuro_infobox")
    img_tags = infobox.find("div", class_="opsi_toteuma_periodit").find_all("img")
    c_periods = []
    pid = 1
    for s in img_tags:
        if pid > 4:
            break
        if len(re.findall((r"periodi%d\." % pid), s["src"])) > 0:
            c_periods.append(pid)
        pid += 1
    # Parse teaching language
    try:
        lang_header = [t for t in bs.find_all("div", class_="infobox_header")
                       if t.get_text().lower() in ("opetuskieli", "language of instruction")][0]
        c_lang = lang_header.findNextSibling("div", class_="opsi_toteuma_kentta").get_text(strip=True).lower()
        # Crude conversion to language codes; other languages are kept verbatim
        if c_lang in ("suomi", "finnish"):
            c_lang = "fi"
        elif c_lang in ("englanti", "english"):
            c_lang = "en"
    except Exception:
        c_lang = None
    # Parse teacher information
    teachers = []
    teachers_data = bs.find_all("div", class_="ope")
    for t in teachers_data:
        t_d = list(t.stripped_strings)
        # print("Parsing teacher data: %s" % ", ".join(t_d))
        t_name_role = t_d[0]
        # Split only on the first comma so roles containing commas survive
        t_name, t_role = [a.strip() for a in t_name_role.split(",", 1)]
        try:
            t_email = t_d[1].replace("[ät]", "@")
        except IndexError:
            t_email = None
        teachers.append({"name": t_name, "role": t_role, "email": t_email})
    # Try to find course homepage address
    web_addr = None
    try:
        """
        <h2>Opintojakson kotisivu</h2>
        <div class="opsi_toteuma_kentta">
            <a href="http://example.com/">http://example.com/</a>
        </div>
        """
        homepage_header = [t for t in bs.find_all("h2")
                           if t.get_text().lower() in ("opintojakson kotisivu", "homepage url")][0]
        web_addr = homepage_header.findNextSibling("div", class_="opsi_toteuma_kentta").find("a")["href"]
    except Exception:
        pass
    # Try to find course start and end date
    c_start = None
    c_end = None
    """
    <h2>Opetus</h2>
    <div class="opsi_toteuma_kentta">
        8.9.2016 - 4.11.2016
    </div>
    """
    t_data = None
    try:
        teaching_header = [t for t in bs.find_all("h2")
                           if t.get_text().lower() in ("opetus", "teaching")][0]
        t_data = teaching_header.findNextSibling("div", class_="opsi_toteuma_kentta")
        raw_dates = list(t_data.stripped_strings)[0]
        raw_dates = raw_dates.replace("\r", "").replace("\n", "").replace("\t", "")
        m = re.match(r"^(\d{1,2}\.\d{1,2}\.\d{4}).*?\W(\d{1,2}\.\d{1,2}\.\d{4})", raw_dates)
        if m is None:
            # Try the other date format
            m = re.match(r"^(\d{1,2}-.*?-\d{4}).*?\S(\d{1,2}-.*?-\d{4})", raw_dates)
            if m is None:
                # Give up
                print("Date parsing failed, didn't match known formats: \"%s\"" % raw_dates)
            else:
                # TODO: Fails on systems with a locale other than en_US;
                # use locale.setlocale() to force English month names
                c_start = dt.strptime(m.group(1), "%d-%b-%Y")
                c_end = dt.strptime(m.group(2), "%d-%b-%Y")
        else:
            c_start = dt.strptime(m.group(1), "%d.%m.%Y")
            c_end = dt.strptime(m.group(2), "%d.%m.%Y")
    except Exception:
        pass
    # Parse teaching info
    hours = -1
    lectures = []
    if t_data is not None:
        for i in t_data.find_all("div", class_="opsi_toteuma_opetustapa"):
            strings = list(i.stripped_strings)
            # Stop one short of the end: the hours are read from the next string
            for n in range(len(strings) - 1):
                possible_name = strings[n].lower()
                if possible_name in ("luento-opetus", "lectures"):
                    # Try to find total teaching hours
                    # Most likely the next string contains the hours
                    possible_hours = strings[n + 1].lower()
                    # print("Possible hours: %s" % possible_hours)
                    m = re.match(r"^(\d+)\s(?:(?:tuntia)|(?:hours))$", possible_hours)
                    if m is not None:
                        # The regex guarantees digits, so int() can't fail here
                        hours = int(m.group(1))
                        # print("Hours: %d" % hours)
    # Parse lecture times
    # The first opetustapa div is (always?) the teaching dates/times
    try:
        raw_lecture_dates = [list(t.stripped_strings)[0]
                             for t in t_data.find("div", class_="opsi_toteuma_opetustapa")
                             .find_all("div", class_="opsi_opetustapa_ajankohta")]
        for p in raw_lecture_dates:
            # print(p)
            # Handle single dates
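            # e.g. "ke 7.9.2016 klo 12-14, Pinni B1097" (illustrative; the
            # two-letter prefix is a Finnish weekday abbreviation)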
m = re.match(r"\w{2}\s(?P<date>\d{1,2}\.\d{1,2}\.\d{4})\sklo\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p) | |
if m is None: | |
# Try another format | |
m = re.match(r"\w{3}\s(?P<date>\d{1,2}-.*?-\d{4})\sat\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p) | |
if m is not None: | |
raw_date = m.group("date") | |
raw_start = m.group("startH") | |
raw_end = m.group("endH") | |
raw_loc = m.group("loc") | |
try: | |
l_date = dt.strptime(raw_date, "%d.%m.%Y") | |
except ValueError: | |
l_date = dt.strptime(raw_date, "%d-%b-%Y") | |
l_start = l_date.replace(hour=int(raw_start)) | |
l_end = l_date.replace(hour=int(raw_end)) | |
l_loc = raw_loc | |
# TODO: Parse location data more for information | |
lectures.append({"start": l_start, "end": l_end, "location": l_loc, "exception": False}) | |
else: | |
                # Handle weekly occurrences
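                # e.g. "ke 7.9.2016 - 19.10.2016 viikoittain klo 12-14, Pinni B1097" (illustrative)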
m = re.match(r"\w{2}\s(?P<startDate>\d{1,2}\.\d{1,2}\.\d{4})\s-\s(?P<endDate>\d{1,2}\.\d{1,2}\.\d{4})\sviikoittain\sklo\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p) | |
if m is None: | |
# Try another format | |
m = re.match(r"\w{3}\s(?P<startDate>\d{1,2}-.*?-\d{4})\s-\s(?P<endDate>\d{1,2}-.*?-\d{4})\sweekly\sat\s(?P<startH>\d{1,2})-(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p) | |
if m is not None: | |
raw_start_date = m.group("startDate") | |
raw_end_date = m.group("endDate") | |
raw_start_h = m.group("startH") | |
raw_end_h = m.group("endH") | |
raw_loc = m.group("loc") | |
try: | |
l_start_date = dt.strptime(raw_start_date, "%d.%m.%Y") | |
except ValueError: | |
l_start_date = dt.strptime(raw_start_date, "%d-%b-%Y") | |
try: | |
l_end_date = dt.strptime(raw_end_date, "%d.%m.%Y") | |
except ValueError: | |
l_end_date = dt.strptime(raw_end_date, "%d-%b-%Y") | |
cur_date = l_start_date | |
while cur_date <= l_end_date: | |
l_start = cur_date.replace(hour=int(raw_start_h)) | |
l_end = cur_date.replace(hour=int(raw_end_h)) | |
lectures.append({"start": l_start, "end": l_end, "location": raw_loc, "exception": False}) | |
cur_date += timedelta(days=7) # Skip to the next week | |
except Exception as e: | |
# print(e) | |
pass | |
    # Parse exceptional lecture times
    # NOTE: English date parsing is untested
    try:
        raw_exception_dates = [list(t.stripped_strings)[0]
                               for t in t_data.find("div", class_="opsi_toteuma_opetustapa")
                               .find_all("div", class_="opsi_opetustapa_ajankohta_poikkeus")]
        for p in raw_exception_dates:
            p = p.replace("\r", "").replace("\n", "").replace("\t", "")
            # print(p)
            # Handle single dates
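            # e.g. "24.10.2016, Pinni B1097" or "24.10.2016 klo 10-12, Pinni B1097" (illustrative)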
m = re.match(r"(?P<date>\d{1,2}\.\d{1,2}\.\d{4}).*?(?:klo\s(?P<startH>\d{1,2}).*?\W(?P<endH>\d{1,2}).*?)?,\s(?P<loc>.*?)$", p) | |
if m is None: | |
# Try another format | |
m = re.match(r"(?P<date>\d{1,2}-.*?-\d{4}).*?(?:at\s(?P<startH>\d{1,2}).*?\W(?P<endH>\d{1,2}),\s(?P<loc>.*?)$", p) | |
if m is not None: | |
raw_date = m.group("date") | |
raw_start = m.group("startH") | |
raw_end = m.group("endH") | |
raw_loc = m.group("loc") | |
try: | |
l_date = dt.strptime(raw_date, "%d.%m.%Y") | |
except ValueError: | |
l_date = dt.strptime(raw_date, "%d-%b-%Y") | |
# Get possibly existing entry | |
ex = None | |
ex_id = -1 | |
for i, l in enumerate(lectures): | |
if l["start"].strftime("%Y-%m-%d") == l_date.strftime("%Y-%m-%d"): | |
ex = l | |
ex_id = i | |
break | |
if raw_start is not None: | |
l_start = l_date.replace(hour=int(raw_start)) | |
else: | |
l_start = l_date | |
if ex is not None: | |
l_start = l_date.replace(hour=ex["start"].hour) | |
if raw_end is not None: | |
l_end = l_date.replace(hour=int(raw_end)) | |
else: | |
l_end = l_date | |
if ex is not None: | |
l_end = l_date.replace(hour=ex["end"].hour) | |
l_loc = raw_loc | |
if ex is not None: | |
# Replace existing lecture information | |
lectures[ex_id] = {"start": l_start, "end": l_end, "location": l_loc, "exception": True} | |
else: | |
lectures.append({"start": l_start, "end": l_end, "location": l_loc, "exception": True}) | |
# TODO: Handle weekly exceptions? | |
except Exception as e: | |
# print(e) | |
pass | |
    # TODO: Parse exercise times
    return {"teachers": teachers, "homepage": web_addr, "periods": c_periods,
            "language": c_lang, "start": c_start, "end": c_end,
            "lectures": lectures, "lecture_hours": hours}


if __name__ == "__main__":
    parser = ArgumentParser(description="UTA course information parser",
                            epilog="If no flags are specified, defaults to text output of all courses to standard output.")
    parser.add_argument("-c", "--courses", type=str, dest="courses", help="Course codes (separated with a comma)")
    parser.add_argument("-j", "--json", dest="json", action="store_true", help="Output JSON")
    parser.add_argument("-i", "--ical", dest="ical", action="store_true", help="Output iCalendar")
    parser.add_argument("-d", "--dest", dest="dest", type=str, help="Output to file instead of standard output")
    args = parser.parse_args()
    what_courses = []
    if args.courses is not None:
        what_courses = args.courses.split(",")
    base_url = "https://www10.uta.fi/opas/opetusohjelma/opetusohjelma.htm"
    params = {
        "kieli": "fi",
        "ots": 15,   # 15 = CS BSc, 16 = CS MSc
        "lvv": 2016,
        "ops": 142
        # To get all information on the same page, use:
        # "display_long": "true"
        # I'm not using that for now, as it makes parsing a bit tedious
    }
    head = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0"
    }
    # r = requests.get(base_url, params=params, headers=head)
    # bs = bs4(r.text, "html.parser")
    # Use a cached page for testing
    bs = None
    with open("uta_marjapuuro.html", "r") as f:
        bs = bs4(f.read(), "html.parser")
    known_courses = []
    # Periods
    periods_data = bs.find_all("div", class_="opsi_periodi")
    periods = []
    for p in periods_data:
        # Get period ID
        p_id, start, end = get_period_info(p)
        if p_id is None:
            # print("Period information parsing failed!")
            continue
        # Courses
        courses_data = p.find_all("div", class_="opsi_opintojakso")
        courses = []
        for c in courses_data:
            # Course
            code = c.find("span", class_="opsi_opintojakso_koodi").text.strip()
            if len(code) == 0:
                # Don't add entries that don't have a code
                continue
            # if code in known_courses:
            #     # Skip
            #     continue
            # known_courses.append(code)
            # Debug:
            # if code != "MTTTP1":
            #     continue
            # print(code)
            if len(what_courses) > 0 and code not in what_courses:
                continue
            name_tag = c.find("span", class_="opsi_opintojakso_nimi")
            name = name_tag.text.strip()
            c_id = int(name_tag.find("a")["href"].split("id=")[1])
            points_raw = c.find("span", class_="opsi_opintojakso_laajuus").text.strip()
            points = parse_points(points_raw)
            if not points:
                # Parsing failed
                # print("Information parsing failed for course %s (%s)!" % (code, name))
                points = (-1, -1, "")
            points_struct = {"min": points[0], "max": points[1], "type": points[2]}
            c_info = get_course_info(c_id)
            courses.append({"id": c_id, "code": code, "name": name, "points": points_struct, "info": c_info})
        periods.append({"id": p_id, "start": start, "end": end, "courses": courses})
        break  # NOTE: only the first period is processed
    data = {"timestamp": int(time.time()), "periods": periods}
    if args.json:
        data["timestamp"] = data["timestamp"] * 1000  # JS likes timestamps as milliseconds
        json_io = StringIO()
        json.dump(data, json_io, default=json_serial)
        json_data = json_io.getvalue()
        if args.dest is None:
            print(json_data)
        else:
            try:
                with open(args.dest, "w") as f:
                    f.write(json_data)
            except IOError as e:
                print("JSON file writing failed: %s" % e)
    elif args.ical:
        # TODO
        pass
    else:
        # Stdout
        print("----- UTA course information -----")
        for p in data["periods"]:
            print("Period %d, %s - %s" % (p["id"], p["start"].strftime("%d.%m.%Y"), p["end"].strftime("%d.%m.%Y")))
            for c in p["courses"]:
                # c["info"] keys: teachers, homepage, periods, language, start, end, lectures, lecture_hours
                points = c["points"]
                if points["min"] == points["max"]:
                    points_str = str(points["min"])
                else:
                    points_str = "%d-%d" % (points["min"], points["max"])
                print("  Course %s (%s), %s op/ects:" % (c["code"], c["name"], points_str))
                print("    Teacher(s):")
                for t in c["info"]["teachers"]:
                    if t["email"] is not None:
                        print("      - %s (%s)" % (t["name"], t["email"]))
                    else:
                        print("      - %s" % t["name"])
                # Start/end can be None if date parsing failed
                start_str = c["info"]["start"].strftime("%d.%m.%Y") if c["info"]["start"] else "?"
                end_str = c["info"]["end"].strftime("%d.%m.%Y") if c["info"]["end"] else "?"
                print("    Start: %s" % start_str)
                print("    End: %s" % end_str)
                print("    Period(s): %s" % ", ".join([str(a) for a in c["info"]["periods"]]))
                print("    Language: %s" % c["info"]["language"])
                print("    Homepage: %s" % c["info"]["homepage"])
                print("    Lectures:")
                for l in sorted(c["info"]["lectures"], key=lambda x: x["start"]):  # Sort by date
                    print("      %s - %s @ %s" % (l["start"].strftime("%a %d.%m.%Y %H:%M"), l["end"].strftime("%H:%M"), l["location"]))