Skip to content

Instantly share code, notes, and snippets.

Last active March 10, 2018 09:28
Show Gist options
  • Save not7cd/64753f535114d1a8dbffef408c69927f to your computer and use it in GitHub Desktop.
Save not7cd/64753f535114d1a8dbffef408c69927f to your computer and use it in GitHub Desktop.
Scrape timetable from HTML generated by Optivum by Vulcan
import os
import json
import requests
import dateutil.parser as dparser
from urllib.parse import urlparse
from bs4 import BeautifulSoup as bs
def get_soup(url, timeout=30):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        The response text parsed with the built-in ``html.parser``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later on a missing element.
    response.raise_for_status()
    # NOTE(review): response.text is decoded with the charset requests infers
    # from the HTTP headers -- confirm it matches the pages' real encoding.
    return bs(response.text, 'html.parser')
def extract_id(path):
    """Return the file name in *path* without its directory and extension.

    *path* may be a bare path or a full URL; only the path component is
    used, so a query string or fragment does not end up in the result.
    """
    filename = os.path.basename(urlparse(path).path)
    stem, _extension = os.path.splitext(filename)
    return stem
def get_url_id(soup, class_):
    """Return the id taken from the href of the first matching link.

    Looks up the first ``<a>`` inside *soup* filtered by *class_* and passes
    its ``href`` to ``extract_id``. When no link matches, the lookup result
    is subscripted anyway, which raises ``TypeError``.
    """
    anchor = soup.find('a', class_=class_)
    return extract_id(anchor['href'])
def get_url_ids(soup, class_):
    """Return the ids of all ``<a>`` elements in *soup* filtered by *class_*.

    Each id is produced by ``extract_id`` from the link's ``href``; the
    result is a list in the order the links are found (empty if none match).
    """
    ids = []
    for anchor in soup.find_all('a', class_=class_):
        ids.append(extract_id(anchor['href']))
    return ids
def dict_from_list(ul):
    """Map link id to link text for every ``<li>`` found in *ul*.

    The key comes from ``get_url_id`` applied to the item (no class filter),
    the value is the ``.string`` of the item's first ``<a>``.
    """
    mapping = {}
    for item in ul.find_all('li'):
        # Resolve the key first so a missing link fails the same way as in
        # the lookup helper.
        key = get_url_id(item, None)
        mapping[key] = item.a.string
    return mapping
def get_lesson(soup):
    """Extract one lesson from a timetable cell.

    Returns a dict with the keys:
        'n': id of the teacher link (first ``<a class="n">``),
        'o': list of ids of the unit/class links (all ``<a class="o">``),
        'p': subject text (``.string`` of the first ``<span class="p">``).
    """
    teacher = get_url_id(soup, 'n')    # teacher
    units = get_url_ids(soup, 'o')     # units (school classes)
    subject = soup.find('span', class_='p').string  # subject
    return dict(n=teacher, o=units, p=subject)
def get_classroom_lessons(table):
    """Yield one lesson dict per parsable cell of a classroom timetable.

    Args:
        table: Parsed ``<table>`` element of a single classroom page.

    Yields:
        Dicts with 'w' (zero-based column index among the row's ``class="l"``
        cells), 'g' (text of the row's ``class="g"`` cell) and the keys
        produced by ``get_lesson`` ('n', 'o', 'p').
    """
    rows = table.find_all('tr')
    # The first row is skipped -- presumably the header with weekday names.
    for row in rows[1:]:
        hours = row.find(class_='g').string
        for weekday, cell in enumerate(row.find_all(class_='l')):
            try:
                lesson = get_lesson(cell)
            except TypeError:
                # A cell without a teacher link makes the lookup subscript
                # None; treat it as an empty slot and move on. The column
                # index still advances, so later weekdays stay aligned.
                continue
            yield {'w': weekday, 'g': hours, **lesson}
def get_all_lessons(base_url, classrooms):
    """Yield every lesson of every classroom, tagged with the classroom id.

    Args:
        base_url: URL of the timetable root; a trailing '/' is optional.
        classrooms: Iterable of classroom ids; each id names the page
            ``<base_url>/plany/<id>.html``.

    Yields:
        Lesson dicts from ``get_classroom_lessons`` with an added 's' key
        holding the classroom id.
    """
    # Strip trailing slashes so the URL never contains '//plany/'.
    root = base_url.rstrip('/')
    for classroom in classrooms:
        page = get_soup('{}/plany/{}.html'.format(root, classroom))
        table = page.find('table', class_='tabela')
        for lesson in get_classroom_lessons(table):
            yield {'s': classroom, **lesson}
def translate_internal_id(lesson, o, n, s):
    """Replace the internal ids stored in *lesson* with their mapped values.

    Args:
        lesson: Dict with 'o' (iterable of ids), 'n' and 's' (single ids).
        o: Mapping used for every id in ``lesson['o']``.
        n: Mapping used for ``lesson['n']``.
        s: Mapping used for ``lesson['s']``.

    Returns:
        The same *lesson* object, modified in place.

    Raises:
        KeyError: If an id is missing from its mapping.
    """
    lesson['o'] = [o[unit_id] for unit_id in lesson['o']]
    teacher_id = lesson['n']
    lesson['n'] = n[teacher_id]
    classroom_id = lesson['s']
    lesson['s'] = s[classroom_id]
    return lesson
def parse_sitemap(sitemap):
    """Return a generator of id-to-name dicts, one per ``<ul>`` in *sitemap*.

    The dicts are built lazily by ``dict_from_list`` in document order.
    """
    lists = sitemap.find_all('ul')
    return (dict_from_list(item_list) for item_list in lists)
def get_sitemap_date(sitemap):
    """Return the date found in the body text of *sitemap*.

    The text is handed to dateutil's fuzzy parser, which ignores tokens that
    are not part of a date.
    """
    # NOTE(review): BeautifulSoup's .string is None when a tag has more than
    # one child -- confirm the body of lista.html really is a single string.
    body_text = sitemap.body.string
    return dparser.parse(body_text, fuzzy=True)
def get_timetable(base_url):
    """Scrape the whole timetable published under *base_url*.

    Args:
        base_url: URL of the directory that serves ``lista.html``; a
            trailing '/' is optional.

    Returns:
        A dict with 'valid_from' (date parsed from the sitemap, formatted as
        ``YYYY-MM-DD``) and 'lessons' (list of lesson dicts whose internal
        ids have been replaced by the names listed in the sitemap).

    Raises:
        ValueError: If the sitemap does not contain exactly three ``<ul>``
            lists (they are unpacked as units, teachers, classrooms).
    """
    sitemap = get_soup(base_url.rstrip('/') + '/lista.html')
    date_valid = get_sitemap_date(sitemap)
    units, teachers, classrooms = parse_sitemap(sitemap)
    # Collect the translated lessons; previously they were computed and then
    # dropped, so the returned list was always empty.
    lessons = [
        translate_internal_id(lesson, units, teachers, classrooms)
        for lesson in get_all_lessons(base_url, classrooms)
    ]
    return {'valid_from': '{:%Y-%m-%d}'.format(date_valid), 'lessons': lessons}
if __name__ == '__main__':
    import sys

    # The base URL (the directory that serves lista.html) is taken from the
    # command line; it was previously referenced without being defined.
    if len(sys.argv) != 2:
        sys.exit('usage: {} BASE_URL'.format(sys.argv[0]))
    BASE_URL = sys.argv[1]

    result = get_timetable(BASE_URL)
    with open('timetable.json', 'w') as file:
        json.dump(result, file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment