Last active
March 10, 2018 09:28
-
-
Save not7cd/64753f535114d1a8dbffef408c69927f to your computer and use it in GitHub Desktop.
Scrape a timetable from the HTML pages generated by Optivum (by Vulcan)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
import requests | |
import dateutil.parser as dparser | |
from urllib.parse import urlparse | |
from bs4 import BeautifulSoup as bs | |
def get_soup(url, timeout=30):
    """Download *url* and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a dead host.

    Returns:
        The response text parsed with the ``html.parser`` backend.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 404/500 instead of parsing the server's error page as
    # if it were timetable markup.
    response.raise_for_status()
    return bs(response.text, 'html.parser')
def extract_id(path):
    """Return the file name in *path* without its directory or extension.

    *path* is parsed as a URL first, so any query string or fragment is
    ignored; e.g. ``'plany/o12.html'`` gives ``'o12'``.
    """
    filename = os.path.basename(urlparse(path).path)
    stem, _extension = os.path.splitext(filename)
    return stem
def get_url_id(soup, class_):
    """Return the id taken from the ``href`` of the first matching ``<a>``.

    The first ``<a>`` found in *soup* with ``class_=class_`` is used and its
    ``href`` is reduced with ``extract_id``. When no anchor is found,
    ``find`` yields ``None`` and subscripting it raises ``TypeError``.
    """
    anchor = soup.find('a', class_=class_)
    href = anchor['href']
    return extract_id(href)
def get_url_ids(soup, class_):
    """Return the ids of every ``<a>`` in *soup* found with ``class_=class_``.

    Each anchor's ``href`` is reduced with ``extract_id``; the result keeps
    the order in which ``find_all`` returned the anchors.
    """
    ids = []
    for anchor in soup.find_all('a', class_=class_):
        ids.append(extract_id(anchor['href']))
    return ids
def dict_from_list(ul):
    """Map link ids to link labels for every ``<li>`` inside *ul*.

    The key is the id extracted from the item's anchor ``href`` (via
    ``get_url_id`` with ``None`` as the class filter); the value is the
    anchor's ``.string``.
    """
    mapping = {}
    for item in ul.find_all('li'):
        # Compute the key before reading the label, as a dict
        # comprehension would.
        key = get_url_id(item, None)
        mapping[key] = item.a.string
    return mapping
def get_lesson(soup):
    """Extract teacher, units and subject from one lesson cell.

    Returns:
        A dict with keys ``'n'`` (teacher id), ``'o'`` (list of unit/class
        ids) and ``'p'`` (subject text).
    """
    teacher = get_url_id(soup, 'n')      # teacher
    units = get_url_ids(soup, 'o')       # unit / school class
    subject_tag = soup.find('span', class_='p')
    subject = subject_tag.string         # subject
    return {'n': teacher, 'o': units, 'p': subject}
def get_classroom_lessons(table):
    """Yield one dict per lesson found in a classroom timetable *table*.

    The first ``<tr>`` is skipped. For every remaining row the text of the
    element with class ``'g'`` is stored under ``'g'``, and each element
    with class ``'l'`` is treated as a lesson cell whose position within
    the row is stored under ``'w'``. The fields returned by ``get_lesson``
    are merged in.

    Cells for which extraction raises ``TypeError`` are skipped silently
    (``get_url_id`` raises it when a cell has no matching link).
    """
    for row in table.find_all('tr')[1:]:
        hours = row.find(class_='g').string
        cells = row.find_all(class_='l')
        for weekday, cell in enumerate(cells):
            try:
                entry = {'w': weekday, 'g': hours}
                entry.update(get_lesson(cell))
                yield entry
            except TypeError:
                continue
def get_all_lessons(base_url, classrooms):
    """Yield every lesson of every classroom, tagged with its classroom id.

    Args:
        base_url: Root URL of the published timetable, with or without a
            trailing slash.
        classrooms: Iterable of classroom ids; each id names the page
            ``<base_url>/plany/<id>.html``.

    Yields:
        The dicts produced by ``get_classroom_lessons`` with an extra
        ``'s'`` key holding the classroom id.
    """
    # get_timetable() appends 'lista.html' directly to base_url, so callers
    # pass it with a trailing '/'. Strip it here so the page URL does not
    # contain '//plany/'.
    root = base_url.rstrip('/')
    for classroom in classrooms:
        page = get_soup(root + '/plany/' + classroom + '.html')
        table = page.find('table', class_='tabela')
        for lesson in get_classroom_lessons(table):
            yield {'s': classroom, **lesson}
def translate_internal_id(lesson, o, n, s):
    """Replace the ids stored in *lesson* with their looked-up values.

    Args:
        lesson: Dict with keys ``'o'`` (list of ids), ``'n'`` and ``'s'``.
            It is modified in place.
        o: Mapping used for every id in ``lesson['o']``.
        n: Mapping used for ``lesson['n']``.
        s: Mapping used for ``lesson['s']``.

    Returns:
        The same *lesson* object, after modification.

    Raises:
        KeyError: If an id is missing from its mapping.
    """
    unit_names = [o[unit_id] for unit_id in lesson['o']]
    lesson['o'] = unit_names
    lesson['n'] = n[lesson['n']]
    lesson['s'] = s[lesson['s']]
    return lesson
def parse_sitemap(sitemap):
    """Return a lazy iterable of id-to-label dicts, one per ``<ul>``.

    The ``<ul>`` elements are collected immediately; each is converted with
    ``dict_from_list`` only when the returned generator is consumed.
    """
    lists = sitemap.find_all('ul')
    return (dict_from_list(link_list) for link_list in lists)
def get_sitemap_date(sitemap):
    """Return the date found in the text of the sitemap's ``<body>``.

    The body's ``.string`` is handed to dateutil's parser in fuzzy mode, so
    text surrounding the date is ignored.
    """
    # NOTE(review): BeautifulSoup's ``.string`` is None when a tag has more
    # than one child -- confirm the body of lista.html is a single string.
    body_text = sitemap.body.string
    return dparser.parse(body_text, fuzzy=True)
def get_timetable(base_url):
    """Scrape the whole timetable published under *base_url*.

    ``lista.html`` is appended directly to *base_url*, so it is expected to
    end with ``/``. The sitemap must contain exactly three ``<ul>`` lists,
    unpacked in order as units, teachers and classrooms.

    Returns:
        A dict with ``'valid_from'`` (the sitemap date as ``YYYY-MM-DD``)
        and ``'lessons'`` (list of lesson dicts with ids translated to
        labels). Each lesson is also printed as it is processed.
    """
    sitemap = get_soup(base_url + 'lista.html')
    valid_from = get_sitemap_date(sitemap).strftime('%Y-%m-%d')
    units, teachers, classrooms = parse_sitemap(sitemap)

    lessons = []
    for raw_lesson in get_all_lessons(base_url, classrooms):
        translated = translate_internal_id(raw_lesson, units, teachers, classrooms)
        print(translated)
        lessons.append(translated)

    return {'valid_from': valid_from, 'lessons': lessons}
if __name__ == '__main__':
    # Base URL of the published timetable, i.e. the directory that
    # contains lista.html (keep the trailing slash).
    BASE_URL = 'http://twoja.szkola.pl/plan/'

    result = get_timetable(BASE_URL)

    # Write the scraped timetable as JSON to the current directory.
    with open('timetable.json', 'w') as output:
        json.dump(result, output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
beautifulsoup4==4.6.0 | |
python-dateutil==2.6.1 | |
requests==2.18.4 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment