Skip to content

Instantly share code, notes, and snippets.

@msgre
Created April 26, 2016 07:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save msgre/5e8d32e266bf8ff85a1063003d4796d5 to your computer and use it in GitHub Desktop.
Save msgre/5e8d32e266bf8ff85a1063003d4796d5 to your computer and use it in GitHub Desktop.
Homeworks parser
# -*- coding: utf-8 -*-
import re
import sys
import requests
import simplejson
from datetime import date
from pyquery import PyQuery as pq
# Base URL of the scraped site; site-relative paths are appended to it.
SITE_URL = 'http://www.zerotinova4b.estranky.cz'
# Matches a period written as "day.month. - day.month.year" (optional
# whitespace around the dots and the hyphen). Only the end date carries a
# year; the start date has no year group of its own.
TERM_RE = re.compile(r'(?P<day_from>\d+)\s*\.\s*(?P<month_from>\d+)\s*\.\s*-\s*(?P<day_to>\d+)\s*\.\s*(?P<month_to>\d+)\s*\.\s*(?P<year_to>\d+)', re.UNICODE)
# strftime format used when dates are written to the output (ISO 8601 date).
FMT = '%Y-%m-%d'
def get_last_plan():
    """
    Fetch the overview of articles in the "weekly plans" category and
    extract the link to the detail page of the first listed article
    (presumably the most recent one -- the site's ordering is not
    verified here).

    Returns: string with the URL (href) of the detail page, or None when
    the overview contains no article link.
    """
    # A timeout keeps the script from hanging forever on an unresponsive
    # server; requests waits indefinitely by default.
    r = requests.get('%s/clanky/tydenni-plany/' % SITE_URL, timeout=30)
    d = pq(r.content)
    anchors = d('#articles div.article h2 a')
    if not anchors:
        return None
    # .get() rather than ['href']: an anchor without an href yields None
    # (same "nothing found" result as above) instead of a KeyError.
    return anchors[0].attrib.get('href')
def parse_term(title):
    """
    Parse the period for which the homework is assigned out of the
    weekly plan title.

    title: string such as "12.5. - 18.5.2014"; None or an empty string is
    treated as "no period found".

    Returns: dict with the structure
        'date_from': datetime.date(2014, 5, 12)
        'date_to': datetime.date(2014, 5, 18)
    Both values are None when the title contains no recognizable period.

    Raises: ValueError when the matched numbers do not form a valid
    calendar date.
    """
    m = TERM_RE.search(title or '')
    if not m:
        return {'date_from': None, 'date_to': None}

    parts = dict((key, int(value)) for key, value in m.groupdict().items())
    date_to = date(parts['year_to'], parts['month_to'], parts['day_to'])

    # The title only carries the year of the end date. When the start
    # month/day falls later in the calendar than the end month/day, the
    # period spans New Year and the start belongs to the previous year.
    year_from = parts['year_to']
    if (parts['month_from'], parts['day_from']) > (parts['month_to'], parts['day_to']):
        year_from -= 1
    date_from = date(year_from, parts['month_from'], parts['day_from'])

    return {'date_from': date_from, 'date_to': date_to}
def parse_plan(url):
    """
    Parse the list of homework and the period out of a plan detail page.

    url: site-relative path of the detail page; it is appended to SITE_URL.

    Returns: None when the page has no table rows inside div.article,
    otherwise a dict with the structure
        'homeworks': list of dicts, one per table row that has <td> cells;
                     the keys 'type' and 'msg' are paired in order with
                     the stripped texts of the row's cells
        'date_from': 'YYYY-MM-DD' string, or None when the title holds no
                     recognizable period
        'date_to': 'YYYY-MM-DD' string, or None likewise
    """
    # A timeout keeps the script from hanging forever on an unresponsive
    # server; requests waits indefinitely by default.
    r = requests.get('%s%s' % (SITE_URL, url), timeout=30)
    d = pq(r.content)
    rows = d('div.article table tr')
    if not rows:
        return None

    homeworks = []
    for row in rows:
        cells = row.findall('td')
        if not cells:
            # Rows without any <td> (e.g. header rows) carry no homework.
            continue
        # cell.text is None when the cell is empty or starts with a child
        # element; treat that as an empty string instead of crashing.
        texts = [(cell.text or '').strip() for cell in cells]
        homeworks.append(dict(zip(['type', 'msg'], texts)))

    title = d('div.article h2 span.span-a-title')
    title = title[0].text if len(title) else ''
    # The span's text may be None; normalise so parse_term gets a string.
    term = parse_term(title or '')

    # parse_term yields None dates when the title has no period; pass that
    # through rather than calling strftime on None.
    date_from = term['date_from']
    date_to = term['date_to']
    return {
        'homeworks': homeworks,
        'date_from': date_from.strftime(FMT) if date_from is not None else None,
        'date_to': date_to.strftime(FMT) if date_to is not None else None,
    }
def save_json(filename, data):
    """
    Serialize the parsed data as JSON into the file with the given name.

    The file is opened in text write mode, so an existing file of that
    name is overwritten.
    """
    with open(filename, "wt") as out:
        simplejson.dump(data, out)
if __name__ == "__main__":
    # Script entry point: locate the first listed weekly plan, parse it and
    # store the result in ukoly.json. Each step may legitimately come back
    # empty; stop with an error message (exit status 1) instead of
    # requesting a bogus URL or overwriting the output file with "null".
    url = get_last_plan()
    if not url:
        sys.exit('No weekly plan article found on %s' % SITE_URL)
    data = parse_plan(url)
    if data is None:
        sys.exit('No homework table found on %s%s' % (SITE_URL, url))
    save_json('ukoly.json', data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment