Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Parse a timetable from a schedule HTML page.
def parse_schedule_page(content):
    """Parse a timetable HTML page into a schedule dictionary.

    The cells matched by ``//table[@class="reqform"]/tr/td`` are read as
    follows: the first cell supplies the description (the texts of its
    ``<h3>`` elements joined with single spaces); after that, cells are
    taken in pairs -- a heading cell whose first ``<h2>`` holds the waypoint
    title, followed by a content cell whose nested ``table/tr/td`` cells
    carry ``<span class="hour">`` and ``<span class="minutes">`` children.
    The last two cells of the table are never read.

    Args:
        content: HTML source of the page, passed unchanged to
            ``lxml.html.document_fromstring``.

    Returns:
        A dict ``{'description': str, 'waypoints': list}``. Each waypoint is
        ``{'title': str, 'times': list}`` where every time is the string
        ``'<hour>:<minutes>'``. ``waypoints`` is empty when fewer than four
        cells were matched.

    Raises:
        IndexError: if a heading cell contains no ``<h2>`` element.
    """
    doc = lxml.html.document_fromstring(content)
    rows = doc.xpath('//table[@class="reqform"]/tr/td')
    # Debug output; this form prints the same text on Python 2 and Python 3.
    print("Rows: %d" % len(rows))

    description = ''
    if rows:
        description = " ".join(
            heading.text_content().strip()
            for heading in rows[0].xpath('.//h3')
        )

    schedule = {
        'description': description,
        'waypoints': [],
    }
    if len(rows) < 4:
        return schedule

    # Cell 0 is the description; the final two cells are skipped, so the
    # heading/content pairs live at (1, 2), (3, 4), ...
    for r in range(1, len(rows) - 2, 2):
        title = rows[r].xpath('.//h2')[0].text_content().strip()

        times = []
        hour = ""
        for td in rows[r + 1].xpath('./table/tr/td'):
            hour_span = td.xpath('./span[@class="hour"]')
            if hour_span:
                # The hour carries over to following cells until replaced.
                hour = hour_span[0].text_content().strip()
            if not hour:
                # Minutes without a known hour cannot form a valid time.
                continue
            for span in td.xpath('./span[@class="minutes"]'):
                minutes = span.text_content().strip()
                if minutes:
                    times.append('%s:%s' % (hour, minutes))

        schedule['waypoints'].append({
            'title': title,
            'times': times,
        })

    return schedule
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment