-
-
Save copy/f0642d4782110823ffdb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import re | |
import urllib | |
import time | |
import HTMLParser | |
import os | |
# Disabled: creating a directory at /tmp/bahn/. The output further below is
# written to a plain file at /tmp/bahn, which would clash with a directory
# of the same name.
# try:
#     os.mkdir('/tmp/bahn/')
# except:
#     pass

# Departure board (boardType=dep) of station Wattenscheid-Hoentrop,
# restricted to trains named S1 (REQTrain_name=S1), at the current time.
url = "http://reiseauskunft.bahn.de/bin/bhftafel.exe/dn?ld=&country=DEU&rt=1&input=Wattenscheid-H%F6ntrop%238006227&boardType=dep&time=actual&productsFilter=11111&REQTrain_name=S1&start=yes"

# Download the raw HTML of the board. try/finally guarantees the connection
# is closed even when read() raises.
handle = urllib.urlopen(url)
try:
    source = handle.read()
finally:
    handle.close()

# Collected (arrival_time, dest, stations, platform, status) tuples.
next_trains = []

# Shared parser instance; only its unescape() method is used.
parser = HTMLParser.HTMLParser()
def remove_html(str):
    """Return the plain text of an HTML fragment.

    Every tag is replaced by a single space, entities are unescaped with
    the module-level ``parser``, leading and trailing whitespace is
    stripped, German umlauts are spelled out (ue/Ue/ae/Ae/oe/Oe) and runs
    of spaces or tabs are collapsed into one space. Newlines inside the
    text are kept.
    """
    # (umlaut, ASCII spelling) pairs, applied in this order.
    umlauts = (
        (u'\xfc', 'ue'),
        (u'\xdc', 'Ue'),
        (u'\xe4', 'ae'),
        (u'\xc4', 'Ae'),
        (u'\xf6', 'oe'),
        (u'\xD6', 'Oe'),
    )
    without_tags = re.sub('<[^>]*>', ' ', str)
    text = parser.unescape(without_tags).strip()
    for umlaut, spelling in umlauts:
        text = text.replace(umlaut, spelling)
    return re.sub('[ \t]+', ' ', text)
def abbreviate_station(str):
    """Return the station name with long parts abbreviated.

    'Wattenscheid' becomes 'Wat', 'Duesseldorf' becomes "D'dorf" and the
    suffix ' Hbf' is dropped. The replacements are applied in that order.
    """
    shortenings = (
        ('Wattenscheid', 'Wat'),
        ('Duesseldorf', 'D\'dorf'),
        (' Hbf', ''),
    )
    name = str
    for long_form, short_form in shortenings:
        name = name.replace(long_form, short_form)
    return name
def abbreviate_status(str):
    """Return a shortened version of a train status text.

    Every occurrence of 'ca.' is removed together with the two characters
    that follow it, and the phrase ' , Grund:' is replaced by ';'.
    """
    # Raw string: '\.' is not a valid escape in a normal string literal and
    # triggers a warning on newer Python versions. The pattern is unchanged.
    str = re.sub(r'ca\...', '', str)
    return str.replace(' , Grund:', ';')
# Parse the board: the page is cut at every time cell, so each chunk starts
# with one departure row. The first two chunks and the last one are skipped.
for single_train in source.split('<td class="time">')[2:-1]:
    time_str = re.search(r'(\d\d:\d\d)', single_train)
    dest_html = re.search(r'<td class="route">(.*?)</td>', single_train, re.S)
    platform_html = re.search(r'<td class="platform">(.*?)</td>', single_train, re.S)
    status_html = re.search(r'<td class="ris">(.*?)</td>', single_train, re.S)

    # Time, route and platform are mandatory; report and skip rows that
    # lack any of them. (A parenthesised single argument prints the same
    # under the Python 2 print statement.)
    if not all([time_str, dest_html, platform_html]):
        print('= INVALID ENTRY =')
        print(time_str)
        print(dest_html)
        print(platform_html)
        print(status_html)
        print('')
        continue

    # The status cell is optional.
    if status_html:
        status = remove_html(status_html.group(1))
    else:
        status = '(no status)'

    arrival_time = time.strptime(time_str.group(1), '%H:%M')

    # First line of the route cell is the destination, the remainder the
    # intermediate stations. partition() never raises: a cell without a
    # newline simply yields an empty station list instead of a ValueError.
    dest, _, stations = remove_html(dest_html.group(1)).partition('\n')
    platform = remove_html(platform_html.group(1))

    next_trains.append((arrival_time, dest, stations, platform, status))
# Write the formatted departures to /tmp/bahn and echo each row to stdout.
# The with-statement closes the file even if formatting a row raises.
with open('/tmp/bahn', 'w+') as out:
    # Any non-empty result is listed; a single departure must not be
    # reported as "No trains found".
    if next_trains:
        # Width of the longest abbreviated destination, used to right-align
        # the destination column.
        dest_max_width = max(len(abbreviate_station(dest)) for _, dest, _, _, _ in next_trains)
        for arrival, dest, stations, platform, status in next_trains:
            columns = [
                time.strftime('%H:%M', arrival),
                abbreviate_station(dest).rjust(dest_max_width).encode('utf8'),
                abbreviate_status(status.encode('utf8')),
            ]
            text = ' | '.join(columns)
            out.write(text + '\n')
            print(text)
    else:
        out.write("No trains found")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment