Last active
September 15, 2016 21:39
-
-
Save tomverran/85ace6720bd6cf7af0e8f747c82c0a17 to your computer and use it in GitHub Desktop.
National rail FTP timetable downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import StringIO
import gzip
import json
import re
import tempfile
import time
import xml.etree.cElementTree as ET
from contextlib import closing
from ftplib import FTP

import boto3
# Downloads the newest 'v8' timetable from the National Rail FTP data feed,
# picks out the journeys that start or end at the stops listed in `tofind`,
# and stores their uids as a JSON array in S3.

# Journey endpoints to look for. '<tpl> -> <ptd>' is compared against a
# journey's first child element and '<tpl> <- <pta>' against its last one.
# A frozenset keeps the per-journey membership tests O(1).
tofind = frozenset([
    'BCKNHMJ <- 08:32',
    'BCKNHMJ -> 08:41',
    'BCKNHMJ <- 08:19',
])


def _find_journey_uids(xml_stream, wanted):
    """Return the ``uid`` attribute of every matching Journey element.

    A journey matches when its first child yields a '<tpl> -> <ptd>' key, or
    its last child yields a '<tpl> <- <pta>' key, that is present in
    ``wanted``. Journeys missing any of those four attributes are skipped,
    as are journeys that have no child elements at all.

    NOTE(review): tpl/ptd/pta presumably mean timing point location and
    public departure/arrival times -- confirm against the feed schema.

    :param xml_stream: readable binary file object containing the XML.
    :param wanted: container of endpoint keys to match against.
    :returns: list of uid values, in document order.
    """
    uids = []
    for _event, elem in ET.iterparse(xml_stream):
        if not elem.tag.endswith('Journey'):
            continue
        stops = list(elem)
        if stops:  # an empty Journey has no endpoints to compare
            first, last = stops[0], stops[-1]
            start, departs = first.get('tpl'), first.get('ptd')
            end, arrives = last.get('tpl'), last.get('pta')
            if start and end and departs and arrives:
                skey = start + ' -> ' + departs
                dkey = end + ' <- ' + arrives
                if skey in wanted or dkey in wanted:
                    uids.append(elem.get('uid'))
        # Only finished journeys are cleared: clearing every element as it
        # ends would wipe the children's attributes before the enclosing
        # Journey is inspected.
        elem.clear()
    return uids


client = boto3.client('s3')
s3 = boto3.resource('s3')

# The FTP password is kept in a private bucket rather than in the source.
resp = client.get_object(Bucket='tv-private', Key='national-rail-ftp.txt')
password = resp["Body"].read().decode('utf-8')

with tempfile.TemporaryFile() as timetable_gz:
    # closing() guarantees the connection is released even if the listing or
    # the download fails part-way through.
    with closing(FTP('datafeeds.nationalrail.co.uk')) as ftp:
        ftp.login(user='ftpuser', passwd=password)
        files = ftp.nlst()
        filename = next((f for f in files if f.endswith('v8.xml.gz')), None)
        if filename is None:
            raise RuntimeError(
                'no file ending in v8.xml.gz found on the FTP server')
        ftp.retrbinary('RETR ' + filename, timetable_gz.write)

    # Rewind so the gzip reader starts from the first downloaded byte.
    timetable_gz.seek(0)
    with gzip.GzipFile(fileobj=timetable_gz, mode='rb') as timetable:
        ids = _find_journey_uids(timetable, tofind)

s3.Object('tv-timetables', 'trains.json').put(Body=json.dumps(ids))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment