@flibbertigibbet
Last active August 29, 2015 14:10
Fetch updated GTFS feeds for Massachusetts
#!/usr/bin/python
# Fetch updated GTFS feeds for Massachusetts transit agencies, keeping a
# pickled log of last-modified times so unchanged feeds are not re-fetched.
import csv
import datetime
import os
import pickle
import subprocess
import zipfile

import requests


class FeedFetcher(object):
    def __init__(self, ddir=None):
        # default to the current working directory, evaluated at call time
        # (a default argument of os.getcwd() would be frozen at class
        # definition time)
        self.ddir = ddir if ddir else os.getcwd()
        self.tc = {}  # time checks for GTFS fetches: file name -> last-modified
        self.new_use = []  # new feeds successfully downloaded and validated

    def verify(self, file_name):
        # file_name is local to the download directory
        f = os.path.join(self.ddir, file_name)
        if not os.path.isfile(f):
            print("File " + f + " not found; cannot verify it.")
            return False
        print("Validating feed in " + file_name + "...")
        try:
            p = subprocess.Popen(['feedvalidator.py', '--output=CONSOLE',
                                  '-m', '-n', f], stdout=subprocess.PIPE)
            out = p.communicate()
            res = out[0].split('\n')
            errct = res[-2]  # output line with the count of errors/warnings
            if errct.find('error') > -1:
                print("Feed validator found errors in " + file_name + ": " +
                      errct + ".")
                return False
            elif out[0].find('this feed is in the future,') > -1:
                print("Feed validator found GTFS not in service until future for " +
                      file_name + ".")
                return False
            else:
                if errct.find('successfully') > -1:
                    print("Feed " + file_name + " looks great: " + errct + ".")
                else:
                    # have warnings; strip the summary line's prefix
                    print("Feed " + file_name + " looks ok: " + errct[7:] + ".")
                return True
        except Exception:
            print("Failed to run feed validator on GTFS " + file_name + ".")
            return False
        print("How did we get here? In GTFS validation for " + file_name + ".")
        return False  # shouldn't get here
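
    # NOTE: the parsing in verify() assumes the console output format of
    # feedvalidator.py from the transitfeed package, whose second-to-last
    # line is taken to be a summary holding the error/warning count; if
    # that format differs, res[-2] and the substring checks need adjusting.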

    def verify_all(self):
        # just run verify on all feeds listed in the log file
        time_check = os.path.join(self.ddir, 'time_checks.p')
        if os.path.isfile(time_check):
            with open(time_check, 'rb') as tcf:
                self.tc = pickle.load(tcf)
            print("Loaded time check file.")
            if 'last_check' in self.tc:
                last_check = self.tc['last_check']
                print("Last check: ")
                print(last_check)
                timedelt = datetime.datetime.now() - last_check
                print("Time since last check: ")
                print(timedelt)
            else:
                print("Couldn't find last check time in log file; that's odd.")
            gtfs = dict(self.tc)  # copy, so 'last_check' isn't dropped from self.tc
            gtfs.pop('last_check', None)
            for g in gtfs:
                if g.endswith('.zip'):
                    if g.startswith('septa'):
                        # SEPTA publishes separate bus and rail feeds
                        self.verify('google_bus.zip')
                        self.verify('google_rail.zip')
                    else:
                        self.verify(g)
                else:
                    print("What is " + g + "? That doesn't look like a GTFS file name.")
            print("All done verifying!")
        else:
            print("No log file found! Can't verify GTFS.")

    def check_header_newer(self, url, file_name):
        # return 1 if a newer file is available to download;
        # return 0 if info is missing;
        # return -1 if the current file is the most recent.
        if file_name in self.tc:
            last_info = self.tc.get(file_name)
            hdr = requests.head(url).headers
            if hdr.get('last-modified'):
                last_mod = hdr.get('last-modified')
                if last_mod == last_info:
                    print("No new download available for " + file_name + ".")
                    return -1
                else:
                    print("New download available for " + file_name + ".")
                    print("Last downloaded: " + last_info + ".")
                    print("New download posted: " + last_mod + ".")
                    return 1
            else:
                print("No last-modified header set for " + file_name +
                      " download link.")
                return 0
        else:
            print("Time check entry for " + file_name + " not found.")
            return 0
        # shouldn't happen
        print("How did we get here? Failed checking header info.")
        return 0
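
    # Alternative sketch (not what this script does): let the server compare
    # timestamps via an HTTP conditional request. If-Modified-Since is
    # standard HTTP, though a 304 reply depends on the server honoring it:
    #
    #   resp = requests.get(url, headers={'If-Modified-Since': last_info})
    #   if resp.status_code == 304:
    #       pass  # not modified; skip the download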

    def get_stream(self, url, file_name, do_stream=True, session=None,
                   do_verify=True):
        if self.check_header_newer(url, file_name) == -1:
            return False
        # file_name is local to the download directory
        f = os.path.join(self.ddir, file_name)
        print("Getting file " + f + "...")
        if not session:
            stream = requests.get(url, stream=do_stream)
        else:
            stream = session.get(url, stream=do_stream)
        if stream.ok:
            with open(f, 'wb') as stream_file:
                if do_stream:
                    # read in chunks; requests' default chunk size is tiny
                    for chunk in stream.iter_content(chunk_size=8192):
                        stream_file.write(chunk)
                else:
                    stream_file.write(stream.content)
            info = os.stat(f)
            if info.st_size < 10000:
                # a file smaller than 10K may not be a GTFS; just warn
                print('Warning:')
                print("Download for " + f + " is only " + str(info.st_size) +
                      " bytes.")
                print("It may not be a valid GTFS.")
            if not zipfile.is_zipfile(f):
                print("BAD DOWNLOAD FOR " + f + ".")
                print("Download for " + f + " is not a zip file.")
                return False
            if stream.headers.get('last-modified'):
                self.tc[file_name] = stream.headers.get('last-modified')
            else:
                # format like a last-modified header (RFC 1123 date)
                self.tc[file_name] = datetime.datetime.utcnow().strftime(
                    "%a, %d %b %Y %H:%M:%S GMT")
            print("Download completed successfully.")
            # verify the download
            if do_verify:
                if self.verify(file_name):
                    print("GTFS verification succeeded.")
                    self.new_use.append(file_name)
                    return True
                else:
                    print("GTFS verification failed.")
                    return False
            else:
                print("Skipping GTFS verification in get_stream.")
                # not adding to new_use here; do that elsewhere
                return True
        else:
            print("DOWNLOAD FAILED FOR " + f + ".")
            return False
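
    # Example usage (a sketch; 'ff' is a hypothetical FeedFetcher instance
    # and the URL a placeholder). Pass a requests.Session when several
    # downloads should share cookies and connections:
    #
    #   s = requests.Session()
    #   ff.get_stream('http://example.com/gtfs.zip', 'example.zip',
    #                 do_stream=True, session=s)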

    def fetch(self):
        # pickled log of when each feed was last downloaded
        time_check = os.path.join(self.ddir, 'time_checks.p')
        if os.path.isfile(time_check):
            with open(time_check, 'rb') as tcf:
                self.tc = pickle.load(tcf)
            print("Loaded time check file.")
            if 'last_check' in self.tc:
                last_check = self.tc['last_check']
                print("Last check: ")
                print(last_check)
                timedelt = datetime.datetime.now() - last_check
                print("Time since last check: ")
                print(timedelt)
        else:
            print("Will create new time check file.")
        self.tc['last_check'] = datetime.datetime.now()

        ####### MBTA AND MA REGIONAL #######
        # MBTA: http://www.mbta.com/rider_tools/developers/
        # MA REGIONAL: http://www.massdot.state.ma.us/developersdata.aspx
        ###########################
        ma_feeds = {
            'mbta': 'http://www.mbta.com/uploadedfiles/MBTA_GTFS.zip',
            'berkshire': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/brta_google_transit.zip',
            'brockton': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/bat_google_transit.zip',
            'cape_ann': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/cata_google_transit.zip',
            'cape_cod': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/ccrta_google_transit.zip',
            'franklin': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/frta_google_transit.zip',
            'attleboro': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/gatra_google_transit.zip',
            'lowell': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/lrta_google_transit.zip',
            'merrimack': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mvrta_google_transit.zip',
            'metrowest': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mwrta_google_transit.zip',
            'montachusett': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mart_google_transit.zip',
            'nantucket': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/nrta_google_transit.zip',
            'pioneer': 'http://www.pvta.com/g_trans/google_transit.zip',
            'southeastern': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/srta_google_transit.zip',
            'vineyard': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/vta_google_transit.zip',
            'worcester': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/wrta_google_transit.zip',
            'ferries': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/ferries_google_transit.zip'
        }
        # The feeds for the private bus companies are full of errors; omitting these for now.
        # 'bloom_tours': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/Bloom_google_transit.zip',
        # 'boston_express': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/boston_express_google_transit.zip',
        # 'coach_bus': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/coach_google_transit.zip',
        # 'dattco': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/dattco_google_transit.zip',
        # 'peter_pan': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/peter_pan_google_transit.zip',
        # 'plymouth_brockton': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/PB_google_transit.zip',
        # 'yankee': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/yankee_google_transit.zip'
        for source in ma_feeds:
            print('Going to check for %s feed...' % source)
            filename = '%s.zip' % source
            self.get_stream(ma_feeds[source], filename, do_stream=True)
        ###########################
        print("Downloading finished. Writing time check file...")
        with open(time_check, 'wb') as tcf:
            pickle.dump(self.tc, tcf)
        print("Time check file written.")
        print("Writing 'new_use.csv', file of validated new downloads...")
        # 'wb' mode: under Python 2, the csv module expects a binary file
        with open('new_use.csv', 'wb') as nu:
            nuw = csv.writer(nu)
            for n in self.new_use:
                print("Got new GTFS " + n)
                nuw.writerow([n])
        print("Done writing 'new_use.csv'.")
        print("All done!")

###############################
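
# A minimal entry point (a sketch: the gist defines FeedFetcher but never
# instantiates it, so a caller presumably imports it or runs something like
# the following):
if __name__ == '__main__':
    fetcher = FeedFetcher()  # download to the current working directory
    fetcher.fetch()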