@flibbertigibbet
Last active August 29, 2015 14:10
Fetch updated GTFS feeds for Massachusetts
#!/usr/bin/python
# Fetch updated GTFS feeds for Massachusetts transit agencies, keeping a
# pickled log of last-modified times so unchanged feeds are not re-fetched.
import csv
import datetime
import os
import pickle
import subprocess
import zipfile

import requests


class FeedFetcher(object):
    def __init__(self, ddir=None):
        # default to the current working directory, evaluated at call time
        # (a default argument of os.getcwd() would be frozen at class
        # definition time)
        self.ddir = ddir if ddir else os.getcwd()
        self.tc = {}  # time checks for GTFS fetches: file name -> last-modified
        self.new_use = []  # new feeds successfully downloaded and validated

    def verify(self, file_name):
        # file_name is local to the download directory
        f = os.path.join(self.ddir, file_name)
        if not os.path.isfile(f):
            print("File " + f + " not found; cannot verify it.")
            return False
        print("Validating feed in " + file_name + "...")
        try:
            p = subprocess.Popen(['feedvalidator.py', '--output=CONSOLE',
                                  '-m', '-n', f], stdout=subprocess.PIPE)
            out = p.communicate()
            res = out[0].split('\n')
            errct = res[-2]  # output line with the count of errors/warnings
            if errct.find('error') > -1:
                print("Feed validator found errors in " + file_name + ": " +
                      errct + ".")
                return False
            elif out[0].find('this feed is in the future,') > -1:
                print("Feed validator found GTFS not in service until future for " +
                      file_name + ".")
                return False
            else:
                if errct.find('successfully') > -1:
                    print("Feed " + file_name + " looks great: " + errct + ".")
                else:
                    # have warnings; strip the summary line's prefix
                    print("Feed " + file_name + " looks ok: " + errct[7:] + ".")
                return True
        except Exception:
            print("Failed to run feed validator on GTFS " + file_name + ".")
            return False
        print("How did we get here? In GTFS validation for " + file_name + ".")
        return False  # shouldn't get here
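
    # NOTE: the parsing in verify() assumes the console output format of
    # feedvalidator.py from the transitfeed package, whose second-to-last
    # line is taken to be a summary holding the error/warning count; if
    # that format differs, res[-2] and the substring checks need adjusting.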

    def verify_all(self):
        # just run verify on all feeds listed in the log file
        time_check = os.path.join(self.ddir, 'time_checks.p')
        if os.path.isfile(time_check):
            with open(time_check, 'rb') as tcf:
                self.tc = pickle.load(tcf)
            print("Loaded time check file.")
            if 'last_check' in self.tc:
                last_check = self.tc['last_check']
                print("Last check: ")
                print(last_check)
                timedelt = datetime.datetime.now() - last_check
                print("Time since last check: ")
                print(timedelt)
            else:
                print("Couldn't find last check time in log file; that's odd.")
            gtfs = dict(self.tc)  # copy, so 'last_check' isn't dropped from self.tc
            gtfs.pop('last_check', None)
            for g in gtfs:
                if g.endswith('.zip'):
                    if g.startswith('septa'):
                        # SEPTA publishes separate bus and rail feeds
                        self.verify('google_bus.zip')
                        self.verify('google_rail.zip')
                    else:
                        self.verify(g)
                else:
                    print("What is " + g + "? That doesn't look like a GTFS file name.")
            print("All done verifying!")
        else:
            print("No log file found! Can't verify GTFS.")

    def check_header_newer(self, url, file_name):
        # return 1 if a newer file is available to download;
        # return 0 if info is missing;
        # return -1 if the current file is the most recent.
        if file_name in self.tc:
            last_info = self.tc.get(file_name)
            hdr = requests.head(url).headers
            if hdr.get('last-modified'):
                last_mod = hdr.get('last-modified')
                if last_mod == last_info:
                    print("No new download available for " + file_name + ".")
                    return -1
                else:
                    print("New download available for " + file_name + ".")
                    print("Last downloaded: " + last_info + ".")
                    print("New download posted: " + last_mod + ".")
                    return 1
            else:
                print("No last-modified header set for " + file_name +
                      " download link.")
                return 0
        else:
            print("Time check entry for " + file_name + " not found.")
            return 0
        # shouldn't happen
        print("How did we get here? Failed checking header info.")
        return 0
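
    # Alternative sketch (not what this script does): let the server compare
    # timestamps via an HTTP conditional request. If-Modified-Since is
    # standard HTTP, though a 304 reply depends on the server honoring it:
    #
    #   resp = requests.get(url, headers={'If-Modified-Since': last_info})
    #   if resp.status_code == 304:
    #       pass  # not modified; skip the download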

    def get_stream(self, url, file_name, do_stream=True, session=None,
                   do_verify=True):
        if self.check_header_newer(url, file_name) == -1:
            return False
        # file_name is local to the download directory
        f = os.path.join(self.ddir, file_name)
        print("Getting file " + f + "...")
        if not session:
            stream = requests.get(url, stream=do_stream)
        else:
            stream = session.get(url, stream=do_stream)
        if stream.ok:
            with open(f, 'wb') as stream_file:
                if do_stream:
                    # read in chunks; requests' default chunk size is tiny
                    for chunk in stream.iter_content(chunk_size=8192):
                        stream_file.write(chunk)
                else:
                    stream_file.write(stream.content)
            info = os.stat(f)
            if info.st_size < 10000:
                # a file smaller than 10K may not be a GTFS; just warn
                print('Warning:')
                print("Download for " + f + " is only " + str(info.st_size) +
                      " bytes.")
                print("It may not be a valid GTFS.")
            if not zipfile.is_zipfile(f):
                print("BAD DOWNLOAD FOR " + f + ".")
                print("Download for " + f + " is not a zip file.")
                return False
            if stream.headers.get('last-modified'):
                self.tc[file_name] = stream.headers.get('last-modified')
            else:
                # format like a last-modified header (RFC 1123 date)
                self.tc[file_name] = datetime.datetime.utcnow().strftime(
                    "%a, %d %b %Y %H:%M:%S GMT")
            print("Download completed successfully.")
            # verify the download
            if do_verify:
                if self.verify(file_name):
                    print("GTFS verification succeeded.")
                    self.new_use.append(file_name)
                    return True
                else:
                    print("GTFS verification failed.")
                    return False
            else:
                print("Skipping GTFS verification in get_stream.")
                # not adding to new_use here; do that elsewhere
                return True
        else:
            print("DOWNLOAD FAILED FOR " + f + ".")
            return False
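
    # Example usage (a sketch; 'ff' is a hypothetical FeedFetcher instance
    # and the URL a placeholder). Pass a requests.Session when several
    # downloads should share cookies and connections:
    #
    #   s = requests.Session()
    #   ff.get_stream('http://example.com/gtfs.zip', 'example.zip',
    #                 do_stream=True, session=s)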

    def fetch(self):
        # pickled log of when each feed was last downloaded
        time_check = os.path.join(self.ddir, 'time_checks.p')
        if os.path.isfile(time_check):
            with open(time_check, 'rb') as tcf:
                self.tc = pickle.load(tcf)
            print("Loaded time check file.")
            if 'last_check' in self.tc:
                last_check = self.tc['last_check']
                print("Last check: ")
                print(last_check)
                timedelt = datetime.datetime.now() - last_check
                print("Time since last check: ")
                print(timedelt)
        else:
            print("Will create new time check file.")
        self.tc['last_check'] = datetime.datetime.now()

        ####### MBTA AND MA REGIONAL #######
        # MBTA: http://www.mbta.com/rider_tools/developers/
        # MA REGIONAL: http://www.massdot.state.ma.us/developersdata.aspx
        ###########################
        ma_feeds = {
            'mbta': 'http://www.mbta.com/uploadedfiles/MBTA_GTFS.zip',
            'berkshire': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/brta_google_transit.zip',
            'brockton': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/bat_google_transit.zip',
            'cape_ann': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/cata_google_transit.zip',
            'cape_cod': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/ccrta_google_transit.zip',
            'franklin': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/frta_google_transit.zip',
            'attleboro': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/gatra_google_transit.zip',
            'lowell': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/lrta_google_transit.zip',
            'merrimack': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mvrta_google_transit.zip',
            'metrowest': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mwrta_google_transit.zip',
            'montachusett': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/mart_google_transit.zip',
            'nantucket': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/nrta_google_transit.zip',
            'pioneer': 'http://www.pvta.com/g_trans/google_transit.zip',
            'southeastern': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/srta_google_transit.zip',
            'vineyard': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/vta_google_transit.zip',
            'worcester': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/wrta_google_transit.zip',
            'ferries': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/ferries_google_transit.zip'
        }
        # The feeds for the private bus companies are full of errors; omitting these for now.
        # 'bloom_tours': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/Bloom_google_transit.zip',
        # 'boston_express': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/boston_express_google_transit.zip',
        # 'coach_bus': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/coach_google_transit.zip',
        # 'dattco': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/dattco_google_transit.zip',
        # 'peter_pan': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/peter_pan_google_transit.zip',
        # 'plymouth_brockton': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/PB_google_transit.zip',
        # 'yankee': 'http://www.massdot.state.ma.us/Portals/0/docs/developers/yankee_google_transit.zip'
        for source in ma_feeds:
            print('Going to check for %s feed...' % source)
            filename = '%s.zip' % source
            self.get_stream(ma_feeds[source], filename, do_stream=True)
        ###########################
        print("Downloading finished. Writing time check file...")
        with open(time_check, 'wb') as tcf:
            pickle.dump(self.tc, tcf)
        print("Time check file written.")
        print("Writing 'new_use.csv', file of validated new downloads...")
        # 'wb' mode: under Python 2, the csv module expects a binary file
        with open('new_use.csv', 'wb') as nu:
            nuw = csv.writer(nu)
            for n in self.new_use:
                print("Got new GTFS " + n)
                nuw.writerow([n])
        print("Done writing 'new_use.csv'.")
        print("All done!")

###############################
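
# A minimal entry point (a sketch: the gist defines FeedFetcher but never
# instantiates it, so a caller presumably imports it or runs something like
# the following):
if __name__ == '__main__':
    fetcher = FeedFetcher()  # download to the current working directory
    fetcher.fetch()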