ian-weisser/load_table

## load_table
def map_gtfs_table_to_dict(gtfs_file, table_name):
    """
    Read one table from a GTFS feed and map it into a dict of dicts.

    Each data line of the table becomes a sub-dict that maps column name
    to the (string) value of that column. The sub-dicts are collected in
    a container dict under a unique key.

    Many GTFS tables include a unique value (Example: trip_id in the
    trips.txt table) that this function automatically uses as the
    dict key; that column is removed from the sub-dict. If a table has no
    unique key (Example: calendar_dates.txt), if the table is not one of
    the names known to this function, or if the expected key column is
    missing from the header, the key is an incrementing integer row
    counter starting at 0.

    WARNING: This function may run out of memory on a _large_ table.
    Example: stop_times.txt from Chicago is routinely 30MB compressed,
    182MB uncompressed, and almost 1GB exploded into a dict in RAM by this
    function.

    NOTE: Rows that share the same key value overwrite each other; only
    the last one is kept. This matters for shapes.txt, which is keyed by
    shape_id and typically has many rows per shape_id.

    Inputs:
    gtfs_file should be a zipfile object, not a file path or raw data:
      gtfs_file = zipfile.ZipFile(gtfs_path, mode='r')

    table_name should match one within the zipfile. It should match one of:
      valid_names = gtfs_file.namelist()

    Output is a bunch of dicts (one dict per data line) nested within
      a container dict. All values are strings with surrounding spaces
      and double quotes removed. Lines with fewer fields than the header
      (blank or truncated lines) are skipped.

    Example GTFS data:
    service_id,monday,tuesday,wednesday,thursday,friday,saturday,
      sunday,start_date,end_date
    43301,1,1,1,1,1,0,0,20140515,20140518
    43302,1,1,1,1,0,0,0,20140515,20140518
    43303,0,1,1,1,1,0,0,20140515,20140518

    Example usage:
    >>> import zipfile
    >>> gtfs_path      = '20140515.cta.gtfs'
    >>> gtfs_file      = zipfile.ZipFile(gtfs_path, mode='r')
    >>> table_name     = 'calendar.txt'
    >>> map_gtfs_table_to_dict(gtfs_file, table_name)
    {'43301': {'monday':'1', 'tuesday':'1', ... },
     '43302': {'monday':'1', 'tuesday':'1', ... },
     '43303': {'monday':'0', 'tuesday':'1', ... }, }
    >>> gtfs_file.close()
    """

    # Imported locally so the function does not depend on module-level
    # imports that may not exist in the surrounding file.
    import csv
    import io

    # Column used as the container-dict key, by table name.
    # None means "no usable unique column; generate a row counter".
    keys = {
        'agency.txt'        :'agency_id', 'calendar.txt'       :'service_id',
        'calendar_dates.txt': None,       'fare_attributes.txt':'fare_id',
        'fare_rules.txt'    : None,       'feed_info.txt'      : None,
        'frequencies.txt'   : None,       'routes.txt'         :'route_id',
        'shapes.txt'        :'shape_id',  'stops.txt'          :'stop_id',
        'stop_times.txt'    : None,       'transfers.txt'      : None,
        'trips.txt'         :'trip_id' }

    table_data = {}

    # Read the table file. 'utf-8-sig' decodes plain UTF-8 and also drops
    # a leading byte-order mark, which would otherwise become part of the
    # first column name.

    with gtfs_file.open(table_name, mode='r') as infile:
        lines_string = infile.read().decode('utf-8-sig')

    # Let the csv module split lines and fields: it copes with any line
    # ending and with quoted fields that contain commas, which a plain
    # split(',') would break into extra columns.

    reader = csv.reader(io.StringIO(lines_string, newline=''),
                        skipinitialspace=True)

    # Parse the header, mapping each column heading to its field index.
    # If a heading is repeated, the first occurrence wins.

    header = [field_name.strip('" ') for field_name in next(reader, [])]
    if not header:                       # Empty file or blank header line
        return table_data

    columns = {}
    for index, field_name in enumerate(header):
        columns.setdefault(field_name, index)

    # Unknown tables, and tables whose key column is absent from the
    # header (the column may be optional), fall back to a generated key.

    key_column = keys.get(table_name)
    if key_column not in columns:
        key_column = None

    counter = -1

    # Iterate through each line of data, converting line into dict

    for fields in reader:

        if len(fields) < len(header):    # Non-data lines
            continue

        # Stripping quotes as well as spaces keeps the output identical
        # to what callers received before the csv module was used.

        line_dict = {field_name: fields[index].strip('" ')
                     for field_name, index in columns.items()}

        if key_column is not None:       # Has key
            table_data[line_dict.pop(key_column)] = line_dict
        else:                            # Generate key
            counter += 1
            table_data[counter] = line_dict

    return table_data
	def map_gtfs_table_to_dict(gtfs_file, table_name):
		"""
		Read one table from a GTFS feed and map it into a dict of dicts.

		Each data line of the table becomes a sub-dict that maps column name
		to the (string) value of that column. The sub-dicts are collected in
		a container dict under a unique key.

		Many GTFS tables include a unique value (Example: trip_id in the
		trips.txt table) that this function automatically uses as the
		dict key; that column is removed from the sub-dict. If a table has no
		unique key (Example: calendar_dates.txt), if the table is not one of
		the names known to this function, or if the expected key column is
		missing from the header, the key is an incrementing integer row
		counter starting at 0.

		WARNING: This function may run out of memory on a _large_ table.
		Example: stop_times.txt from Chicago is routinely 30MB compressed,
		182MB uncompressed, and almost 1GB exploded into a dict in RAM by this
		function.

		NOTE: Rows that share the same key value overwrite each other; only
		the last one is kept. This matters for shapes.txt, which is keyed by
		shape_id and typically has many rows per shape_id.

		Inputs:
		gtfs_file should be a zipfile object, not a file path or raw data:
		  gtfs_file = zipfile.ZipFile(gtfs_path, mode='r')

		table_name should match one within the zipfile. It should match one of:
		  valid_names = gtfs_file.namelist()

		Output is a bunch of dicts (one dict per data line) nested within
		  a container dict. All values are strings with surrounding spaces
		  and double quotes removed. Lines with fewer fields than the header
		  (blank or truncated lines) are skipped.

		Example GTFS data:
		service_id,monday,tuesday,wednesday,thursday,friday,saturday,
		  sunday,start_date,end_date
		43301,1,1,1,1,1,0,0,20140515,20140518
		43302,1,1,1,1,0,0,0,20140515,20140518
		43303,0,1,1,1,1,0,0,20140515,20140518

		Example usage:
		>>> import zipfile
		>>> gtfs_path = '20140515.cta.gtfs'
		>>> gtfs_file = zipfile.ZipFile(gtfs_path, mode='r')
		>>> table_name = 'calendar.txt'
		>>> map_gtfs_table_to_dict(gtfs_file, table_name)
		{'43301': {'monday':'1', 'tuesday':'1', ... },
		'43302': {'monday':'1', 'tuesday':'1', ... },
		'43303': {'monday':'0', 'tuesday':'1', ... }, }
		>>> gtfs_file.close()
		"""

		# Imported locally so the function does not depend on module-level
		# imports that may not exist in the surrounding file.
		import csv
		import io

		# Column used as the container-dict key, by table name.
		# None means "no usable unique column; generate a row counter".
		keys = {
			'agency.txt': 'agency_id', 'calendar.txt': 'service_id',
			'calendar_dates.txt': None, 'fare_attributes.txt': 'fare_id',
			'fare_rules.txt': None, 'feed_info.txt': None,
			'frequencies.txt': None, 'routes.txt': 'route_id',
			'shapes.txt': 'shape_id', 'stops.txt': 'stop_id',
			'stop_times.txt': None, 'transfers.txt': None,
			'trips.txt': 'trip_id' }

		table_data = {}

		# Read the table file. 'utf-8-sig' decodes plain UTF-8 and also drops
		# a leading byte-order mark, which would otherwise become part of the
		# first column name.

		with gtfs_file.open(table_name, mode='r') as infile:
			lines_string = infile.read().decode('utf-8-sig')

		# Let the csv module split lines and fields: it copes with any line
		# ending and with quoted fields that contain commas, which a plain
		# split(',') would break into extra columns.

		reader = csv.reader(io.StringIO(lines_string, newline=''),
			skipinitialspace=True)

		# Parse the header, mapping each column heading to its field index.
		# If a heading is repeated, the first occurrence wins.

		header = [field_name.strip('" ') for field_name in next(reader, [])]
		if not header: # Empty file or blank header line
			return table_data

		columns = {}
		for index, field_name in enumerate(header):
			columns.setdefault(field_name, index)

		# Unknown tables, and tables whose key column is absent from the
		# header (the column may be optional), fall back to a generated key.

		key_column = keys.get(table_name)
		if key_column not in columns:
			key_column = None

		counter = -1

		# Iterate through each line of data, converting line into dict

		for fields in reader:

			if len(fields) < len(header): # Non-data lines
				continue

			# Stripping quotes as well as spaces keeps the output identical
			# to what callers received before the csv module was used.

			line_dict = {}
			for field_name, index in columns.items():
				line_dict[field_name] = fields[index].strip('" ')

			if key_column is not None: # Has key
				table_data[line_dict.pop(key_column)] = line_dict
			else: # Generate key
				counter += 1
				table_data[counter] = line_dict

		return table_data