# code source: GIS stack exchange: https://gis.stackexchange.com/questions/266897/how-to-get-around-the-1000-objectids-limit-on-arcgis-server?newreg=e085133f1a9d475b99c094e1c497a2a5
import numpy as np
import pandas as pd
import geopandas as gpd
import urllib.parse
import requests
def query_arcgis_feature_server(url_feature_server=''):
    '''
    This function downloads all of the features available on a given ArcGIS
    feature server. The function is written to bypass the limitations imposed
    by the online service, such as only returning up to 1,000 or 2,000 features
    at a time.

    Parameters
    ----------
    url_feature_server : string
        String containing the URL of the service API you want to query. It should
        end in a forward slash and look something like this:
        'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/'
        EV Justice40 URL is:
        'https://services2.arcgis.com/spDVOHcj11Fk80s4/ArcGIS/rest/services/DAC6/FeatureServer/4/'

    Returns
    -------
    geodata_final : gpd.GeoDataFrame
        This is a GeoDataFrame that contains all of the features from the
        Feature Server. After calling this function, the `geodata_final` object
        can be used to store the data on disk in several different formats
        including, but not limited to, Shapefile (.shp), GeoJSON (.geojson),
        GeoPackage (.gpkg), or PostGIS.
        See https://geopandas.org/en/stable/docs/user_guide/io.html#writing-spatial-data
        for more details.
    '''
    if url_feature_server == '':
        geodata_final = gpd.GeoDataFrame()
        return geodata_final

    # Fixing last character in case the URL provided didn't end in a
    # forward slash
    if url_feature_server[-1] != '/':
        url_feature_server = url_feature_server + '/'

    # Getting the layer definitions. This contains important info such as the
    # name of the column used as feature_ids/object_ids, among other things.
    layer_def = requests.get(url_feature_server + '?f=pjson').json()

    # The `objectIdField` is the column name used for the
    # feature_ids/object_ids
    fid_colname = layer_def['objectIdField']

    # The `maxRecordCount` tells us the maximum number of records this REST
    # API service can return at once. The code below is written such that we
    # perform multiple calls to the API, each one being short enough never to
    # go beyond this limit.
    record_count_max = layer_def['maxRecordCount']
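    # For illustration only (hypothetical values): the layer definition JSON
    # typically carries entries along the lines of
    #     "objectIdField": "OBJECTID",
    #     "maxRecordCount": 2000,
    # which are the two pieces of information used above.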
    # Part of the URL that specifically requests only the object IDs
    url_query_get_ids = (f'query?f=geojson&returnIdsOnly=true'
                         f'&where={fid_colname}+is+not+null')
    url_comb = url_feature_server + url_query_get_ids

    # Getting all the object IDs
    service_request = requests.get(url_comb)
    all_objectids = np.sort(service_request.json()['properties']['objectIds'])
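    # For illustration only: the returnIdsOnly response is expected to look
    # roughly like {"properties": {"objectIds": [1, 2, 3, ...]}} (hypothetical
    # IDs), which is why the IDs are read from ['properties']['objectIds'] above.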
    # This variable will store all the parts of the multiple queries. These
    # parts will, at the end, be concatenated into one large GeoDataFrame.
    geodata_parts = []

    # This part of the query is fixed and never actually changes
    url_query_fixed = 'query?f=geojson&outFields=*&where='

    # Identifying the largest query size allowed per request. This will dictate
    # how many queries will need to be made. We start the search at the max
    # record count, but that sometimes generates errors - the query might time
    # out because it's too big. If the test query times out, we shrink the
    # query size until the test query goes through without generating a
    # time-out error.
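    # As a worked example (hypothetical numbers): with a maxRecordCount of
    # 2,000, a timed-out test query would shrink the block size to 1,001,
    # then 501, then 251, and so on, following the int(block_size/2)+1 rule
    # used below.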
    block_size = min(record_count_max, len(all_objectids))
    worked = False
    while not worked:
        # Moving the "cursors" to their appropriate locations
        id_start = all_objectids[0]
        id_end = all_objectids[block_size-1]
        readable_query_string = (f'{fid_colname}>={id_start} '
                                 f'and {fid_colname}<={id_end}')
        url_query_variable = urllib.parse.quote(readable_query_string)
        url_comb = url_feature_server + url_query_fixed + url_query_variable
        url_get = requests.get(url_comb)
        if 'error' in url_get.json():
            block_size = int(block_size/2)+1
        else:
            geodata_part = gpd.read_file(url_get.text)
            geodata_parts.append(geodata_part.copy())
            worked = True

    # Performing the actual query to the API multiple times. This skips the
    # first few rows/features in the data because those rows were already
    # captured in the query performed in the code chunk above.
    for i in range(block_size, len(all_objectids), block_size):
        # Moving the "cursors" to their appropriate locations and finding the
        # limits of each block
        sub_list = all_objectids[i:i + block_size]
        id_start = sub_list[0]
        id_end = sub_list[-1]
        readable_query_string = (f'{fid_colname}>={id_start} '
                                 f'and {fid_colname}<={id_end}')

        # Encoding from readable text to URL
        url_query_variable = urllib.parse.quote(readable_query_string)

        # Constructing the full request URL
        url_comb = url_feature_server + url_query_fixed + url_query_variable

        # Actually performing the query and storing its results in a
        # GeoDataFrame
        geodata_part = gpd.read_file(url_comb, driver='GeoJSON')

        # Appending the result to `geodata_parts`
        if geodata_part.shape[0] > 0:
            geodata_parts.append(geodata_part)

    # Concatenating all of the query parts into one large GeoDataFrame
    geodata_final = (pd.concat(geodata_parts,
                               ignore_index=True)
                     .sort_values(by=fid_colname)
                     .reset_index(drop=True))
    # Checking if any object ID is missing
    ids_queried = set(geodata_final[fid_colname])
    for this_id in all_objectids:
        if this_id not in ids_queried:
            print('WARNING! The following ObjectID is missing from the final '
                  f'GeoDataFrame: ObjectID={this_id}')
    # Checking if any object ID is included twice
    geodata_temp = geodata_final[[fid_colname]].copy()
    geodata_temp['temp'] = 1
    geodata_temp = (geodata_temp
                    .groupby(fid_colname)
                    .agg({'temp': 'sum'})
                    .reset_index())
    geodata_temp = geodata_temp.loc[geodata_temp['temp'] > 1].copy()
    for i, this_id in enumerate(geodata_temp[fid_colname].values):
        n_times = geodata_temp['temp'].values[i]
        print('WARNING! The following ObjectID is included multiple times in '
              f'the final GeoDataFrame: ObjectID={this_id}\tOccurrences={n_times}')

    return geodata_final
# To download all of the data on a feature server, just call the
# query_arcgis_feature_server function defined above. It takes care of a couple
# of pitfalls, such as the maximum feature limit and requests that are so large
# they time out.
'''
Here's an example of how to download the data and store it into a shapefile on your disk:
url = 'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/'
usa_counties_gdf = query_arcgis_feature_server(url)
usa_counties_gdf.to_file('usa_counties.shp')
'''
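'''
The docstring also mentions other output formats. As a minimal sketch (the file
names here are just placeholders), the same GeoDataFrame can be written out as
GeoJSON or GeoPackage via geopandas' to_file:
usa_counties_gdf.to_file('usa_counties.geojson', driver='GeoJSON')
usa_counties_gdf.to_file('usa_counties.gpkg', driver='GPKG')
'''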