# code source: GIS stack exchange: https://gis.stackexchange.com/questions/266897/how-to-get-around-the-1000-objectids-limit-on-arcgis-server?newreg=e085133f1a9d475b99c094e1c497a2a5
import numpy as np
import pandas as pd
import geopandas as gpd
import urllib.parse
import requests
def query_arcgis_feature_server(url_feature_server=''):
    '''
    This function downloads all of the features available on a given ArcGIS
    feature server. The function is written to bypass the limitations imposed
    by the online service, such as only returning up to 1,000 or 2,000 features
    at a time.

    Parameters
    ----------
    url_feature_server : string
        String containing the URL of the service API you want to query. It should
        end in a forward slash and look something like this:
        'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/'
        EV Justice40 URL is:
        'https://services2.arcgis.com/spDVOHcj11Fk80s4/ArcGIS/rest/services/DAC6/FeatureServer/4/'

    Returns
    -------
    geodata_final : gpd.GeoDataFrame
        This is a GeoDataFrame that contains all of the features from the
        Feature Server. After calling this function, the `geodata_final` object
        can be used to store the data on disk in several different formats
        including, but not limited to, Shapefile (.shp), GeoJSON (.geojson),
        GeoPackage (.gpkg), or PostGIS.
        See https://geopandas.org/en/stable/docs/user_guide/io.html#writing-spatial-data
        for more details.
    '''
    if url_feature_server == '':
        geodata_final = gpd.GeoDataFrame()
        return geodata_final

    # Fixing last character in case the URL provided didn't end in a
    # forward slash
    if url_feature_server[-1] != '/':
        url_feature_server = url_feature_server + '/'

    # Getting the layer definitions. This contains important info such as the
    # name of the column used as feature_ids/object_ids, among other things.
    layer_def = requests.get(url_feature_server + '?f=pjson').json()

    # The `objectIdField` is the column name used for the
    # feature_ids/object_ids
    fid_colname = layer_def['objectIdField']

    # The `maxRecordCount` tells us the maximum number of records this REST
    # API service can return at once. The code below is written such that we
    # perform multiple calls to the API, each one being short enough never to
    # go beyond this limit.
    record_count_max = layer_def['maxRecordCount']
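    # For illustration only (hypothetical values): the layer definition JSON
    # typically carries entries along the lines of
    #     "objectIdField": "OBJECTID",
    #     "maxRecordCount": 2000,
    # which are the two pieces of information used above.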
    # Part of the URL that specifically requests only the object IDs
    url_query_get_ids = (f'query?f=geojson&returnIdsOnly=true'
                         f'&where={fid_colname}+is+not+null')
    url_comb = url_feature_server + url_query_get_ids

    # Getting all the object IDs
    service_request = requests.get(url_comb)
    all_objectids = np.sort(service_request.json()['properties']['objectIds'])
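    # For illustration only: the returnIdsOnly response is expected to look
    # roughly like {"properties": {"objectIds": [1, 2, 3, ...]}} (hypothetical
    # IDs), which is why the IDs are read from ['properties']['objectIds'] above.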
    # This variable will store all the parts of the multiple queries. These
    # parts will, at the end, be concatenated into one large GeoDataFrame.
    geodata_parts = []

    # This part of the query is fixed and never actually changes
    url_query_fixed = 'query?f=geojson&outFields=*&where='

    # Identifying the largest query size allowed per request. This will dictate
    # how many queries will need to be made. We start the search at the max
    # record count, but that sometimes generates errors - the query might time
    # out because it's too big. If the test query times out, we shrink the
    # query size until the test query goes through without generating a
    # time-out error.
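    # As a worked example (hypothetical numbers): with a maxRecordCount of
    # 2,000, a timed-out test query would shrink the block size to 1,001,
    # then 501, then 251, and so on, following the int(block_size/2)+1 rule
    # used below.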
    block_size = min(record_count_max, len(all_objectids))
    worked = False
    while not worked:
        # Moving the "cursors" to their appropriate locations
        id_start = all_objectids[0]
        id_end = all_objectids[block_size-1]
        readable_query_string = (f'{fid_colname}>={id_start} '
                                 f'and {fid_colname}<={id_end}')
        url_query_variable = urllib.parse.quote(readable_query_string)
        url_comb = url_feature_server + url_query_fixed + url_query_variable
        url_get = requests.get(url_comb)
        if 'error' in url_get.json():
            block_size = int(block_size/2)+1
        else:
            geodata_part = gpd.read_file(url_get.text)
            geodata_parts.append(geodata_part.copy())
            worked = True

    # Performing the actual query to the API multiple times. This skips the
    # first few rows/features in the data because those rows were already
    # captured in the query performed in the code chunk above.
    for i in range(block_size, len(all_objectids), block_size):
        # Moving the "cursors" to their appropriate locations and finding the
        # limits of each block
        sub_list = all_objectids[i:i + block_size]
        id_start = sub_list[0]
        id_end = sub_list[-1]
        readable_query_string = (f'{fid_colname}>={id_start} '
                                 f'and {fid_colname}<={id_end}')

        # Encoding from readable text to URL
        url_query_variable = urllib.parse.quote(readable_query_string)

        # Constructing the full request URL
        url_comb = url_feature_server + url_query_fixed + url_query_variable

        # Actually performing the query and storing its results in a
        # GeoDataFrame
        geodata_part = gpd.read_file(url_comb, driver='GeoJSON')

        # Appending the result to `geodata_parts`
        if geodata_part.shape[0] > 0:
            geodata_parts.append(geodata_part)

    # Concatenating all of the query parts into one large GeoDataFrame
    geodata_final = (pd.concat(geodata_parts,
                               ignore_index=True)
                     .sort_values(by=fid_colname)
                     .reset_index(drop=True))
    # Checking if any object ID is missing
    ids_queried = set(geodata_final[fid_colname])
    for this_id in all_objectids:
        if this_id not in ids_queried:
            print('WARNING! The following ObjectID is missing from the final '
                  f'GeoDataFrame: ObjectID={this_id}')
    # Checking if any object ID is included twice
    geodata_temp = geodata_final[[fid_colname]].copy()
    geodata_temp['temp'] = 1
    geodata_temp = (geodata_temp
                    .groupby(fid_colname)
                    .agg({'temp': 'sum'})
                    .reset_index())
    geodata_temp = geodata_temp.loc[geodata_temp['temp'] > 1].copy()
    for i, this_id in enumerate(geodata_temp[fid_colname].values):
        n_times = geodata_temp['temp'].values[i]
        print('WARNING! The following ObjectID is included multiple times in '
              f'the final GeoDataFrame: ObjectID={this_id}\tOccurrences={n_times}')

    return geodata_final
# To download all of the data on a feature server, just call the
# query_arcgis_feature_server function defined above. It takes care of a couple
# of pitfalls, such as the maximum feature limit and requests that are so large
# they time out.
'''
Here's an example of how to download the data and store it into a shapefile on your disk:
url = 'https://services.arcgis.com/P3ePLMYs2RVChkJx/arcgis/rest/services/USA_Counties/FeatureServer/0/'
usa_counties_gdf = query_arcgis_feature_server(url)
usa_counties_gdf.to_file('usa_counties.shp')
'''
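'''
The docstring also mentions other output formats. As a minimal sketch (the file
names here are just placeholders), the same GeoDataFrame can be written out as
GeoJSON or GeoPackage via geopandas' to_file:
usa_counties_gdf.to_file('usa_counties.geojson', driver='GeoJSON')
usa_counties_gdf.to_file('usa_counties.gpkg', driver='GPKG')
'''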