Last active
March 12, 2018 17:30
-
-
Save aschleg/54bf7ed55c2383f3ba1f338b8116a77b to your computer and use it in GitHub Desktop.
Simple script for extracting Socrata Open Data Access (SODA) datasets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Simple script for extracting Socrata Open Data Access (SODA) datasets. Compatible with 3+, though one can easily make it 2.7 | |
compatible by changing the `from urllib.error import HTTPError` import to `from urllib2 import HTTPError` | |
Parameters | |
---------- | |
endpoint : string | |
SODA API endpoint of the dataset. | |
count : int, default 1000 | |
The number of records to return in each paged result. | |
offset : int, default 0 | |
Offset the results returned. For example, if offset = 1000, the script will extract all records after the 1,000th record. | |
return_df : bool, default True | |
Convert the returned SODA API dataset into a pandas DataFrame from JSON. | |
""" | |
import requests | |
import pandas as pd | |
from urllib.error import HTTPError # from urllib2 import HTTPError | |
def get_soda_api_data(endpoint, count=1000, offset=0, return_df=True):
    """
    Download a Socrata Open Data API (SODA) dataset one page at a time.

    Pages of ``count`` records are requested, starting at ``offset``, until
    the API returns an empty page or answers with HTTP 404.

    Parameters
    ----------
    endpoint : string
        SODA API endpoint of the dataset.
    count : int, default 1000
        The number of records to request in each page (sent as ``$limit``).
    offset : int, default 0
        Number of leading records to skip (sent as ``$offset``). For example,
        with offset = 1000 only the records after the 1,000th are extracted.
    return_df : bool, default True
        Convert the downloaded JSON pages into a single pandas DataFrame.

    Returns
    -------
    pandas.DataFrame or list
        When ``return_df`` is true, one DataFrame holding every page (empty
        if nothing was downloaded). The row index restarts at 0 for each
        page; call ``reset_index(drop=True)`` if a unique index is needed.
        Otherwise a list with one decoded JSON payload per page.

    Raises
    ------
    requests.exceptions.HTTPError
        If the API answers with an error status other than 404.
    """
    params = {'$limit': count, '$offset': offset}
    pages = []

    while True:
        response = requests.get(endpoint, params=params)

        # A 404 ends the download with whatever was collected so far.
        # status_code is an int, so it must be compared against 404, not '404'.
        if response.status_code == 404:
            break

        # requests does not raise on error statuses by itself. Without this
        # call an error body would be stored as if it were data and the loop
        # would keep requesting pages forever.
        response.raise_for_status()

        page = response.json()
        if not page:
            # An empty page means the previous one was the last.
            break

        pages.append(page)
        params['$offset'] += count

    if not return_df:
        return pages

    if not pages:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every call; normalize each page once and concatenate once.
    return pd.concat([pd.json_normalize(page) for page in pages])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment