Skip to content

Instantly share code, notes, and snippets.

@eads
Last active May 31, 2017 11:35
Show Gist options
  • Save eads/cab99b13aad9bd18255c927a809c0d00 to your computer and use it in GitHub Desktop.
Save eads/cab99b13aad9bd18255c927a809c0d00 to your computer and use it in GitHub Desktop.
# NPR Jupyter utilities
import cachecontrol
import requests
import os
from cachecontrol import CacheControlAdapter
from cachecontrol.caches import FileCache
from cachecontrol.heuristics import ExpiresAfter
# A single adapter carries BOTH the on-disk cache and the freshness heuristic.
# Mounting an adapter replaces whatever was mounted for that URL prefix, so the
# FileCache has to live on this adapter; attaching it to a different adapter
# that is later replaced would leave responses cached in memory only.
adapter = CacheControlAdapter(
    cache=FileCache('.cache'),
    heuristic=ExpiresAfter(days=1),
)

# Shared session used by every helper in this module.
sess = requests.Session()
sess.mount('https://', adapter)
sess.mount('http://', adapter)
def get(url, outputfile=None, *args, **kwargs):
    """
    Get a URL through the caching session and save fresh downloads to disk.

    When the response did not come from the cache, its body is written to
    ``data/source/<filename>``. When it did come from the cache, nothing is
    written and a "Skipping" message is printed instead.

    Args:
        url: Address to request.
        outputfile: Optional name of the file to write under ``data/source``.
            When omitted, the last non-empty ``/``-separated piece of the
            response URL is used (this includes any query string).
        *args: Extra positional arguments forwarded to ``sess.get``.
        **kwargs: Extra keyword arguments forwarded to ``sess.get``.

    Returns:
        The response object returned by ``sess.get``.
    """
    r = sess.get(url, *args, **kwargs)

    if outputfile:
        filename = outputfile
    else:
        # A URL ending in "/" has an empty last piece; fall back one level.
        urlparts = r.url.split('/')
        filename = urlparts[-1] or urlparts[-2]

    if r.from_cache:
        print('Skipping {0} (cache has not expired).'.format(filename))
        return r

    destination = 'data/source/{0}'.format(filename)
    # Create the target directory on demand; using the destination's parent
    # also covers an ``outputfile`` that names a subdirectory.
    directory = os.path.dirname(destination)
    if not os.path.exists(directory):
        os.makedirs(directory)

    print('Got {0}'.format(r.url))
    print('Writing {0}'.format(filename))
    with open(destination, 'wb') as f:
        f.write(r.content)
    return r
def get_socrata_data(url, *args, **kwargs):
    """
    Download every row from a paged Socrata endpoint.

    Requests pages of 50000 rows through ``get``, advancing ``$offset`` by
    the page size each time, until a page comes back with fewer rows than
    the page size.

    Args:
        url: Endpoint address to query.
        *args: Extra positional arguments forwarded to ``get``.
        **kwargs: Extra keyword arguments forwarded to ``get``. A ``params``
            entry (a mapping or an iterable of key/value pairs) is merged
            into each request; its ``$limit`` and ``$offset`` entries are
            overwritten because paging is controlled here.

    Returns:
        A list holding the decoded JSON items of all pages, in request order.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    page_size = 50000

    # Start from the caller's query parameters (filters, ordering, ...) rather
    # than discarding them, then take ownership of the two paging keys.
    # NOTE(review): Socrata's paging guidance suggests supplying an ``$order``
    # clause for stable page boundaries -- confirm whether callers need one.
    params = dict(kwargs.pop('params', None) or {})
    params['$limit'] = page_size
    params['$offset'] = 0
    kwargs['params'] = params

    results = []
    while True:
        response = get(url, *args, **kwargs)
        # Fail loudly on an HTTP error: an error payload that decodes to a
        # JSON object would otherwise have its keys appended as if they were
        # rows, and the short length would end the loop "successfully".
        response.raise_for_status()
        page = response.json()
        results.extend(page)

        # A short page means there is nothing left to fetch.
        if len(page) < page_size:
            return results
        params['$offset'] += page_size
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment