Skip to content

Instantly share code, notes, and snippets.

@jasonlally
Last active December 18, 2015 22:08
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jasonlally/5852216 to your computer and use it in GitHub Desktop.
Save jasonlally/5852216 to your computer and use it in GitHub Desktop.
Loops through all the city data catalogs using Socrata and dumps them to a structured JSON document that can be read by a D3 tree map.
#Connects to the Socrata search API and loads data describing the tabular datasets in the catalog for use by D3 tree map
#This version connects to a list of existing Socrata instances and loops through the ones categorized as city
#Use: python dataportalapi.py > portaldata.json
import requests, json, math, re
def check_categories(d,category):
for i in range(len(d)):
if d[i]['name'] == category: return i
return -1
#TODO: load via standard csv file with Name, URL, Scale, and Catalog Type as fields
c = requests.get("https://opendata.socrata.com/resource/6wk3-4ija.json?$where=type='city'")
cities = c.json()
final = {"name" : "Socrata Data Portals", "children": []}
for city in cities:
#sURL = 'http://nycopendata.socrata.com'
sURL = city['open_data_site_url']['url'].strip('/')
out = []
page = 1
records = 0
total = 2
rwithdata = 0
while records < total:
payload = {'limit' : 100, 'page' : page, 'limitTo' : 'TABLES'}
r = requests.get(sURL + '/api/search/views.json', params=payload)
responses = r.json()
total = responses['count']
#had to add this check as nycopendata is somehow returning fewer results than it indicated in count
if 'results' in responses:
for response in responses['results']:
view = response['view']
records += 1
if len(view['columns']) != 0 and 'cachedContents' in view['columns'][0] and 'flags' in view:
#print view['flags']
rwithdata += 1
name = view['name']
vid = view['id']
views = view['viewCount']
size = view['columns'][0]['cachedContents']['non_null']
if size == 0:
size = 2 #probably should just skip these altogether, for now making them a tiny dataset so LOG(0) doesn't occur
logsize = math.log(size)
if 'category' in view:
category = view['category']
else:
category = "No Category"
if 'tags' in view:
for tag in view['tags']:
#tags aren't used in the json file yet, these could probably be used to do alternate visualizations or in a companion list, this is just a placeholder for now
foo = tag
index = check_categories(out,category)
url = sURL + '/d/' + vid
if index == -1:
out.append({"name": category, "children": [ {"name": name, "value": logsize, "url": url, "size": logsize } ] })
else:
out[index]["children"].append({"name": name, "value": logsize, "url": url, "size": logsize })
else:
records = total
page += 1
portaldata = {"name" : city['customer_name'] + " Data Portal", "count" : rwithdata, "children" : out}
final["children"].append(portaldata)
print json.dumps(final)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment