melanieimfeld/getdata.py

## getdata.py
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon, shape
from sodapy import Socrata
import os

#Run this file to get the data from open data NYC
#Below are the details, such as identifiers, for the datasets we want to download. Change limit to get a smaller subset


querylist = [
{'identifier':'myk6-g6eq','limit': 10000, 'queries' : [["The_geom"], [""]], 'tolerance': 0.0001, 'geom' : 'The_geom'}, #floodplains
{'identifier':'jp9i-3b7y','limit': 100, 'queries' : [["The_geom,boro_cd"],[" "]], 'tolerance': 0, 'geom' : 'The_geom'}, #community districts
{'identifier':'xi7c-iiu2','limit': 100, 'queries' : [["*"],[" "]], 'tolerance': None, 'geom' : None}, #population
{'identifier':'qjdb-bxqz','limit': 1000000, 'queries' : [["CNSTRCT_YR,the_geom"],["FEAT_CODE==2100"]], 'tolerance': 0.0001, 'geom' : 'the_geom'}, #buildings
]

outdir = "data"
# Create data folder if it does no exist
if not os.path.exists(outdir):
    os.makedirs(outdir)


def getBasedata(identifier, limit, queries, tolerance, geom):
    domain = 'data.cityofnewyork.us'
    client = Socrata(domain, None)
    results = client.get(identifier,limit=limit, select=queries[0], where=queries[1])
    results = pd.DataFrame.from_dict(results)

    if geom != None: #spatial files
    	gdf = createGdf(results,tolerance, geom)
    	gdf.to_file("{}/{}.geojson".format(outdir, identifier),driver='GeoJSON')
    else: #non spatial
    	results.to_csv("{}/{}.csv".format(outdir, identifier))
    print("download finished")


def createGdf(json,tolerance,name):
    geometry = [shape(x).simplify(tolerance, preserve_topology=True) for x in json[name]]
    json = json.drop(name, axis=1)
    gdf = gpd.GeoDataFrame(json, crs='EPSG:4326', geometry=geometry)
    gdf = gdf.to_crs(epsg=2263) #reproject to NAD83
    return gdf


for i in querylist:
	getBasedata(i["identifier"],i["limit"], i["queries"], i["tolerance"], i["geom"])

## main.py
import geopandas as gpd
import pandas as pd
from shapely.geometry import Polygon, shape

#Functions for the basic analysis. Only works if data was downloaded with getdata.py

floodplains = gpd.read_file("data/myk6-g6eq.geojson")
cd = gpd.read_file("data/jp9i-3b7y.geojson")
buildings = gpd.read_file("data/qjdb-bxqz.geojson")
pop = pd.read_csv("data/xi7c-iiu2.csv", index_col = 0)

def checkFloodplain(cd,fp):
    cd["boro_cd"] = cd["boro_cd"].astype(int)
    intersection = gpd.overlay(cd, fp, how='intersection')
    cd_subset = intersection.dissolve(by = 'boro_cd')
    merged = pd.merge(cd["boro_cd"], cd_subset, how = "left", on = "boro_cd")
    merged["contains_fldplain"] = merged.geometry.apply(lambda x: "No" if x==None else "Yes")
    merged[["boro_cd", "contains_fldplain"]].to_csv('data/output.csv')
    print("Files saved")

checkFloodplain(cd,floodplains)

#Functions for the extended analysis. It takes a bit of time, uncomment to run

def checkFloodplain2(cd,fp):
    cd["boro_cd"] = cd["boro_cd"].astype(int)
    intersection = gpd.overlay(cd, fp, how='intersection')
    cd_subset = intersection.dissolve(by = 'boro_cd', as_index = False)

    #get percentage data
    cd_subset["fld_area_sqkm"] = cd_subset.geometry.area/10764000
    cd["tot_area_sqkm"] = cd.geometry.area/10764000
    merged = pd.merge(cd.loc[:,cd.columns != "geometry"], cd_subset, how = "left", on = "boro_cd")
    merged["contains_fldplain"] = merged.geometry.apply(lambda x: "No" if x==None else "Yes")
    merged["perc_fldplain"] = (merged["fld_area_sqkm"] / merged["tot_area_sqkm"] * 100).fillna(0).round(1)
    print("percentage data added, wait...")

    #get building data
    merged = merged.merge(selectBuildings(cd_subset,buildings), on = "boro_cd", how = "left")
    merged["building_density"] = merged.index_right / merged.fld_area_sqkm
    print("building data added, wait...")

    #get population data
    merged = merged.merge(preparePopdata(pop), how="left", on="boro_cd")
    print("pop growth rate data added")

    export(merged)


def preparePopdata(pop):
	pop["boro_cd"] = pop.apply(lambda x: boroCD(x. borough, x.cd_number), axis = 1)
	pop["pop_growth_rate"] = (pop._2010_population -  pop._1970_population) / pop._1970_population * 100
	pop = pop[["boro_cd", "pop_growth_rate"]]
	return pop


def boroCD(b, no):
	if b == "Bronx":
	    return 200 + no
	elif b == "Brooklyn":
	    return 300 + no
	elif b == "Queens":
	    return 400 + no
	elif b == "Manhattan":
	    return 100 + no
	elif b == "Staten Island":
	    return 500 + no
	else:
	    return 0

def selectBuildings(subset, buildings):
	buildings.geometry = buildings.geometry.centroid
	buildings.CNSTRCT_YR = pd.to_numeric(buildings.CNSTRCT_YR).fillna(0).astype(int)
	b_intersection = gpd.sjoin(buildings, subset, op="within").dissolve(by = "boro_cd", aggfunc = {"CNSTRCT_YR":"median", "index_right": "count"})
	return b_intersection[["CNSTRCT_YR", "index_right"]]

def export(merged):
	merged.to_file("data/output.gpkg", layer='floodplains', driver="GPKG")
	print("Files saved")

#checkFloodplain2(cd,floodplains)
	import geopandas as gpd
	import pandas as pd
	from shapely.geometry import Polygon, shape
	from sodapy import Socrata
	import os

	#Run this file to get the data from open data NYC
	#Below are the details, such as identifiers, for the datasets we want to download. Change limit to get a smaller subset


	querylist = [
	{'identifier':'myk6-g6eq','limit': 10000, 'queries' : [["The_geom"], [""]], 'tolerance': 0.0001, 'geom' : 'The_geom'}, #floodplains
	{'identifier':'jp9i-3b7y','limit': 100, 'queries' : [["The_geom,boro_cd"],[" "]], 'tolerance': 0, 'geom' : 'The_geom'}, #community districts
	{'identifier':'xi7c-iiu2','limit': 100, 'queries' : [["*"],[" "]], 'tolerance': None, 'geom' : None}, #population
	{'identifier':'qjdb-bxqz','limit': 1000000, 'queries' : [["CNSTRCT_YR,the_geom"],["FEAT_CODE==2100"]], 'tolerance': 0.0001, 'geom' : 'the_geom'}, #buildings
	]

	outdir = "data"
	# Create data folder if it does no exist
	if not os.path.exists(outdir):
	os.makedirs(outdir)


	def getBasedata(identifier, limit, queries, tolerance, geom):
	domain = 'data.cityofnewyork.us'
	client = Socrata(domain, None)
	results = client.get(identifier,limit=limit, select=queries[0], where=queries[1])
	results = pd.DataFrame.from_dict(results)

	if geom != None: #spatial files
	gdf = createGdf(results,tolerance, geom)
	gdf.to_file("{}/{}.geojson".format(outdir, identifier),driver='GeoJSON')
	else: #non spatial
	results.to_csv("{}/{}.csv".format(outdir, identifier))
	print("download finished")


	def createGdf(json,tolerance,name):
	geometry = [shape(x).simplify(tolerance, preserve_topology=True) for x in json[name]]
	json = json.drop(name, axis=1)
	gdf = gpd.GeoDataFrame(json, crs='EPSG:4326', geometry=geometry)
	gdf = gdf.to_crs(epsg=2263) #reproject to NAD83
	return gdf


	for i in querylist:
	getBasedata(i["identifier"],i["limit"], i["queries"], i["tolerance"], i["geom"])