Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kuanb/75282d1ebe7f7996a132950ed6f9d87c to your computer and use it in GitHub Desktop.
Save kuanb/75282d1ebe7f7996a132950ed6f9d87c to your computer and use it in GitHub Desktop.
Run this script via: time python test_dask.py {desired_row_count_of_dfs}
import dask.dataframe as dd
import geopandas as gpd
import logging
import math
import numpy as np
import pandas as pd
import sys
from dask.distributed import Client
from shapely.wkt import loads as wkt_loads
from shapely.wkb import loads as wkb_loads
# Rows-per-partition ceiling used to derive the partition count below.
# Open question from the author: what's a good rule for setting partition
# sizes (e.g. how many per worker, what dataframe size)?
max_df_height = 50000

# Connect to the remote dask scheduler before doing anything else.
client = Client('dask-test.urbanfootprint.net:8786')
# client.restart()
print('Working with Client:', client)

# The single command-line argument is the row count of each input dataframe.
desired_len = int(sys.argv[1])
print('\nDataframes set to be {} rows tall.'.format(desired_len))

# Both frames are joined on a constant key further down, so the joined
# result holds desired_len * desired_len rows; size the partitions for that.
cross_join_rows = desired_len * desired_len
nparts = math.ceil(float(cross_join_rows) / max_df_height)
print('\nUsing {} partitions.'.format(nparts))
# Parse the two fixed geometries up front; every row of a frame shares the
# same shape, stored as WKB bytes.
geom_left = wkt_loads('MULTIPOLYGON(((-89.382643392 43.0821049529998,-89.382037392 43.0826259539998,-89.3817993919999 43.0827739539998,-89.381381391 43.0832049529998,-89.3804783909999 43.0842209539998,-89.380055391 43.0846309539998,-89.3797773919999 43.0849769539998,-89.379714392 43.0851059539998,-89.379353392 43.0855149539999,-89.378870391 43.0860139539998,-89.3785403909999 43.0862379539999,-89.378378391 43.0861379539998,-89.377960391 43.0857929539998,-89.3783823909999 43.0855149539999,-89.378855392 43.0852019539999,-89.3789223909999 43.0851529539999,-89.379481391 43.0847459539998,-89.379782392 43.0845619539998,-89.379547391 43.0844029539998,-89.379303391 43.0842239539998,-89.379167391 43.0840939529998,-89.379033391 43.0840099539998,-89.378867392 43.0838709539999,-89.37882539 43.0838589529998,-89.379005391 43.0837359539999,-89.379765391 43.0831919539998,-89.380642391 43.0825829529998,-89.381193391 43.0821939539998,-89.381329391 43.0820919529998,-89.381794392 43.0817619529999,-89.3821393919999 43.0815249529998,-89.382395392 43.0813399529998,-89.383002391 43.0818109529998,-89.382643392 43.0821049529998)))')
geom_right = wkt_loads('MULTIPOLYGON(((-89.3842373909999 43.0714499509998,-89.384248392 43.0717639509999,-89.384249392 43.0719049509999,-89.384133391 43.0719079519998,-89.3840573909999 43.0719139509998,-89.383994391 43.0719409509998,-89.383842391 43.0720389509998,-89.383710391 43.0721349519998,-89.3833623899999 43.0723739509998,-89.3829343909999 43.0720539509998,-89.38260839 43.0718029509998,-89.38249539 43.0717259519998,-89.38268039 43.0715959509998,-89.383641391 43.0709269509998,-89.384293391 43.0704959509998,-89.3842373909999 43.0714499509998)))')

# Build each arbitrary pandas frame in one expression: a sequential id, the
# repeated WKB geometry, and a constant tmp_key shared by both frames so
# that merging on it pairs every left row with every right row.
left = (
    pd.DataFrame({'id': np.arange(desired_len)})
    .assign(geometry=[geom_left.wkb] * desired_len)
    .assign(tmp_key=0)
)
right = (
    pd.DataFrame({'id': np.arange(desired_len)})
    .assign(geometry=[geom_right.wkb] * desired_len)
    .assign(tmp_key=0)
)
# Convert the left pandas frame into a dask dataframe split into `nparts`
# partitions (a copy is handed over, so `left` itself is not shared).
ddf = dd.from_pandas(left.copy(), name='ddf', npartitions=nparts)

# Merge the dask dataframe with the in-memory right frame on the constant
# tmp_key (pairing every left row with every right row), then drop the
# join column. Overlapping columns get the '_from' / '_to' suffixes.
joined = (
    dd.merge(
        ddf,
        right,
        on='tmp_key',
        suffixes=('_from', '_to'),
        npartitions=nparts,
    )
    .drop('tmp_key', axis=1)  # <- is it more performant to skip this step?
)

# NOTE: the original script called `joined.set_index('id_from')` here
# ("will this speed things up?") but discarded the return value. dask
# dataframes are not modified in place -- set_index returns a new dataframe --
# so the call could not influence the groupby that follows and only risked
# extra work while the index was being prepared. It has been removed.
# To actually experiment with an indexed frame, bind the result to a new
# name and use that downstream, e.g.:
#     indexed = joined.set_index('id_from')
def calc(grouped_df):
    """Return distances from a group's 'from' geometry to each 'to' geometry.

    The 'from' geometry is decoded from the WKB in the first row's
    ``geometry_from`` column; every value in ``geometry_to`` is decoded as
    well and measured against it. The result is a numpy array with one
    distance per row of ``grouped_df``, in row order.
    """
    origin = wkb_loads(grouped_df.iloc[0].geometry_from)
    targets = gpd.GeoSeries(
        [wkb_loads(raw) for raw in grouped_df.geometry_to.values]
    )
    per_row_distance = targets.distance(origin)
    return np.array(per_row_distance.values)
# Lazily describe the work: one `calc` call per distinct id_from. Each call
# returns a numpy array, so the resulting series holds Python objects; the
# meta carries an explicit object dtype instead of relying on the default
# dtype of an empty Series.
distances = (
    joined
    .groupby('id_from')
    .apply(calc, meta=pd.Series(dtype=object))
)

print('\nRunning compute...')
res = distances.compute()
print('Finished compute. Now trying to determine size...')

# sys.getsizeof reports bytes; divide by 1024 * 1024 so the figure matches
# the "MB" label (the previous divisor of 1024 produced kibibytes).
size = sys.getsizeof(res) / float(1024 * 1024)
print('\nSize is {} MB.'.format(size))

print('\nNow transforming to dict...')
dictified = res.to_dict()
print('Finished conversion.')

# Show a single entry as a sanity check, without materialising a list of
# every key just to read the first one.
first_key = next(iter(dictified))
print('\nDone! Example from key {}: '.format(first_key), dictified[first_key])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment