Created
April 27, 2022 17:17
-
-
Save nwstephens/9e5e90601a889b8eb538a6ba9d8578f6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"id": "d625d29a-ceb3-4711-8b13-5365ad017a15", | |
"metadata": {}, | |
"source": [ | |
"# Spatial Join\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"id": "03d25a2f-2028-4544-af03-9cad328438df", | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import os\n", | |
"import time\n", | |
"import cupy\n", | |
"import cudf\n", | |
"import cuspatial\n", | |
"import numpy as np" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "00f7b5f8-8c42-4c13-a396-999167231b5d", | |
"metadata": {}, | |
"source": [ | |
"## Data\n", | |
"\n", | |
"* All `.cny` files can be downloaded from http://geoteci.engr.ccny.cuny.edu/nyctaxidata/\n", | |
"* The binary files are `int32` type in the unit of feet using projection epsg 2236 (for NYC and Long Island) each record has 4 fields (pick_up_lon, pick_up_lat,drop_off_lon, drop_off_lat). Only the first 2 fields are used the orginal 2009 yellow cab data can be accessed from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page.\n", | |
"* The taxi zone data in ESRI shapefile format can be accessed from https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page which is linked to https://s3.amazonaws.com/nyc-tlc/misc/taxi_zones.zip; the shapefile also uses EPSG 2263 projection.\n", | |
"* The NYC Community District Data (nycd) and 2000 Census Tract data (nyct2000) can be downloaded from https://www1.nyc.gov/site/planning/data-maps/open-data.page#district_political version 11a released in 2011 were used; both use EPSG 2263 projection." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"id": "2b0351d2-0376-43d5-b131-5fb042c64003", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"reading points file: data/200901.cny\n", | |
"reading points file: data/200902.cny\n", | |
"reading points file: data/200903.cny\n", | |
"reading points file: data/200904.cny\n", | |
"reading points file: data/200905.cny\n", | |
"reading points file: data/200906.cny\n", | |
"reading points file: data/200907.cny\n", | |
"reading points file: data/200908.cny\n", | |
"reading points file: data/200909.cny\n", | |
"reading points file: data/200910.cny\n", | |
"reading points file: data/200911.cny\n", | |
"95879703\n" | |
] | |
} | |
], | |
"source": [ | |
"cuspatial_data_path='data' \n", | |
"polygon_shapefile_name='taxi_zones.shp'\n", | |
"#polygon_shapefile_name='nycd_11a_av/nycd.shp'\n", | |
"#polygon_shapefile_name='nyct2000_11a_av/nyct2000.shp'\n", | |
"\n", | |
"def read_points(path):\n", | |
" print('reading points file:', path)\n", | |
" points = np.fromfile(path, dtype=np.int32)\n", | |
" points = cupy.asarray(points)\n", | |
" points = points.reshape((len(points)// 4, 4))\n", | |
" points = cudf.DataFrame(points)\n", | |
" return points\n", | |
"\n", | |
"#use 6 month data for RTX 2080 with 8GB memory \n", | |
"points_df = cudf.concat(\n", | |
" [read_points(os.path.join(cuspatial_data_path,\n", | |
" '2009{}.cny'.format('0{}'.format(i) if i < 10 else i)))\n", | |
" for i in range(1, 12)]).reset_index(drop=True)\n", | |
"points_df['x'] = points_df[0].astype(np.float32)\n", | |
"points_df['y'] = points_df[1].astype(np.float32)\n", | |
"print(len(points_df))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "e68b3969-0aaa-437f-a20f-acb406640333", | |
"metadata": {}, | |
"source": [ | |
"## Quadtree construction " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"id": "f4bf82c0-2d2d-4aa4-8353-933644581048", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"number of nodes: 560120\n", | |
"node count: 140049\n", | |
"leaf count: 420071\n", | |
"quadtree construction time : 61.79094314575195\n" | |
] | |
} | |
], | |
"source": [ | |
"ply_fpos, ply_rpos, ply_vertices = cuspatial.read_polygon_shapefile(\n", | |
" os.path.join(cuspatial_data_path , polygon_shapefile_name)) \n", | |
"x1,x2,y1,y2 =(ply_vertices['x'].min(), ply_vertices['x'].max(), \n", | |
" ply_vertices['y'].min(), ply_vertices['y'].max())\n", | |
" \n", | |
"num_levels = 15\n", | |
"min_size = 512\n", | |
"scale = max(abs(x2 - x1), abs(y2 - y1)) / ((1 << num_levels) - 2);\n", | |
"\n", | |
"start = time.time()\n", | |
"key_to_point,quadtree = cuspatial.quadtree_on_points(points_df['x'],points_df['y'],\n", | |
" x1,x2,y1,y2, scale,num_levels, min_size)\n", | |
"end = time.time()\n", | |
"\n", | |
"print('number of nodes:',len(quadtree))\n", | |
"print('node count:', (quadtree['is_quad'] == True).sum())\n", | |
"print('leaf count:', (quadtree['is_quad'] == False).sum())\n", | |
"print('quadtree construction time :', (end-start)*1000)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "00d23f3f-2e73-4376-ad17-5213cd9c8796", | |
"metadata": {}, | |
"source": [ | |
"## Spatial filtering" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"id": "d41dc6fb-62d4-4ba8-b392-56b550b4a172", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"263\n", | |
"number of polygon-quadrant pairs: 933665\n", | |
"spatial filtering time : 10.741949081420898\n" | |
] | |
} | |
], | |
"source": [ | |
"poly_bboxes = cuspatial.polygon_bounding_boxes(\n", | |
" ply_fpos, ply_rpos, ply_vertices['x'], ply_vertices['y'])\n", | |
"print(len(poly_bboxes)) \n", | |
"\n", | |
"start = time.time()\n", | |
"intersections = cuspatial.join_quadtree_and_bounding_boxes(\n", | |
" quadtree, poly_bboxes,x1,x2,y1,y2, scale, num_levels,\n", | |
")\n", | |
"end = time.time()\n", | |
"print('number of polygon-quadrant pairs:',len(intersections)) \n", | |
"print('spatial filtering time :', (end-start)*1000)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"id": "7633cba0-a4f9-4fc4-b0d3-fe60a43df8e2", | |
"metadata": {}, | |
"source": [ | |
"## Spatial refinement" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"id": "4259637f-d30c-4163-b53f-6ad6f355ac2c", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"95238407\n", | |
"spatial refinement time : 508.0451965332031\n" | |
] | |
} | |
], | |
"source": [ | |
"start = time.time()\n", | |
"polygons_and_points = cuspatial.quadtree_point_in_polygon(\n", | |
" intersections, quadtree,key_to_point,\n", | |
" points_df['x'],points_df['y'],\n", | |
" ply_fpos,ply_rpos,ply_vertices['x'],ply_vertices['y'])\n", | |
"end = time.time()\n", | |
"print(len(polygons_and_points)) \n", | |
"print('spatial refinement time :', (end-start)*1000)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3 (ipykernel)", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.8.13" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 5 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment