I used the Overture places data to run this - it's ~4.6GB and 55M rows
$ aws --no-sign-request s3 sync s3://overturemaps-us-west-2/release/2024-11-13.0/theme=places/type=place/ .
$ duckdb
D .read kdtree.sql
| -- Run this at the end to generate a places file with the partition_id added | |
| LOAD spatial; | |
| COPY ( | |
| SELECT | |
| places.*, | |
| partitions.partition_id | |
| FROM read_parquet('part-0000*.parquet') places | |
| JOIN read_parquet('partitions.parquet') partitions | |
| ON places.id = partitions.id | |
| WHERE | |
| partitions.partition_id < '0001100110' | |
| ORDER BY partitions.partition_id | |
| ) | |
| TO 'places-with-partition-id-00000.parquet' (FORMAT PARQUET); |
| LOAD spatial; | |
| PRAGMA temp_directory='tmp'; | |
| -- Compute an approximately balanced quadtree | |
| COPY( | |
| WITH RECURSIVE kdtree(iteration, id, x, y, partition_id) AS ( | |
| SELECT | |
| 0 AS iteration, | |
| id, | |
| ST_X(geometry) AS x, | |
| ST_Y(geometry) AS y, | |
| '0' AS partition_id | |
| FROM read_parquet('part-*.parquet') | |
| UNION ALL | |
| SELECT | |
| iteration + 1 AS iteration, | |
| id, | |
| x, | |
| y, | |
| IF( | |
| IF(MOD(iteration, 2) = 0, x, y) < APPROX_QUANTILE( | |
| IF(MOD(iteration, 2) = 0, x, y), | |
| 0.5 | |
| ) OVER ( | |
| PARTITION BY partition_id | |
| ), | |
| partition_id || '0', | |
| partition_id || '1' | |
| ) AS partition_id | |
| FROM kdtree | |
| WHERE | |
| iteration < 9 | |
| ) | |
| SELECT | |
| id, | |
| x, | |
| y, | |
| partition_id | |
| FROM kdtree | |
| WHERE | |
| iteration = 9 | |
| ) | |
| TO 'partitions.parquet' (FORMAT PARQUET); | |
| -- Write a GeoJSON file with geometries being the bounding boxes of each | |
| -- partition group from the previous part. Include the row count in each group | |
| COPY( | |
| SELECT | |
| partition_id, | |
| n, | |
| ST_MAKEENVELOPE(xmin, ymin, xmax, ymax) | |
| FROM ( | |
| SELECT | |
| partition_id, | |
| COUNT(1) AS n, | |
| MIN(x) as xmin, | |
| MIN(y) as ymin, | |
| MAX(x) as xmax, | |
| MAX(y) as ymax | |
| FROM read_parquet('partitions.parquet') | |
| GROUP BY partition_id | |
| ) | |
| ) | |
| TO 'partition_boxes.geojson' | |
| WITH (FORMAT GDAL, DRIVER 'GeoJSON'); |