I used the Overture places data to run this - it's ~4.6GB and 55M rows
$ aws --no-sign-request s3 sync s3://overturemaps-us-west-2/release/2024-11-13.0/theme=places/type=place/ .
$ duckdb
D .read kdtree.sql
-- Run this at the end to generate a places file with the partition_id added | |
LOAD spatial; | |
COPY ( | |
SELECT | |
places.*, | |
partitions.partition_id | |
FROM read_parquet('part-0000*.parquet') places | |
JOIN read_parquet('partitions.parquet') partitions | |
ON places.id = partitions.id | |
WHERE | |
partitions.partition_id < '0001100110' | |
ORDER BY partitions.partition_id | |
) | |
TO 'places-with-partition-id-00000.parquet' (FORMAT PARQUET); |
LOAD spatial; | |
PRAGMA temp_directory='tmp'; | |
-- Compute an approximately balanced quadtree | |
COPY( | |
WITH RECURSIVE kdtree(iteration, id, x, y, partition_id) AS ( | |
SELECT | |
0 AS iteration, | |
id, | |
ST_X(geometry) AS x, | |
ST_Y(geometry) AS y, | |
'0' AS partition_id | |
FROM read_parquet('part-*.parquet') | |
UNION ALL | |
SELECT | |
iteration + 1 AS iteration, | |
id, | |
x, | |
y, | |
IF( | |
IF(MOD(iteration, 2) = 0, x, y) < APPROX_QUANTILE( | |
IF(MOD(iteration, 2) = 0, x, y), | |
0.5 | |
) OVER ( | |
PARTITION BY partition_id | |
), | |
partition_id || '0', | |
partition_id || '1' | |
) AS partition_id | |
FROM kdtree | |
WHERE | |
iteration < 9 | |
) | |
SELECT | |
id, | |
x, | |
y, | |
partition_id | |
FROM kdtree | |
WHERE | |
iteration = 9 | |
) | |
TO 'partitions.parquet' (FORMAT PARQUET); | |
-- Write a GeoJSON file with geometries being the bounding boxes of each | |
-- partition group from the previous part. Include the row count in each group | |
COPY( | |
SELECT | |
partition_id, | |
n, | |
ST_MAKEENVELOPE(xmin, ymin, xmax, ymax) | |
FROM ( | |
SELECT | |
partition_id, | |
COUNT(1) AS n, | |
MIN(x) as xmin, | |
MIN(y) as ymin, | |
MAX(x) as xmax, | |
MAX(y) as ymax | |
FROM read_parquet('partitions.parquet') | |
GROUP BY partition_id | |
) | |
) | |
TO 'partition_boxes.geojson' | |
WITH (FORMAT GDAL, DRIVER 'GeoJSON'); |