Skip to content

Instantly share code, notes, and snippets.

@jwass

jwass/README.md Secret

Last active December 6, 2024 16:59
Show Gist options
  • Save jwass/8e9b6c16902a05ae66b9688f1a5bb4ff to your computer and use it in GitHub Desktop.
Save jwass/8e9b6c16902a05ae66b9688f1a5bb4ff to your computer and use it in GitHub Desktop.
Compute a balanced kdtree

I used the Overture places data to run this - it's ~4.6GB and 55M rows

$ aws --no-sign-request s3 sync s3://overturemaps-us-west-2/release/2024-11-13.0/theme=places/type=place/ .
$ duckdb
D .read kdtree.sql
-- Run this at the end to generate a places file with the partition_id added
LOAD spatial;
COPY (
SELECT
places.*,
partitions.partition_id
FROM read_parquet('part-0000*.parquet') places
JOIN read_parquet('partitions.parquet') partitions
ON places.id = partitions.id
WHERE
partitions.partition_id < '0001100110'
ORDER BY partitions.partition_id
)
TO 'places-with-partition-id-00000.parquet' (FORMAT PARQUET);
LOAD spatial;
PRAGMA temp_directory='tmp';
-- Compute an approximately balanced quadtree
COPY(
WITH RECURSIVE kdtree(iteration, id, x, y, partition_id) AS (
SELECT
0 AS iteration,
id,
ST_X(geometry) AS x,
ST_Y(geometry) AS y,
'0' AS partition_id
FROM read_parquet('part-*.parquet')
UNION ALL
SELECT
iteration + 1 AS iteration,
id,
x,
y,
IF(
IF(MOD(iteration, 2) = 0, x, y) < APPROX_QUANTILE(
IF(MOD(iteration, 2) = 0, x, y),
0.5
) OVER (
PARTITION BY partition_id
),
partition_id || '0',
partition_id || '1'
) AS partition_id
FROM kdtree
WHERE
iteration < 9
)
SELECT
id,
x,
y,
partition_id
FROM kdtree
WHERE
iteration = 9
)
TO 'partitions.parquet' (FORMAT PARQUET);
-- Write a GeoJSON file with geometries being the bounding boxes of each
-- partition group from the previous part. Include the row count in each group
COPY(
SELECT
partition_id,
n,
ST_MAKEENVELOPE(xmin, ymin, xmax, ymax)
FROM (
SELECT
partition_id,
COUNT(1) AS n,
MIN(x) as xmin,
MIN(y) as ymin,
MAX(x) as xmax,
MAX(y) as ymax
FROM read_parquet('partitions.parquet')
GROUP BY partition_id
)
)
TO 'partition_boxes.geojson'
WITH (FORMAT GDAL, DRIVER 'GeoJSON');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment