Created
February 25, 2022 20:14
-
-
Save kylebarron/2a8f3882607f1033c28571369348dd3a to your computer and use it in GitHub Desktop.
Explore partitioning geometries into Arrow row groups, each carrying bounding-box (bbox) metadata
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import pyarrow as pa | |
import pyarrow.parquet as pq | |
import pandas as pd | |
import numpy as np | |
def main():
    """Generate random points, bucket them into a 3x3 grid, and write Parquet.

    Each grid cell becomes its own Arrow table and is passed to the writer
    with a separate ``write_table`` call. The file-level schema metadata
    stores a JSON mapping from partition index to that partition's bbox
    string under the ``"partitions"`` key. Output goes to ``test.parquet``.
    """
    # Random lon/lat samples drawn uniformly inside the bounding box.
    n_points = 1_000_000
    us_bbox = [-126.39, 25.99, -65.7, 49.15]
    west, south, east, north = us_bbox
    lons = np.random.uniform(low=west, high=east, size=n_points)
    lats = np.random.uniform(low=south, high=north, size=n_points)

    # A regular grid stands in for real r-tree boxes (quick hack).
    boxes = get_boxes(us_bbox, cols=3, rows=3)

    # One array of points per box, then one Arrow table per array.
    partitions = partition_data(lons, lats, boxes)
    arrow_groups = [
        create_arrow_group(points, cell)
        for points, cell in zip(partitions, boxes)
    ]

    # Partition index -> "minx,miny,maxx,maxy", stored as file metadata.
    bbox_mapping = {
        idx: ",".join(str(coord) for coord in cell)
        for idx, cell in enumerate(boxes)
    }
    partitions_json = json.dumps(bbox_mapping)
    file_schema = arrow_groups[0].schema.with_metadata(
        {"partitions": partitions_json}
    )

    with pq.ParquetWriter("test.parquet", file_schema) as writer:
        for table in arrow_groups:
            writer.write_table(table)
def get_boxes(us_bbox, cols=3, rows=3):
    """Split a bounding box into a regular ``cols`` x ``rows`` grid.

    Args:
        us_bbox: Sequence ``(minx, miny, maxx, maxy)`` to subdivide.
        cols: Number of equal-width cells along x.
        rows: Number of equal-height cells along y.

    Returns:
        A list of ``cols * rows`` boxes, each ``[minx, miny, maxx, maxy]``,
        ordered column by column (x index outer, y index inner).
    """
    west, south, east, north = us_bbox
    cell_w = (east - west) / cols
    cell_h = (north - south) / rows
    return [
        [
            west + (col * cell_w),
            south + (row * cell_h),
            west + ((col + 1) * cell_w),
            south + ((row + 1) * cell_h),
        ]
        for col in range(cols)
        for row in range(rows)
    ]
def partition_data(lons, lats, boxes):
    """Group points by the bounding box that contains them.

    Args:
        lons: 1-D array-like of x (longitude) values.
        lats: 1-D array-like of y (latitude) values, same length as ``lons``.
        boxes: Iterable of boxes, each ``(minx, miny, maxx, maxy)``.

    Returns:
        A list with one ``(n_i, 2)`` array of ``[lon, lat]`` rows per box,
        in the same order as ``boxes``. A box with no points yields an
        array of shape ``(0, 2)``.

    Membership is half-open: ``minx <= lon < maxx`` and
    ``miny <= lat < maxy``. A point on an edge shared by two adjacent boxes
    therefore lands in exactly one of them; points on a box's max edge are
    not included in that box.
    """
    lons = np.asarray(lons)
    lats = np.asarray(lats)
    xy = np.column_stack((lons, lats))

    partitions = []
    for box in boxes:
        # Boxes are (minx, miny, maxx, maxy); unpack in that order.
        minx, miny, maxx, maxy = box
        inside = (
            (lons >= minx) & (lons < maxx) & (lats >= miny) & (lats < maxy)
        )
        partitions.append(xy[inside])
    return partitions
def create_arrow_group(partition, box):
    """Build an Arrow table of points tagged with its bounding box.

    Args:
        partition: 2-D array indexed as ``partition[:, 0]`` for x and
            ``partition[:, 1]`` for y.
        box: Iterable of box coordinates; joined with commas into the
            ``"bbox"`` schema metadata value.

    Returns:
        A ``pa.Table`` with columns ``lon`` and ``lat`` and schema metadata
        ``{"bbox": "<comma-joined box>"}``.
    """
    bbox_text = ",".join(str(coord) for coord in box)
    # Column 0 holds x values (lon), column 1 holds y values (lat).
    columns = tuple(pa.array(partition[:, axis]) for axis in (0, 1))
    return pa.Table.from_arrays(
        columns, names=["lon", "lat"], metadata={"bbox": bbox_text}
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment