Skip to content

Instantly share code, notes, and snippets.

@kylebarron
Created February 25, 2022 20:14
Show Gist options
  • Save kylebarron/2a8f3882607f1033c28571369348dd3a to your computer and use it in GitHub Desktop.
Save kylebarron/2a8f3882607f1033c28571369348dd3a to your computer and use it in GitHub Desktop.
Explore partitioning geometries into Arrow row groups with bbox metadata
import json
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import numpy as np
def main():
    """Write random points, bucketed into a 3x3 grid, to ``test.parquet``.

    Draws one million uniformly distributed lon/lat pairs inside a fixed
    bounding box, splits them by grid cell, turns every cell into its own
    Arrow table, and writes the tables one after another through a single
    ``ParquetWriter``. The file-level schema metadata carries a JSON mapping
    of partition index to the cell's comma-separated bbox string.
    """
    # Random point data inside the bounding box.
    size = 1_000_000
    us_bbox = [-126.39, 25.99, -65.7, 49.15]
    west, south, east, north = us_bbox
    lons = np.random.uniform(low=west, high=east, size=size)
    lats = np.random.uniform(low=south, high=north, size=size)

    # Stand-in for rtree boxes: a plain regular grid (quick hack).
    boxes = get_boxes(us_bbox, cols=3, rows=3)

    # Assign the points to grid cells, then build one table per cell.
    partitions = partition_data(lons, lats, boxes)
    arrow_groups = [
        create_arrow_group(points, cell)
        for points, cell in zip(partitions, boxes)
    ]

    # Partition index -> "minx,miny,maxx,maxy" lookup stored on the file.
    bbox_mapping = {}
    for idx, cell in enumerate(boxes):
        bbox_mapping[idx] = ",".join(str(coord) for coord in cell)

    partitions_meta = {"partitions": json.dumps(bbox_mapping)}
    file_schema = arrow_groups[0].schema.with_metadata(partitions_meta)

    with pq.ParquetWriter("test.parquet", file_schema) as writer:
        for table in arrow_groups:
            writer.write_table(table)
def get_boxes(us_bbox, cols=3, rows=3):
    """Divide a bounding box into a regular ``cols`` x ``rows`` grid.

    Args:
        us_bbox: Sequence ``(minx, miny, maxx, maxy)`` describing the extent
            to subdivide.
        cols: Number of cells along the x axis.
        rows: Number of cells along the y axis.

    Returns:
        A list of ``[minx, miny, maxx, maxy]`` lists, one per cell. Cells are
        ordered column by column: every row of the first column comes before
        any cell of the second column.
    """
    west, south, east, north = us_bbox
    cell_width = (east - west) / cols
    cell_height = (north - south) / rows
    return [
        [
            west + (col * cell_width),
            south + (row * cell_height),
            west + ((col + 1) * cell_width),
            south + ((row + 1) * cell_height),
        ]
        for col in range(cols)
        for row in range(rows)
    ]
def partition_data(lons, lats, boxes):
    """Split points into one coordinate array per bounding box.

    Args:
        lons: 1-D array-like of x (longitude) values.
        lats: 1-D array-like of y (latitude) values, same length as ``lons``.
        boxes: Iterable of ``[minx, miny, maxx, maxy]`` boxes, the layout
            produced by ``get_boxes``.

    Returns:
        A list with one ``(n, 2)`` array of ``[lon, lat]`` rows per box, in
        the same order as ``boxes``. A box containing no points yields an
        array of shape ``(0, 2)``.

    Membership is half-open: a point belongs to a box when
    ``minx <= lon < maxx`` and ``miny <= lat < maxy``. A point lying on an
    edge shared by two adjacent cells is therefore assigned to exactly one of
    them instead of being dropped; points on the maximum edge of a box are
    excluded from that box.
    """
    lons = np.asarray(lons)
    lats = np.asarray(lats)
    xy = np.column_stack((lons, lats))

    partitions = []
    for box in boxes:
        # Boxes are ordered (minx, miny, maxx, maxy); unpacking them in any
        # other order compares longitudes against latitude bounds.
        minx, miny, maxx, maxy = box
        inside = (
            (lons >= minx) & (lons < maxx) & (lats >= miny) & (lats < maxy)
        )
        partitions.append(xy[inside])
    return partitions
def create_arrow_group(partition, box):
    """Build an Arrow table for one partition, tagged with its bbox.

    Args:
        partition: 2-D array whose column 0 holds x values and column 1
            holds y values.
        box: Iterable of coordinates for the partition's bounding box.

    Returns:
        A ``pyarrow.Table`` with columns ``lon`` and ``lat`` whose schema
        metadata stores the box as a comma-separated string under ``bbox``.
    """
    bbox_text = ",".join(str(coord) for coord in box)
    columns = [pa.array(partition[:, axis]) for axis in (0, 1)]
    return pa.Table.from_arrays(
        columns, names=["lon", "lat"], metadata={"bbox": bbox_text}
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment