Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Youssef-Harby/3e676fd564fc671c90c2ba8cb7438b69 to your computer and use it in GitHub Desktop.
Save Youssef-Harby/3e676fd564fc671c90c2ba8cb7438b69 to your computer and use it in GitHub Desktop.
GeoParquet 1.1 metadata example
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path
import json
import pandas as pd
# PROJJSON definition of OGC:CRS84 (WGS 84 with longitude/latitude axis
# order). It is embedded unchanged as the "crs" entry of the geometry column's
# GeoParquet metadata.
_OGC_CRS84_PROJJSON = {
    "$schema": "https://proj.org/schemas/v0.6/projjson.schema.json",
    "area": "World.",
    "bbox": {
        "east_longitude": 180,
        "north_latitude": 90,
        "south_latitude": -90,
        "west_longitude": -180,
    },
    "coordinate_system": {
        "axis": [
            {
                "abbreviation": "Lon",
                "direction": "east",
                "name": "Geodetic longitude",
                "unit": "degree",
            },
            {
                "abbreviation": "Lat",
                "direction": "north",
                "name": "Geodetic latitude",
                "unit": "degree",
            },
        ],
        "subtype": "ellipsoidal",
    },
    "datum_ensemble": {
        "accuracy": "2.0",
        "ellipsoid": {
            "inverse_flattening": 298.257223563,
            "name": "WGS 84",
            "semi_major_axis": 6378137,
        },
        "id": {"authority": "EPSG", "code": 6326},
        "members": [
            {
                "id": {"authority": "EPSG", "code": 1166},
                "name": "World Geodetic System 1984 (Transit)",
            },
            {
                "id": {"authority": "EPSG", "code": 1152},
                "name": "World Geodetic System 1984 (G730)",
            },
            {
                "id": {"authority": "EPSG", "code": 1153},
                "name": "World Geodetic System 1984 (G873)",
            },
            {
                "id": {"authority": "EPSG", "code": 1154},
                "name": "World Geodetic System 1984 (G1150)",
            },
            {
                "id": {"authority": "EPSG", "code": 1155},
                "name": "World Geodetic System 1984 (G1674)",
            },
            {
                "id": {"authority": "EPSG", "code": 1156},
                "name": "World Geodetic System 1984 (G1762)",
            },
            {
                "id": {"authority": "EPSG", "code": 1309},
                "name": "World Geodetic System 1984 (G2139)",
            },
        ],
        "name": "World Geodetic System 1984 ensemble",
    },
    "id": {"authority": "OGC", "code": "CRS84"},
    "name": "WGS 84 (CRS84)",
    "scope": "Not known.",
    "type": "GeographicCRS",
}


def _modified_output_path(parquet_path):
    """Return the sibling path ``<stem>_modified.parquet`` for *parquet_path*."""
    source = Path(parquet_path)
    # Only the file name is rewritten. A plain str.replace(".parquet", ...)
    # would also rewrite directory names containing ".parquet", and would
    # return the input path itself when the file has a different suffix.
    return source.with_name(f"{source.stem}_modified.parquet")


def _lonlat_bbox(df):
    """Return ``[min_lon, min_lat, max_lon, max_lat]`` as floats, or ``None``.

    ``None`` is returned when any bound is NaN (for example an empty frame),
    because NaN cannot be represented in valid JSON.
    """
    # float() turns numpy scalars (int64, float32, ...) into plain Python
    # floats, which json.dumps can always serialize.
    bounds = [
        float(df.longitude.min()),
        float(df.latitude.min()),
        float(df.longitude.max()),
        float(df.latitude.max()),
    ]
    if any(pd.isna(bound) for bound in bounds):
        return None
    return bounds


def process_parquet_file(parquet_path):
    """Write a copy of a Parquet file with a WKB point column and "geo" metadata.

    The file is read into a DataFrame, a point is built for every row from
    its ``longitude`` and ``latitude`` columns and stored as WKB bytes in a
    ``geometry`` column. A JSON-encoded ``geo`` entry describing that column
    is added to the Arrow schema metadata (existing metadata keys are kept),
    and the result is written next to the input as ``<stem>_modified.parquet``.
    The input file itself is not modified.

    Args:
        parquet_path: Path (``str`` or ``pathlib.Path``) of the Parquet file
            to read.

    Raises:
        AttributeError: If the table has no ``longitude`` or ``latitude``
            column.
    """
    # Local import, as in the original script: geopandas is only needed here.
    import geopandas as gpd

    df = pq.read_table(parquet_path).to_pandas()

    # Build the points positionally and store their WKB directly in a plain
    # DataFrame column; no GeoDataFrame is needed just to serialize points.
    points = gpd.points_from_xy(df.longitude, df.latitude)
    df["geometry"] = [point.wkb for point in points]

    table = pa.Table.from_pandas(df, preserve_index=False)

    # GeoParquet column metadata; "bbox" is only included when it is finite.
    column_metadata = {
        "encoding": "WKB",
        "geometry_types": ["Point"],
        "crs": _OGC_CRS84_PROJJSON,
    }
    bbox = _lonlat_bbox(df)
    if bbox is not None:
        column_metadata["bbox"] = bbox

    geo_metadata = {
        "version": "1.1.0-dev",
        "primary_column": "geometry",
        "columns": {"geometry": column_metadata},
    }

    # schema.metadata is None when the schema carries no metadata at all.
    schema_metadata = dict(table.schema.metadata or {})
    schema_metadata[b"geo"] = json.dumps(geo_metadata).encode("utf-8")
    new_table = table.replace_schema_metadata(schema_metadata)

    output_path = str(_modified_output_path(parquet_path))
    pq.write_table(new_table, output_path)
    print(f"Processed and saved modified file: {output_path}")
# Base directory containing the Parquet files
base_directory = Path("./output_directory")

# Process each Parquet file in the directory and its subdirectories.
# The matches are collected (and sorted for a stable order) before any file is
# processed, because process_parquet_file writes new "*_modified.parquet"
# files into the same tree while this loop runs.
for parquet_file in sorted(base_directory.rglob("*.parquet")):
    # Skip outputs of an earlier run so they are not processed again into
    # "*_modified_modified.parquet".
    if parquet_file.stem.endswith("_modified"):
        continue
    print(f"Processing file: {parquet_file}")
    process_parquet_file(parquet_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment