Created
April 23, 2024 01:44
-
-
Save Youssef-Harby/3e676fd564fc671c90c2ba8cb7438b69 to your computer and use it in GitHub Desktop.
GeoParquet 1.1 metadata example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import math
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
def _crs84_projjson():
    """Return a fresh PROJJSON dict describing OGC:CRS84 (WGS 84, lon/lat order)."""
    return {
        "$schema": "https://proj.org/schemas/v0.6/projjson.schema.json",
        "area": "World.",
        "bbox": {
            "east_longitude": 180,
            "north_latitude": 90,
            "south_latitude": -90,
            "west_longitude": -180,
        },
        "coordinate_system": {
            "axis": [
                {
                    "abbreviation": "Lon",
                    "direction": "east",
                    "name": "Geodetic longitude",
                    "unit": "degree",
                },
                {
                    "abbreviation": "Lat",
                    "direction": "north",
                    "name": "Geodetic latitude",
                    "unit": "degree",
                },
            ],
            "subtype": "ellipsoidal",
        },
        "datum_ensemble": {
            "accuracy": "2.0",
            "ellipsoid": {
                "inverse_flattening": 298.257223563,
                "name": "WGS 84",
                "semi_major_axis": 6378137,
            },
            "id": {"authority": "EPSG", "code": 6326},
            "members": [
                {
                    "id": {"authority": "EPSG", "code": 1166},
                    "name": "World Geodetic System 1984 (Transit)",
                },
                {
                    "id": {"authority": "EPSG", "code": 1152},
                    "name": "World Geodetic System 1984 (G730)",
                },
                {
                    "id": {"authority": "EPSG", "code": 1153},
                    "name": "World Geodetic System 1984 (G873)",
                },
                {
                    "id": {"authority": "EPSG", "code": 1154},
                    "name": "World Geodetic System 1984 (G1150)",
                },
                {
                    "id": {"authority": "EPSG", "code": 1155},
                    "name": "World Geodetic System 1984 (G1674)",
                },
                {
                    "id": {"authority": "EPSG", "code": 1156},
                    "name": "World Geodetic System 1984 (G1762)",
                },
                {
                    "id": {"authority": "EPSG", "code": 1309},
                    "name": "World Geodetic System 1984 (G2139)",
                },
            ],
            "name": "World Geodetic System 1984 ensemble",
        },
        "id": {"authority": "OGC", "code": "CRS84"},
        "name": "WGS 84 (CRS84)",
        "scope": "Not known.",
        "type": "GeographicCRS",
    }


def _finite_bbox(longitudes, latitudes):
    """Return [xmin, ymin, xmax, ymax] as Python floats, or None if not finite."""
    # float() turns NumPy scalars (int64, float32, ...) into values that
    # json.dumps can always serialise.
    bounds = [
        float(longitudes.min()),
        float(latitudes.min()),
        float(longitudes.max()),
        float(latitudes.max()),
    ]
    # Empty or all-NaN columns give NaN bounds, which json.dumps would emit
    # as the non-JSON token NaN; report "no bbox" instead.
    if all(math.isfinite(bound) for bound in bounds):
        return bounds
    return None


def _build_geo_metadata(bbox):
    """Build the GeoParquet "geo" metadata dict for a WKB point column."""
    geometry_column = {
        "encoding": "WKB",
        "geometry_types": ["Point"],
        "crs": _crs84_projjson(),
    }
    if bbox is not None:
        geometry_column["bbox"] = bbox
    return {
        "version": "1.1.0-dev",
        "primary_column": "geometry",
        "columns": {"geometry": geometry_column},
    }


def process_parquet_file(parquet_path):
    """Add a WKB point geometry column and GeoParquet metadata to a Parquet file.

    The file is read into pandas, a ``geometry`` column holding the WKB
    encoding of ``Point(longitude, latitude)`` is added, and the result is
    written next to the input as ``<stem>_modified.parquet`` with a ``geo``
    key in the schema metadata. The input file is never overwritten.

    Args:
        parquet_path: Path (``str`` or ``pathlib.Path``) of the Parquet file.
            It must contain ``longitude`` and ``latitude`` columns.

    Returns:
        None. The output location is printed to stdout.
    """
    # Imported here, as in the original script, so geopandas is only needed
    # when a file is actually processed.
    import geopandas as gpd

    source_path = Path(parquet_path)
    df = pq.read_table(source_path).to_pandas()

    # Store the WKB bytes in a plain DataFrame column. Writing raw bytes into
    # the active geometry column of a GeoDataFrame relies on geopandas
    # tolerating non-geometry values there.
    points = gpd.points_from_xy(df.longitude, df.latitude)
    df["geometry"] = [point.wkb for point in points]
    table = pa.Table.from_pandas(df, preserve_index=False)

    geo_metadata = _build_geo_metadata(_finite_bbox(df.longitude, df.latitude))

    # schema.metadata is None when the schema carries no key/value metadata.
    existing_metadata = table.schema.metadata or {}
    new_table = table.replace_schema_metadata(
        {
            **existing_metadata,
            b"geo": json.dumps(geo_metadata).encode("utf-8"),
        }
    )

    # Derive the output name from the file stem only, so directory names that
    # contain ".parquet" are left alone and an input with another suffix can
    # never resolve to the input path itself.
    output_path = source_path.with_name(f"{source_path.stem}_modified.parquet")
    pq.write_table(new_table, str(output_path))
    print(f"Processed and saved modified file: {output_path}")
# Base directory containing the Parquet files
base_directory = Path("./output_directory")

# Snapshot the matches before processing: process_parquet_file writes
# "<stem>_modified.parquet" into the same tree, and a lazy rglob could yield
# those fresh outputs mid-run. Files whose stem already ends in "_modified"
# are skipped so a re-run does not produce "*_modified_modified.parquet".
parquet_files = sorted(
    candidate
    for candidate in base_directory.rglob("*.parquet")
    if not candidate.stem.endswith("_modified")
)

# Process each Parquet file in the directory and its subdirectories
for parquet_file in parquet_files:
    print(f"Processing file: {parquet_file}")
    process_parquet_file(parquet_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment