# Youssef-Harby/geoparquet_metadata_point_example.py

## geoparquet_metadata_point_example.py
import pyarrow as pa
import pyarrow.parquet as pq
from pathlib import Path
import json
import pandas as pd


def process_parquet_file(parquet_path):
    """Write a GeoParquet copy of a Parquet file that has lon/lat columns.

    The input is read in full, a ``geometry`` column holding one WKB-encoded
    point per row is built from the ``longitude`` and ``latitude`` columns,
    a ``geo`` schema-metadata entry (WKB encoding, Point geometries,
    OGC:CRS84 PROJJSON) is attached, and the result is written next to the
    input as ``<stem>_modified.parquet``.

    Args:
        parquet_path: Location of the Parquet file (``str`` or
            ``pathlib.Path``). It must contain ``longitude`` and
            ``latitude`` columns.

    Raises:
        AttributeError: If the file has no ``longitude`` or ``latitude``
            column.
    """
    import math

    import geopandas as gpd

    # Read the Parquet file and switch to pandas for column manipulation.
    table = pq.read_table(parquet_path)
    df = table.to_pandas()

    # Encode the points in one vectorized call. The bytes go into a plain
    # DataFrame column: storing non-geometry values in the active geometry
    # column of a GeoDataFrame makes geopandas emit a warning.
    points = gpd.GeoSeries(
        gpd.points_from_xy(df.longitude, df.latitude), index=df.index
    )
    df = df.assign(geometry=points.to_wkb())

    # Back to Arrow; the pandas index is not written as a column.
    table = pa.Table.from_pandas(df, preserve_index=False)

    # GeoParquet metadata for the geometry column.
    geometry_column = {
        "encoding": "WKB",
        "geometry_types": ["Point"],
        "crs": {
            "$schema": "https://proj.org/schemas/v0.6/projjson.schema.json",
            "area": "World.",
            "bbox": {
                "east_longitude": 180,
                "north_latitude": 90,
                "south_latitude": -90,
                "west_longitude": -180,
            },
            "coordinate_system": {
                "axis": [
                    {
                        "abbreviation": "Lon",
                        "direction": "east",
                        "name": "Geodetic longitude",
                        "unit": "degree",
                    },
                    {
                        "abbreviation": "Lat",
                        "direction": "north",
                        "name": "Geodetic latitude",
                        "unit": "degree",
                    },
                ],
                "subtype": "ellipsoidal",
            },
            "datum_ensemble": {
                "accuracy": "2.0",
                "ellipsoid": {
                    "inverse_flattening": 298.257223563,
                    "name": "WGS 84",
                    "semi_major_axis": 6378137,
                },
                "id": {"authority": "EPSG", "code": 6326},
                "members": [
                    {
                        "id": {"authority": "EPSG", "code": 1166},
                        "name": "World Geodetic System 1984 (Transit)",
                    },
                    {
                        "id": {"authority": "EPSG", "code": 1152},
                        "name": "World Geodetic System 1984 (G730)",
                    },
                    {
                        "id": {"authority": "EPSG", "code": 1153},
                        "name": "World Geodetic System 1984 (G873)",
                    },
                    {
                        "id": {"authority": "EPSG", "code": 1154},
                        "name": "World Geodetic System 1984 (G1150)",
                    },
                    {
                        "id": {"authority": "EPSG", "code": 1155},
                        "name": "World Geodetic System 1984 (G1674)",
                    },
                    {
                        "id": {"authority": "EPSG", "code": 1156},
                        "name": "World Geodetic System 1984 (G1762)",
                    },
                    {
                        "id": {"authority": "EPSG", "code": 1309},
                        "name": "World Geodetic System 1984 (G2139)",
                    },
                ],
                "name": "World Geodetic System 1984 ensemble",
            },
            "id": {"authority": "OGC", "code": "CRS84"},
            "name": "WGS 84 (CRS84)",
            "scope": "Not known.",
            "type": "GeographicCRS",
        },
    }

    # Convert the NumPy scalars to built-in floats: json.dumps rejects
    # NumPy integer and float32 scalars.
    bounds = [
        float(df.longitude.min()),
        float(df.latitude.min()),
        float(df.longitude.max()),
        float(df.latitude.max()),
    ]
    # An empty or all-NaN coordinate column yields NaN bounds, which would be
    # serialized as the non-standard JSON token NaN; leave out the optional
    # bbox in that case.
    if all(math.isfinite(value) for value in bounds):
        geometry_column["bbox"] = bounds

    geo_metadata = {
        "version": "1.1.0-dev",
        "primary_column": "geometry",
        "columns": {"geometry": geometry_column},
    }

    # Keep the existing schema metadata and add the "geo" entry. Only the
    # metadata changes, so no column cast is needed.
    existing_metadata = table.schema.metadata or {}
    new_table = table.replace_schema_metadata(
        {
            **existing_metadata,
            b"geo": json.dumps(geo_metadata).encode("utf-8"),
        }
    )

    # Derive the output name from the file name only, so a ".parquet" that
    # appears in a directory name is left untouched and an input without the
    # ".parquet" suffix is never overwritten.
    source = Path(parquet_path)
    output_path = source.with_name(f"{source.stem}_modified.parquet")
    pq.write_table(new_table, str(output_path))

    print(f"Processed and saved modified file: {output_path}")


# Base directory containing the Parquet files
base_directory = Path("./output_directory")

# Collect the inputs before processing anything: process_parquet_file writes
# "*_modified.parquet" files into the same tree, so iterating rglob lazily
# (or rerunning the script) could otherwise pick up its own outputs.
parquet_files = sorted(
    path
    for path in base_directory.rglob("*.parquet")
    if not path.name.endswith("_modified.parquet")
)

# Process each Parquet file in the directory and its subdirectories
for parquet_file in parquet_files:
    print(f"Processing file: {parquet_file}")
    process_parquet_file(parquet_file)
	import pyarrow as pa
	import pyarrow.parquet as pq
	from pathlib import Path
	import json
	import pandas as pd


	def process_parquet_file(parquet_path):
	    """Write a GeoParquet copy of a Parquet file that has lon/lat columns.

	    The input is read in full, a ``geometry`` column holding one WKB-encoded
	    point per row is built from the ``longitude`` and ``latitude`` columns,
	    a ``geo`` schema-metadata entry (WKB encoding, Point geometries,
	    OGC:CRS84 PROJJSON) is attached, and the result is written next to the
	    input as ``<stem>_modified.parquet``.

	    Args:
	        parquet_path: Location of the Parquet file (``str`` or
	            ``pathlib.Path``). It must contain ``longitude`` and
	            ``latitude`` columns.

	    Raises:
	        AttributeError: If the file has no ``longitude`` or ``latitude``
	            column.
	    """
	    import math

	    import geopandas as gpd

	    # Read the Parquet file and switch to pandas for column manipulation.
	    table = pq.read_table(parquet_path)
	    df = table.to_pandas()

	    # Encode the points in one vectorized call. The bytes go into a plain
	    # DataFrame column: storing non-geometry values in the active geometry
	    # column of a GeoDataFrame makes geopandas emit a warning.
	    points = gpd.GeoSeries(
	        gpd.points_from_xy(df.longitude, df.latitude), index=df.index
	    )
	    df = df.assign(geometry=points.to_wkb())

	    # Back to Arrow; the pandas index is not written as a column.
	    table = pa.Table.from_pandas(df, preserve_index=False)

	    # GeoParquet metadata for the geometry column.
	    geometry_column = {
	        "encoding": "WKB",
	        "geometry_types": ["Point"],
	        "crs": {
	            "$schema": "https://proj.org/schemas/v0.6/projjson.schema.json",
	            "area": "World.",
	            "bbox": {
	                "east_longitude": 180,
	                "north_latitude": 90,
	                "south_latitude": -90,
	                "west_longitude": -180,
	            },
	            "coordinate_system": {
	                "axis": [
	                    {
	                        "abbreviation": "Lon",
	                        "direction": "east",
	                        "name": "Geodetic longitude",
	                        "unit": "degree",
	                    },
	                    {
	                        "abbreviation": "Lat",
	                        "direction": "north",
	                        "name": "Geodetic latitude",
	                        "unit": "degree",
	                    },
	                ],
	                "subtype": "ellipsoidal",
	            },
	            "datum_ensemble": {
	                "accuracy": "2.0",
	                "ellipsoid": {
	                    "inverse_flattening": 298.257223563,
	                    "name": "WGS 84",
	                    "semi_major_axis": 6378137,
	                },
	                "id": {"authority": "EPSG", "code": 6326},
	                "members": [
	                    {
	                        "id": {"authority": "EPSG", "code": 1166},
	                        "name": "World Geodetic System 1984 (Transit)",
	                    },
	                    {
	                        "id": {"authority": "EPSG", "code": 1152},
	                        "name": "World Geodetic System 1984 (G730)",
	                    },
	                    {
	                        "id": {"authority": "EPSG", "code": 1153},
	                        "name": "World Geodetic System 1984 (G873)",
	                    },
	                    {
	                        "id": {"authority": "EPSG", "code": 1154},
	                        "name": "World Geodetic System 1984 (G1150)",
	                    },
	                    {
	                        "id": {"authority": "EPSG", "code": 1155},
	                        "name": "World Geodetic System 1984 (G1674)",
	                    },
	                    {
	                        "id": {"authority": "EPSG", "code": 1156},
	                        "name": "World Geodetic System 1984 (G1762)",
	                    },
	                    {
	                        "id": {"authority": "EPSG", "code": 1309},
	                        "name": "World Geodetic System 1984 (G2139)",
	                    },
	                ],
	                "name": "World Geodetic System 1984 ensemble",
	            },
	            "id": {"authority": "OGC", "code": "CRS84"},
	            "name": "WGS 84 (CRS84)",
	            "scope": "Not known.",
	            "type": "GeographicCRS",
	        },
	    }

	    # Convert the NumPy scalars to built-in floats: json.dumps rejects
	    # NumPy integer and float32 scalars.
	    bounds = [
	        float(df.longitude.min()),
	        float(df.latitude.min()),
	        float(df.longitude.max()),
	        float(df.latitude.max()),
	    ]
	    # An empty or all-NaN coordinate column yields NaN bounds, which would be
	    # serialized as the non-standard JSON token NaN; leave out the optional
	    # bbox in that case.
	    if all(math.isfinite(value) for value in bounds):
	        geometry_column["bbox"] = bounds

	    geo_metadata = {
	        "version": "1.1.0-dev",
	        "primary_column": "geometry",
	        "columns": {"geometry": geometry_column},
	    }

	    # Keep the existing schema metadata and add the "geo" entry. Only the
	    # metadata changes, so no column cast is needed.
	    existing_metadata = table.schema.metadata or {}
	    new_table = table.replace_schema_metadata(
	        {
	            **existing_metadata,
	            b"geo": json.dumps(geo_metadata).encode("utf-8"),
	        }
	    )

	    # Derive the output name from the file name only, so a ".parquet" that
	    # appears in a directory name is left untouched and an input without the
	    # ".parquet" suffix is never overwritten.
	    source = Path(parquet_path)
	    output_path = source.with_name(f"{source.stem}_modified.parquet")
	    pq.write_table(new_table, str(output_path))

	    print(f"Processed and saved modified file: {output_path}")


	# Base directory containing the Parquet files
	base_directory = Path("./output_directory")

	# Collect the inputs before processing anything: process_parquet_file writes
	# "*_modified.parquet" files into the same tree, so iterating rglob lazily
	# (or rerunning the script) could otherwise pick up its own outputs.
	parquet_files = sorted(
	    path
	    for path in base_directory.rglob("*.parquet")
	    if not path.name.endswith("_modified.parquet")
	)

	# Process each Parquet file in the directory and its subdirectories
	for parquet_file in parquet_files:
	    print(f"Processing file: {parquet_file}")
	    process_parquet_file(parquet_file)