Last active
June 6, 2022 01:54
-
-
Save jiayuasu/6007073a52e196e79ff47d3f219e5ef1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Cleans a FeatureCollection GeoJSON file with Spark SQL, writes the Polygon features back out as
// one Feature per record, and loads that output with the Sedona GeoJSON reader.
//
// NOTE(review): this snippet declares no imports and does not create `sparkSession`. It assumes
// `sparkSession`, the Spark SQL helpers (`explode`, `col`, `struct`), the Spark SQL types
// (`StructType`, `StructField`, `ArrayType`, `DoubleType`, `StringType`), `SaveMode`, and Sedona's
// `GeoJsonReader` and `Adapter` are already in scope -- confirm against the surrounding session.

// Raw FeatureCollection input, and the location where the cleaned per-feature GeoJSON is written.
val inputFile = "/Users/jiayu/Downloads/Apache_Sedona_Wherobots/gemeinde_de.geojson"
val featuresFile = "/Users/jiayu/Downloads/Apache_Sedona_Wherobots/gemeinde_de_features.geojson"

val geoJson = sparkSession.read.json(inputFile)
geoJson.printSchema()

// Sedona requires each GeoJSON record to be a Feature, not a FeatureCollection, because a
// FeatureCollection does NOT follow the one-geometry-per-record layout.
// See https://sedona.apache.org/tutorial/rdd/#create-a-generic-spatialrdd
// So the collection-level metadata has to be separated from the features.
val metaData = geoJson.select("crs", "source", "type")
metaData.show()

// Explode the "features" array into one row per feature and keep only the three members a
// GeoJSON Feature needs: geometry, properties, and type.
val rawFeatures = geoJson
  .select(explode(col("features")).as("feature"))
  .select("feature.geometry", "feature.properties", "feature.type")
rawFeatures.printSchema()

// In the raw data file, Polygon data mistakenly follows the
// Struct(ArrayType(ArrayType(ArrayType("X", "Y"))), StringType) format, which violates the GeoJSON
// specification. It has to be cast to Struct(ArrayType(ArrayType(ArrayType(DoubleType))), StringType).
// See GEOJSON Specification: https://geojson.org/geojson-spec.html#id4
//
// MultiPolygon data mistakenly follows the
// Struct(ArrayType(ArrayType(ArrayType("[X, Y]"))), StringType) format, which also violates the
// specification. Fixing it needs extra code, so MultiPolygons are skipped here.
// See GEOJSON Specification: https://geojson.org/geojson-spec.html#id7
val wantedSchema = StructType(
  Seq(
    StructField("coordinates", ArrayType(ArrayType(ArrayType(DoubleType))), true),
    StructField("type", StringType)
  )
)

// The Sedona GeoJSON reader only reads one more level of a nested column, so the wanted
// properties (ADE and AGS) are selected explicitly into a flat "properties" struct.
val features_json = rawFeatures
  .filter("geometry.type = 'Polygon'")
  .select(
    col("geometry").cast(wantedSchema),
    struct("properties.ADE", "properties.AGS").as("properties"),
    col("type")
  )
features_json.printSchema()

// Store the cleaned data in a GeoJSON layout supported by Sedona, replacing any previous output.
features_json.write.mode(SaveMode.Overwrite).json(featuresFile)

// Now load the cleaned GeoJSON using the Sedona GeoJSON reader.
val allowTopologyInvalidGeometries = true // Optional
val skipSyntaxInvalidGeometries = false // Optional
val spatialRDD = GeoJsonReader.readToGeometryRDD(
  sparkSession.sparkContext,
  featuresFile,
  allowTopologyInvalidGeometries,
  skipSyntaxInvalidGeometries
)
val spatialDf = Adapter.toDf(spatialRDD, sparkSession)
spatialDf.show()

// 10635 polygons are loaded, out of 11354 records. The missing data is MultiPolygons.
println(spatialDf.count())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment