Skip to content

Instantly share code, notes, and snippets.

@jiayuasu
Last active June 6, 2022 01:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jiayuasu/6007073a52e196e79ff47d3f219e5ef1 to your computer and use it in GitHub Desktop.
Save jiayuasu/6007073a52e196e79ff47d3f219e5ef1 to your computer and use it in GitHub Desktop.
// Convert a GeoJSON FeatureCollection file into a file of individual GeoJSON Features
// and load the result with the Sedona GeoJSON reader.
val inputFile = "/Users/jiayu/Downloads/Apache_Sedona_Wherobots/gemeinde_de.geojson"
val featuresFile = "/Users/jiayu/Downloads/Apache_Sedona_Wherobots/gemeinde_de_features.geojson"

// Read the raw file with Spark's JSON reader and print the inferred schema.
val geoJson = sparkSession.read.json(inputFile)
geoJson.printSchema()

// Sedona requires the GeoJSON schema to be Feature, not FeatureCollection,
// because a FeatureCollection GeoJSON does NOT follow the one-geometry-per-record layout.
// See https://sedona.apache.org/tutorial/rdd/#create-a-generic-spatialrdd
// So the collection-level metadata has to be separated from the features.
val metaData = geoJson.select("crs", "source", "type")
metaData.show()

// Turn every element of the "features" array into its own row, and keep only the 3 elements
// a GeoJSON Feature requires: geometry, properties, and type.
val explodedFeatures = geoJson
  .select(explode(col("features")).as("feature"))
  .select("feature.geometry", "feature.properties", "feature.type")
explodedFeatures.printSchema()

// In the raw data file, Polygon data mistakenly follows the
// Struct(ArrayType(ArrayType(ArrayType("X", "Y"))), StringType) format.
// This violates the GeoJSON specification, so it is cast to
// Struct(ArrayType(ArrayType(ArrayType(DoubleType))), StringType).
// See GeoJSON specification: https://geojson.org/geojson-spec.html#id4
// MultiPolygon data mistakenly follows the
// Struct(ArrayType(ArrayType(ArrayType("[X, Y]"))), StringType) format.
// This also violates the GeoJSON specification and needs extra code to fix; it is skipped here.
// See GeoJSON specification: https://geojson.org/geojson-spec.html#id7
val wantedSchema = StructType(
  Seq(
    StructField("coordinates", ArrayType(ArrayType(ArrayType(DoubleType))), nullable = true),
    StructField("type", StringType)
  )
)

// Keep Polygon features only and cast their geometry to the schema above.
// The Sedona GeoJSON reader only reads 1 more level in the nested column, so the wanted
// properties (ADE and AGS here) have to be selected manually.
val features_json = explodedFeatures
  .filter("geometry.type = 'Polygon'")
  .select(
    col("geometry").cast(wantedSchema),
    struct("properties.ADE", "properties.AGS").as("properties"),
    col("type")
  )
features_json.printSchema()

// Store the cleaned data in a GeoJSON layout supported by Sedona, replacing any previous output.
features_json.write.mode(SaveMode.Overwrite).json(featuresFile)

// Now load the cleaned file using the Sedona GeoJSON reader.
val allowTopologyInvalidGeometries = true // Optional
val skipSyntaxInvalidGeometries = false // Optional
val spatialRDD = GeoJsonReader.readToGeometryRDD(
  sparkSession.sparkContext,
  featuresFile,
  allowTopologyInvalidGeometries,
  skipSyntaxInvalidGeometries
)
val spatialDf = Adapter.toDf(spatialRDD, sparkSession)
spatialDf.show()
println(spatialDf.count()) // 10635 polygons are loaded, out of 11354 records. The missing data is MultiPolygons.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment