Skip to content

Instantly share code, notes, and snippets.

@arpadtamasi
Created December 12, 2017 17:35
Show Gist options
  • Save arpadtamasi/c51df064378b1e4f3c417e46c9212aac to your computer and use it in GitHub Desktop.
Save arpadtamasi/c51df064378b1e4f3c417e46c9212aac to your computer and use it in GitHub Desktop.
Spark schema nullability modifier
/** Helpers for rewriting the nullability flags in the schema of a Spark `DataFrame`. */
object DataFrameHelpers {
  // Imported locally so the object compiles on its own; `DataFrame` is not part of `sql.types`.
  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.types._

  /** Adds [[setNullableStateOfColumns]] to every `DataFrame` in scope of this implicit. */
  implicit class NullableDataFrame(df: DataFrame) {

    /**
     * Returns a new `DataFrame` over the same rows whose schema has the nullability flags of
     * the named columns replaced.
     *
     * Columns are addressed by name. Fields nested inside a struct column, or inside an array
     * column whose elements are structs, are addressed with a dot-separated path such as
     * `"parent.child"`. Naming a nested path changes only the nested field; the enclosing
     * column keeps its own flags unless it is named as well.
     *
     * @param nullable     value to assign to `StructField.nullable` of every named column
     * @param containsNull value to assign to `ArrayType.containsNull` of every named array column
     * @param cn           names (or dot-separated paths) of the columns to modify
     * @return a `DataFrame` built from `df.rdd` with the modified schema
     */
    def setNullableStateOfColumns(nullable: Boolean, containsNull: Boolean, cn: String*): DataFrame = {
      val newSchema = schemaWithNullableStates(df.schema, nullable, containsNull, cn.toSet)
      // Re-create the DataFrame from the underlying RDD so the new schema is applied.
      df.sqlContext.createDataFrame(df.rdd, newSchema)
    }

    /**
     * Rebuilds `schema`, applying `nullable` / `containsNull` to the fields selected by `names`
     * and recursing into struct columns and arrays of structs for dot-separated paths.
     * Fields that are neither named nor on a named path are returned unchanged.
     */
    private def schemaWithNullableStates(schema: StructType, nullable: Boolean, containsNull: Boolean, names: Set[String]): StructType = {
      // Paths that continue below `parent`, with the "parent." prefix stripped.
      def nestedNames(parent: String): Set[String] = {
        val prefix = s"$parent."
        names.collect { case n if n.startsWith(prefix) => n.drop(prefix.length) }
      }

      StructType(schema.map { field =>
        val selected = names.contains(field.name)
        val inner = nestedNames(field.name)
        // The field's own flag changes only when the field itself was named; being the parent
        // of a named nested path must not alter it.
        val ownNullable = if (selected) nullable else field.nullable

        field.dataType match {
          case ArrayType(element: StructType, elementsNullable) if inner.nonEmpty =>
            val newElement = schemaWithNullableStates(element, nullable, containsNull, inner)
            val newContainsNull = if (selected) containsNull else elementsNullable
            field.copy(dataType = ArrayType(newElement, newContainsNull), nullable = ownNullable)

          case struct: StructType if inner.nonEmpty =>
            val newStruct = schemaWithNullableStates(struct, nullable, containsNull, inner)
            field.copy(dataType = newStruct, nullable = ownNullable)

          case array: ArrayType if selected =>
            field.copy(dataType = array.copy(containsNull = containsNull), nullable = nullable)

          case _ if selected =>
            field.copy(nullable = nullable)

          case _ =>
            field
        }
      })
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment