
@FernandoBontorin
Created February 27, 2020 01:12
Nulleable — Spark utilities for checking whether DataFrame columns contain null values
import org.apache.spark.sql.functions.{col, max}
import org.apache.spark.sql.{DataFrame, Row}
import scala.reflect.ClassTag

object SparkUtils {

  /** True if any of the given columns (or all columns, when none are given) contains a null. */
  def hasNull(df: DataFrame, cols: String*): Boolean = {
    hasNull(asArrayOf[Boolean](isNull(df, cols: _*).collect))
  }

  /** True if any element of the array is true. */
  def hasNull(b: Array[Boolean]): Boolean = {
    b.exists(p => p)
  }

  /** Names of the columns (among the given ones, or all) that contain at least one null. */
  def nullColumnNames(df: DataFrame, cols: String*): Array[String] = {
    val dfNulls = isNull(df, cols: _*)
    (dfNulls.columns zip asArrayOf[Boolean](dfNulls.collect)).toMap
      .filter(_._2)
      .keySet
      .toArray
  }

  /** Flattens the single aggregation row into a typed array (assumes exactly one row). */
  def asArrayOf[A: ClassTag](rows: Array[Row]): Array[A] = {
    rows.head.toSeq.asInstanceOf[Seq[A]].toArray
  }

  /**
   * One-row DataFrame where each column holds max(isNull): since true > false,
   * the max is true exactly when that column has at least one null.
   */
  def isNull(df: DataFrame, cols: String*): DataFrame = {
    if (cols.isEmpty) {
      df.select(df.columns.map(c => max(col(c).isNull).alias(c)): _*)
    } else {
      df.select(cols.map(c => max(col(c).isNull).alias(c)): _*)
    }
  }
}
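
A minimal usage sketch (not part of the original gist): it assumes a local SparkSession and builds a toy DataFrame in which the age column contains a null. The object name SparkUtilsExample and the column names name/age are illustrative.

import org.apache.spark.sql.SparkSession

object SparkUtilsExample {
  def main(args: Array[String]): Unit = {
    // Local session; assumes Spark is on the classpath.
    val spark = SparkSession.builder()
      .appName("nullable-check")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // Toy frame: "age" holds a null for bob, "name" is fully populated.
    val df = Seq(("alice", Option(30)), ("bob", Option.empty[Int]))
      .toDF("name", "age")

    println(SparkUtils.hasNull(df))                        // true  (some column has a null)
    println(SparkUtils.hasNull(df, "name"))                // false (name has no nulls)
    println(SparkUtils.nullColumnNames(df).mkString(", ")) // age

    spark.stop()
  }
}

Because isNull aggregates each column with a single max, the whole check runs in one pass over the data rather than one count per column.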