Drop duplicate columns on a DataFrame in Spark
import org.apache.spark.sql.DataFrame
import scala.annotation.tailrec

implicit class DataFrameOperations(df: DataFrame) {

  // For every column name that appears more than once in `df` (typically after a join),
  // drop the copy that comes from `rmvDF`, keeping the one from the other side.
  def dropDuplicateCols(rmvDF: DataFrame): DataFrame = {
    // Column names occurring more than once in the schema.
    val cols = df.columns.groupBy(identity).mapValues(_.size).filter(_._2 > 1).keySet.toSeq

    @tailrec
    def deleteCol(df: DataFrame, cols: Seq[String]): DataFrame = {
      // Dropping by the rmvDF column reference removes only that side's copy.
      if (cols.isEmpty) df else deleteCol(df.drop(rmvDF(cols.head)), cols.tail)
    }

    deleteCol(df, cols)
  }
}

// rdd1 and rdd2 are DataFrames despite the names: join on "id", then drop the
// duplicated columns that originate from rdd1.
val dupDF = rdd1.join(rdd2, "id").dropDuplicateCols(rdd1)
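
A minimal, self-contained sketch of how the implicit class can be exercised; the SparkSession setup, the sample data, and the column names below are assumptions for illustration, not part of the original gist:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("dropDuplicateCols").getOrCreate()
import spark.implicits._

val left  = Seq((1, "a"), (2, "b")).toDF("id", "value")   // hypothetical data
val right = Seq((1, "x"), (2, "y")).toDF("id", "value")   // same "value" name on both sides

// The join duplicates "value"; dropDuplicateCols(left) removes the copy taken from `left`.
val joined = left.join(right, "id").dropDuplicateCols(left)
joined.printSchema()   // id, value (only the right-hand "value" remains)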
@robintanner

How is identity defined?
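
For reference: `identity` here is the standard function from `scala.Predef`, not something defined in the gist, so grouping by it buckets the column names by the names themselves. A small REPL-style illustration:

// scala.Predef defines:  @inline def identity[A](x: A): A = x
val dups = Seq("id", "value", "value").groupBy(identity).mapValues(_.size).filter(_._2 > 1).keySet
// dups == Set("value")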
