Created
December 17, 2021 14:03
-
-
Save MaherBhasvar/79f70e284330f38b0d8e940ec9533755 to your computer and use it in GitHub Desktop.
UDFs (User Defined Functions) are a very helpful API in Spark SQL. A UDF acts as a column-based function. One important case where I used one was converting complex data structures, such as arrays and structs, to String: I wanted to save the data as CSV, and writing to CSV does not work with these data structures.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Demonstrates flattening an array column to a plain string with a Spark SQL UDF
  * so that the resulting DataFrame can be written out as CSV.
  *
  * Steps: build a local SparkSession, create a two-row DataFrame with a single
  * `array<string>` column, replace that column with its space-joined string form,
  * and write the result as CSV.
  *
  * Expects the usual Spark SQL names to be in scope (`SparkSession`, `Row`,
  * `SaveMode`, `functions.{col, udf}`, and `types._`).
  *
  * @param args command-line arguments (unused)
  */
def main(args: Array[String]): Unit = {
  val spark = SparkSession
    .builder()
    .appName("Temp")
    .config("spark.master", "local")
    .getOrCreate()

  // Always release the session, even when one of the steps below throws.
  try {
    // Joins the array elements with single spaces; a null array stays null
    // so that a missing value is not turned into an empty string.
    val stringifyArray = udf { (vs: Seq[String]) =>
      Option(vs).map(_.mkString(" ")).orNull
    }

    val sampleRows: Seq[Row] = Seq(
      // first row: contains only one column
      Row(Array("Cool", "As", "Cucumber")),
      // second row: same single column as the previous row
      Row(Array("Dead", "As", "Doornail"))
    )

    // Single non-nullable column holding a non-null array of strings.
    val schema = StructType(
      List(
        StructField("nums", ArrayType(StringType, false), false)
      )
    )

    val df = spark.createDataFrame(spark.sparkContext.parallelize(sampleRows), schema)
    df.show()
    //output:
    //+--------------------+
    //|                nums|
    //+--------------------+
    //|[Cool, As, Cucumber]|
    //|[Dead, As, Doornail]|
    //+--------------------+

    // Add the string form, then drop the original array column.
    val updatedDf = df
      .withColumn("stringified", stringifyArray(col("nums")))
      .drop(col("nums"))
    updatedDf.show()
    //output:
    //+----------------+
    //|     stringified|
    //+----------------+
    //|Cool As Cucumber|
    //|Dead As Doornail|
    //+----------------+

    // Collapse to one partition before writing; replace the placeholder path
    // with a real output directory.
    updatedDf
      .coalesce(1)
      .write
      .mode(SaveMode.Overwrite)
      .csv(" your path ")
  } finally {
    spark.stop()
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment