Skip to content

Instantly share code, notes, and snippets.

@stdatalabs
Last active January 23, 2019 10:30
Show Gist options
  • Save stdatalabs/aaf286a9155c6e0c60d70dd1751155d8 to your computer and use it in GitHub Desktop.
Save stdatalabs/aaf286a9155c6e0c60d70dd1751155d8 to your computer and use it in GitHub Desktop.
A Spark UDF to find the MD5 message digest of a column. More @ stdatalabs.blogspot.com
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.hive.HiveContext
// Build a HiveContext on top of the SparkContext `sc`. `sc` is not defined anywhere in this
// script (presumably it is the one spark-shell provides — TODO confirm).
// NOTE(review): apart from the two hiveContext imports that follow, the rest of the script
// goes through `sqlContext` rather than this value.
val hiveContext = new HiveContext(sc)
import hiveContext.implicits._
import hiveContext.sql
import sqlContext.implicits._
import java.security.MessageDigest
/**
* A Spark UDF to find the MD5 message digest of a column
*
* More discussion at stdatalabs.blogspot.com
*
* @author Sachin Thirumala
*/
/**
 * Row schema for the customer_info table. Every column is held as plain text.
 *
 * @param name    customer name
 * @param dob     date of birth, stored as a string (not parsed as a date here)
 * @param address street address
 * @param city    city name
 */
case class customer_info(
  name: String,
  dob: String,
  address: String,
  city: String
)
// Sample rows: four hand-written customers, handed to the SparkContext as an RDD.
// NOTE(review): "31-09-1973" is not a real calendar date (September has 30 days). dob is only
// ever treated as text in this script, so nothing here depends on it being parseable.
val details = sc.parallelize(Array(
  customer_info(name = "Sachin", dob = "10-10-1972", address = "#24, Malad", city = "Mumbai"),
  customer_info(name = "Sourav", dob = "31-09-1973", address = "#41, ultadanga", city = "Kolkata"),
  customer_info(name = "Sehwag", dob = "23-10-1981", address = "#23, Dwaraka", city = "Delhi"),
  customer_info(name = "Rahul", dob = "31-12-1971", address = "#41, Whitefield", city = "Bangalore")
))
// Turn the RDD into a DataFrame and expose it to SQL as the temp table "customer_info"
val customerDF = sqlContext.createDataFrame(details)
customerDF.registerTempTable("customer_info")
/**
 * Computes the MD5 message digest of a string, rendered as upper-case hexadecimal.
 *
 * The text is encoded as UTF-8 before hashing, so the same input gives the same hash on
 * every JVM regardless of its default charset. A null input yields null instead of a
 * NullPointerException, so a SQL NULL passed to the UDF stays NULL in the result.
 *
 * @param s text to hash; may be null
 * @return the 32-character upper-case hex MD5 digest of `s`, or null when `s` is null
 */
def dbms_crypto(s:String) : String = {
  import java.nio.charset.StandardCharsets

  // Option(...) maps null to None; orNull hands null back to the caller in that case.
  Option(s).map { text =>
    val digest = MessageDigest.getInstance("MD5")
    digest
      .digest(text.getBytes(StandardCharsets.UTF_8))
      // Mask to 0..255 so negative bytes are not sign-extended, then print two hex digits.
      .map(b => "%02X".format(b & 0xFF))
      .mkString
  }.orNull
}
// Make dbms_crypto callable from SQL text under the same name
sqlContext.udf.register("dbms_crypto", (s: String) => dbms_crypto(s))
// Hash the concatenation of name, dob and address for every customer row and print the result
sqlContext.sql(
  "select dbms_crypto(CONCAT(name,dob,address)) hash_key from customer_info"
).show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment