Last active
May 10, 2017 11:26
-
-
Save ehabqadah/22dbb40324070a24b10f7d2f20714916 to your computer and use it in GitHub Desktop.
Update a column of a DataFrame in Spark, replacing the entire column with new values obtained by applying a function (e.g., an MD5 hash) to the old values.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package iais.spark_dataset; | |
import static org.apache.spark.sql.functions.*; | |
import java.math.BigInteger; | |
import java.security.MessageDigest; | |
import java.security.NoSuchAlgorithmException; | |
import java.util.HashMap; | |
import java.util.Map; | |
import org.apache.log4j.Level; | |
import org.apache.log4j.Logger; | |
import org.apache.spark.sql.Dataset; | |
import org.apache.spark.sql.Row; | |
import org.apache.spark.sql.SparkSession; | |
import org.apache.spark.sql.api.java.UDF1; | |
import org.apache.spark.sql.types.DataTypes; | |
/** | |
* | |
* @author ehabqadah | |
* | |
*/ | |
public class App { | |
private static final String DATA_UCI_ADULTSET100_CSV = "data/uci_adultset100.csv"; | |
public static void main(String[] args) { | |
SparkSession sparkSession = getSparkSession(); | |
// Load the data set in DataFrame | |
Dataset<Row> df = sparkSession.read().options(getCsvOptions()) | |
.csv(DATA_UCI_ADULTSET100_CSV); | |
df.show(); | |
// Print the schema of the loaded DataFrame | |
df.printSchema(); | |
// Register the DataFrame as a temporary view | |
df.createOrReplaceTempView("uci_adultset"); | |
// Register the IntegerToStringUDF under the 'definedFunction' name | |
sparkSession.udf().register("definedFunction", | |
new IntegerToStringUDF(), DataTypes.StringType); | |
// Update the '_c0' column | |
Dataset<Row> updatedDF = df.withColumn("_c0", | |
callUDF("definedFunction", col("_c0"))); | |
// Show the result | |
updatedDF.show(); | |
} | |
private static SparkSession getSparkSession() { | |
Logger.getLogger("org").setLevel(Level.OFF); | |
Logger.getLogger("akka").setLevel(Level.OFF); | |
Logger.getRootLogger().setLevel(Level.OFF); | |
SparkSession sparkSession = SparkSession.builder() | |
.appName("Spark DataSet API") | |
.config("spark.master", "local[*]").getOrCreate(); | |
return sparkSession; | |
} | |
private static Map<String, String> getCsvOptions() { | |
final Map<String, String> options = new HashMap<String, String>(); | |
options.put("delimiter", ";"); | |
options.put("inferSchema", "true"); | |
return options; | |
} | |
/** | |
* Custom UDF | |
* | |
* @author ehabqadah | |
* | |
*/ | |
public static class IntegerToStringUDF implements UDF1<Integer, String> { | |
@Override | |
public String call(Integer oldValue) throws Exception { | |
// compute the new value | |
String oldValueStr = oldValue.toString(); | |
String hashtext = md5Hash(oldValueStr); | |
return oldValueStr.substring(0, 1) + "_" + hashtext; | |
} | |
private String md5Hash(String oldValue) throws NoSuchAlgorithmException { | |
MessageDigest md = MessageDigest.getInstance("MD5"); | |
byte[] digest = md.digest(oldValue.getBytes()); | |
BigInteger bigInt = new BigInteger(1, digest); | |
String hashtext = bigInt.toString(16); | |
return hashtext; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties | |
/////////////////// | |
/// initial DataFrame | |
///////////// | |
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
|_c0| _c1| _c2| _c3|_c4| _c5| _c6| _c7| _c8| _c9|_c10|_c11|_c12| _c13| _c14| | |
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
| 17| Private| 79682| 10th| 6| Never-married| Priv-house-serv|Other-relative| White| Male| 0| 0| 30|United-States|30000| | |
| 17|Self-emp-not-inc|368700| 11th| 7| Never-married| Farming-fishing| Own-child| White| Male| 0| 0| 10|United-States|40000| | |
| 17| Private|197850| 11th| 7| Never-married| Adm-clerical| Own-child|Asian-Pac-Islander|Female| 0| 0| 24|United-States|40000| | |
| 21| Private|161415|Some-college| 10| Never-married| Craft-repair| Not-in-family| White| Male| 0| 0| 50|United-States|50000| | |
| 21| Private|197050|Some-college| 10| Never-married| Other-service| Own-child| White|Female| 0| 0| 35|United-States|50000| | |
| 21| State-gov|142766|Some-college| 10| Never-married| Exec-managerial| Own-child| White| Male| 0| 0| 40|United-States|50000| | |
| 21| Private|145119|Some-college| 10| Never-married| Other-service| Own-child|Asian-Pac-Islander| Male| 0| 0| 20|United-States|50000| | |
| 21| Private|211391|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 27|United-States|50000| | |
| 22| Private|214731| 10th| 6|Married-civ-spouse|Machine-op-inspct| Wife| White|Female| 0| 0| 40|United-States|30000| | |
| 22| Private|214134| 10th| 6| Never-married| Transport-moving| Not-in-family|Amer-Indian-Eskimo| Male| 0| 0| 84|United-States|30000| | |
| 22| Private|160120| 10th| 6| Never-married| Transport-moving| Own-child|Asian-Pac-Islander| Male| 0| 0| 30|United-States|30001| | |
| 22| Private|217961|Some-college| 10| Never-married| Transport-moving| Own-child| White| Male| 0|1719| 30|United-States|50000| | |
| 23| Private|103064| Bachelors| 13| Never-married| Tech-support| Not-in-family| White|Female| 0| 0| 40|United-States|50000| | |
| 23| Private|181820|Some-college| 10| Never-married|Handlers-cleaners| Own-child| White| Male| 0| 0| 20|United-States|50000| | |
| 23| Private|288771|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 30|United-States|50000| | |
| 23| Private| 38707|Some-college| 10|Married-civ-spouse| Transport-moving| Husband| White| Male| 0| 0| 60|United-States|60000| | |
| 24| Local-gov|289886| 11th| 7| Never-married| Other-service| Not-in-family|Asian-Pac-Islander| Male| 0| 0| 45|United-States|40001| | |
| 24| Private|265567| 11th| 7| Never-married| Craft-repair| Own-child| White| Male| 0| 0| 35|United-States|40002| | |
| 27| Private|156516|Some-college| 10|Married-civ-spouse| Adm-clerical| Wife| Black|Female| 0|2377| 20|United-States|50000| | |
| 27| Private|158647|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 40|United-States|50000| | |
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
only showing top 20 rows | |
//// | |
// DataFrame schema | |
root | |
|-- _c0: integer (nullable = true) | |
|-- _c1: string (nullable = true) | |
|-- _c2: integer (nullable = true) | |
|-- _c3: string (nullable = true) | |
|-- _c4: integer (nullable = true) | |
|-- _c5: string (nullable = true) | |
|-- _c6: string (nullable = true) | |
|-- _c7: string (nullable = true) | |
|-- _c8: string (nullable = true) | |
|-- _c9: string (nullable = true) | |
|-- _c10: integer (nullable = true) | |
|-- _c11: integer (nullable = true) | |
|-- _c12: integer (nullable = true) | |
|-- _c13: string (nullable = true) | |
|-- _c14: integer (nullable = true) | |
/////////////////////// | |
///////// Update the '_c0' | |
///////////////// | |
+--------------------+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
| _c0| _c1| _c2| _c3|_c4| _c5| _c6| _c7| _c8| _c9|_c10|_c11|_c12| _c13| _c14| | |
+--------------------+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
|1_70efdf2ec9b0860...| Private| 79682| 10th| 6| Never-married| Priv-house-serv|Other-relative| White| Male| 0| 0| 30|United-States|30000| | |
|1_70efdf2ec9b0860...|Self-emp-not-inc|368700| 11th| 7| Never-married| Farming-fishing| Own-child| White| Male| 0| 0| 10|United-States|40000| | |
|1_70efdf2ec9b0860...| Private|197850| 11th| 7| Never-married| Adm-clerical| Own-child|Asian-Pac-Islander|Female| 0| 0| 24|United-States|40000| | |
|2_3c59dc048e88502...| Private|161415|Some-college| 10| Never-married| Craft-repair| Not-in-family| White| Male| 0| 0| 50|United-States|50000| | |
|2_3c59dc048e88502...| Private|197050|Some-college| 10| Never-married| Other-service| Own-child| White|Female| 0| 0| 35|United-States|50000| | |
|2_3c59dc048e88502...| State-gov|142766|Some-college| 10| Never-married| Exec-managerial| Own-child| White| Male| 0| 0| 40|United-States|50000| | |
|2_3c59dc048e88502...| Private|145119|Some-college| 10| Never-married| Other-service| Own-child|Asian-Pac-Islander| Male| 0| 0| 20|United-States|50000| | |
|2_3c59dc048e88502...| Private|211391|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 27|United-States|50000| | |
|2_b6d767d2f8ed5d2...| Private|214731| 10th| 6|Married-civ-spouse|Machine-op-inspct| Wife| White|Female| 0| 0| 40|United-States|30000| | |
|2_b6d767d2f8ed5d2...| Private|214134| 10th| 6| Never-married| Transport-moving| Not-in-family|Amer-Indian-Eskimo| Male| 0| 0| 84|United-States|30000| | |
|2_b6d767d2f8ed5d2...| Private|160120| 10th| 6| Never-married| Transport-moving| Own-child|Asian-Pac-Islander| Male| 0| 0| 30|United-States|30001| | |
|2_b6d767d2f8ed5d2...| Private|217961|Some-college| 10| Never-married| Transport-moving| Own-child| White| Male| 0|1719| 30|United-States|50000| | |
|2_37693cfc748049e...| Private|103064| Bachelors| 13| Never-married| Tech-support| Not-in-family| White|Female| 0| 0| 40|United-States|50000| | |
|2_37693cfc748049e...| Private|181820|Some-college| 10| Never-married|Handlers-cleaners| Own-child| White| Male| 0| 0| 20|United-States|50000| | |
|2_37693cfc748049e...| Private|288771|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 30|United-States|50000| | |
|2_37693cfc748049e...| Private| 38707|Some-college| 10|Married-civ-spouse| Transport-moving| Husband| White| Male| 0| 0| 60|United-States|60000| | |
|2_1ff1de774005f8d...| Local-gov|289886| 11th| 7| Never-married| Other-service| Not-in-family|Asian-Pac-Islander| Male| 0| 0| 45|United-States|40001| | |
|2_1ff1de774005f8d...| Private|265567| 11th| 7| Never-married| Craft-repair| Own-child| White| Male| 0| 0| 35|United-States|40002| | |
|2_2e74f10e0327ad8...| Private|156516|Some-college| 10|Married-civ-spouse| Adm-clerical| Wife| Black|Female| 0|2377| 20|United-States|50000| | |
|2_2e74f10e0327ad8...| Private|158647|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 40|United-States|50000| | |
+--------------------+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
only showing top 20 rows | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment