Last active
May 9, 2017 12:47
-
-
Save ehabqadah/b9a3f7f0fa1478e4252e1c8cae42a31d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package iais.spark_dataset; | |
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;
/** | |
* | |
* @author ehabqadah | |
* | |
*/ | |
public class App { | |
private static final String DATA_UCI_ADULTSET100_CSV = "data/uci_adultset100.csv"; | |
public static void main(String[] args) { | |
SparkSession sparkSession = getSparkSession(); | |
// Load the data set in DataFrame | |
Dataset<Row> df = sparkSession.read().options(getCsvOptions()) | |
.csv(DATA_UCI_ADULTSET100_CSV); | |
df.show(); | |
// Print the schema of the loaded DataFrame | |
df.printSchema(); | |
// Register the DataFrame as a temporary view | |
df.createOrReplaceTempView("uci_adultset"); | |
// Register the IntegerToStringUDF under the 'definedFunction' name | |
sparkSession.udf().register("definedFunction", | |
new IntegerToStringUDF(), DataTypes.StringType); | |
Dataset<Row> updatedDF = sparkSession | |
.sql("select definedFunction(_c0) as _c0,_c1,_c2,_c3 from uci_adultset"); | |
// Show the result | |
updatedDF.show(); | |
} | |
private static SparkSession getSparkSession() { | |
Logger.getLogger("org").setLevel(Level.OFF); | |
Logger.getLogger("akka").setLevel(Level.OFF); | |
Logger.getRootLogger().setLevel(Level.OFF); | |
SparkSession sparkSession = SparkSession.builder() | |
.appName("Spark DataSet API") | |
.config("spark.master", "local[*]").getOrCreate(); | |
return sparkSession; | |
} | |
private static Map<String, String> getCsvOptions() { | |
final Map<String, String> options = new HashMap<String, String>(); | |
options.put("delimiter", ";"); | |
options.put("inferSchema", "true"); | |
return options; | |
} | |
/** | |
* Custom UDF | |
* | |
* @author ehabqadah | |
* | |
*/ | |
public static class IntegerToStringUDF implements UDF1<Integer, String> { | |
@Override | |
public String call(Integer oldValue) throws Exception { | |
// compute the new value | |
String oldValueStr = oldValue.toString(); | |
String hashtext = md5Hash(oldValueStr); | |
return oldValueStr.substring(0, 1) + "_" + hashtext; | |
} | |
private String md5Hash(String oldValue) throws NoSuchAlgorithmException { | |
MessageDigest md = MessageDigest.getInstance("MD5"); | |
byte[] digest = md.digest(oldValue.getBytes()); | |
BigInteger bigInt = new BigInteger(1, digest); | |
String hashtext = bigInt.toString(16); | |
return hashtext; | |
} | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties | |
//////////////// | |
///// original data set | |
//////////////// | |
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
|_c0| _c1| _c2| _c3|_c4| _c5| _c6| _c7| _c8| _c9|_c10|_c11|_c12| _c13| _c14| | |
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
| 17| Private| 79682| 10th| 6| Never-married| Priv-house-serv|Other-relative| White| Male| 0| 0| 30|United-States|30000| | |
| 17|Self-emp-not-inc|368700| 11th| 7| Never-married| Farming-fishing| Own-child| White| Male| 0| 0| 10|United-States|40000| | |
| 17| Private|197850| 11th| 7| Never-married| Adm-clerical| Own-child|Asian-Pac-Islander|Female| 0| 0| 24|United-States|40000| | |
| 21| Private|161415|Some-college| 10| Never-married| Craft-repair| Not-in-family| White| Male| 0| 0| 50|United-States|50000| | |
| 21| Private|197050|Some-college| 10| Never-married| Other-service| Own-child| White|Female| 0| 0| 35|United-States|50000| | |
| 21| State-gov|142766|Some-college| 10| Never-married| Exec-managerial| Own-child| White| Male| 0| 0| 40|United-States|50000| | |
| 21| Private|145119|Some-college| 10| Never-married| Other-service| Own-child|Asian-Pac-Islander| Male| 0| 0| 20|United-States|50000| | |
| 21| Private|211391|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 27|United-States|50000| | |
| 22| Private|214731| 10th| 6|Married-civ-spouse|Machine-op-inspct| Wife| White|Female| 0| 0| 40|United-States|30000| | |
| 22| Private|214134| 10th| 6| Never-married| Transport-moving| Not-in-family|Amer-Indian-Eskimo| Male| 0| 0| 84|United-States|30000| | |
| 22| Private|160120| 10th| 6| Never-married| Transport-moving| Own-child|Asian-Pac-Islander| Male| 0| 0| 30|United-States|30001| | |
| 22| Private|217961|Some-college| 10| Never-married| Transport-moving| Own-child| White| Male| 0|1719| 30|United-States|50000| | |
| 23| Private|103064| Bachelors| 13| Never-married| Tech-support| Not-in-family| White|Female| 0| 0| 40|United-States|50000| | |
| 23| Private|181820|Some-college| 10| Never-married|Handlers-cleaners| Own-child| White| Male| 0| 0| 20|United-States|50000| | |
| 23| Private|288771|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 30|United-States|50000| | |
| 23| Private| 38707|Some-college| 10|Married-civ-spouse| Transport-moving| Husband| White| Male| 0| 0| 60|United-States|60000| | |
| 24| Local-gov|289886| 11th| 7| Never-married| Other-service| Not-in-family|Asian-Pac-Islander| Male| 0| 0| 45|United-States|40001| | |
| 24| Private|265567| 11th| 7| Never-married| Craft-repair| Own-child| White| Male| 0| 0| 35|United-States|40002| | |
| 27| Private|156516|Some-college| 10|Married-civ-spouse| Adm-clerical| Wife| Black|Female| 0|2377| 20|United-States|50000| | |
| 27| Private|158647|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 40|United-States|50000| | |
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+ | |
only showing top 20 rows | |
/////////////////////// | |
// schema ///////////// | |
/////////////////////// | |
root | |
|-- _c0: integer (nullable = true) | |
|-- _c1: string (nullable = true) | |
|-- _c2: integer (nullable = true) | |
|-- _c3: string (nullable = true) | |
|-- _c4: integer (nullable = true) | |
|-- _c5: string (nullable = true) | |
|-- _c6: string (nullable = true) | |
|-- _c7: string (nullable = true) | |
|-- _c8: string (nullable = true) | |
|-- _c9: string (nullable = true) | |
|-- _c10: integer (nullable = true) | |
|-- _c11: integer (nullable = true) | |
|-- _c12: integer (nullable = true) | |
|-- _c13: string (nullable = true) | |
|-- _c14: integer (nullable = true) | |
/////// | |
///// updated DataFrame | |
////////////////// | |
+--------------------+----------------+------+------------+ | |
| _c0| _c1| _c2| _c3| | |
+--------------------+----------------+------+------------+ | |
|1_70efdf2ec9b0860...| Private| 79682| 10th| | |
|1_70efdf2ec9b0860...|Self-emp-not-inc|368700| 11th| | |
|1_70efdf2ec9b0860...| Private|197850| 11th| | |
|2_3c59dc048e88502...| Private|161415|Some-college| | |
|2_3c59dc048e88502...| Private|197050|Some-college| | |
|2_3c59dc048e88502...| State-gov|142766|Some-college| | |
|2_3c59dc048e88502...| Private|145119|Some-college| | |
|2_3c59dc048e88502...| Private|211391|Some-college| | |
|2_b6d767d2f8ed5d2...| Private|214731| 10th| | |
|2_b6d767d2f8ed5d2...| Private|214134| 10th| | |
|2_b6d767d2f8ed5d2...| Private|160120| 10th| | |
|2_b6d767d2f8ed5d2...| Private|217961|Some-college| | |
|2_37693cfc748049e...| Private|103064| Bachelors| | |
|2_37693cfc748049e...| Private|181820|Some-college| | |
|2_37693cfc748049e...| Private|288771|Some-college| | |
|2_37693cfc748049e...| Private| 38707|Some-college| | |
|2_1ff1de774005f8d...| Local-gov|289886| 11th| | |
|2_1ff1de774005f8d...| Private|265567| 11th| | |
|2_2e74f10e0327ad8...| Private|156516|Some-college| | |
|2_2e74f10e0327ad8...| Private|158647|Some-college| | |
+--------------------+----------------+------+------------+ | |
only showing top 20 rows | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment