Skip to content

Instantly share code, notes, and snippets.

@ehabqadah
Last active May 9, 2017 12:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ehabqadah/b9a3f7f0fa1478e4252e1c8cae42a31d to your computer and use it in GitHub Desktop.
Save ehabqadah/b9a3f7f0fa1478e4252e1c8cae42a31d to your computer and use it in GitHub Desktop.
package iais.spark_dataset;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Map;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;
/**
 * Small Spark SQL demo: loads a semicolon-delimited CSV file into a DataFrame,
 * registers a custom UDF and uses it in a SQL query to rewrite the first column.
 *
 * @author ehabqadah
 *
 */
public class App {

    private static final String DATA_UCI_ADULTSET100_CSV = "data/uci_adultset100.csv";

    /**
     * Loads the CSV file, prints its rows and schema, then prints the first four
     * columns with {@code _c0} replaced by the output of {@link IntegerToStringUDF}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkSession sparkSession = getSparkSession();
        try {
            // Load the data set in DataFrame
            Dataset<Row> df = sparkSession.read().options(getCsvOptions())
                    .csv(DATA_UCI_ADULTSET100_CSV);
            df.show();
            // Print the schema of the loaded DataFrame
            df.printSchema();
            // Register the DataFrame as a temporary view so it can be queried with SQL
            df.createOrReplaceTempView("uci_adultset");
            // Register the IntegerToStringUDF under the 'definedFunction' name
            sparkSession.udf().register("definedFunction",
                    new IntegerToStringUDF(), DataTypes.StringType);
            Dataset<Row> updatedDF = sparkSession
                    .sql("select definedFunction(_c0) as _c0,_c1,_c2,_c3 from uci_adultset");
            // Show the result
            updatedDF.show();
        } finally {
            // Stop the session even when one of the steps above fails, so the
            // local Spark context does not outlive the job.
            sparkSession.stop();
        }
    }

    /**
     * Turns off log4j output for the "org" and "akka" loggers and the root logger,
     * then builds (or reuses) a session named "Spark DataSet API" with
     * {@code spark.master} set to {@code local[*]}.
     *
     * @return the Spark session
     */
    private static SparkSession getSparkSession() {
        Logger.getLogger("org").setLevel(Level.OFF);
        Logger.getLogger("akka").setLevel(Level.OFF);
        Logger.getRootLogger().setLevel(Level.OFF);
        return SparkSession.builder()
                .appName("Spark DataSet API")
                .config("spark.master", "local[*]")
                .getOrCreate();
    }

    /**
     * Builds the CSV reader options: {@code ;} as the field delimiter and
     * schema inference enabled.
     *
     * @return a new mutable map holding the reader options
     */
    private static Map<String, String> getCsvOptions() {
        final Map<String, String> options = new HashMap<String, String>();
        options.put("delimiter", ";");
        options.put("inferSchema", "true");
        return options;
    }

    /**
     * Custom UDF that maps an integer to
     * {@code <first character of its decimal form>_<MD5 hex digest of its decimal form>}.
     *
     * @author ehabqadah
     *
     */
    public static class IntegerToStringUDF implements UDF1<Integer, String> {

        private static final long serialVersionUID = 1L;

        /**
         * Computes the replacement value for one cell.
         *
         * @param oldValue the original integer value; may be {@code null}
         * @return the first character of the decimal form, an underscore and the
         *         32-character MD5 hex digest, or {@code null} when
         *         {@code oldValue} is {@code null}
         * @throws Exception if the MD5 algorithm is not available
         */
        @Override
        public String call(Integer oldValue) throws Exception {
            // The input column is nullable; pass null through rather than
            // failing the whole query with a NullPointerException.
            if (oldValue == null) {
                return null;
            }
            String oldValueStr = oldValue.toString();
            String hashtext = md5Hash(oldValueStr);
            // For a negative number the first character is the minus sign.
            return oldValueStr.substring(0, 1) + "_" + hashtext;
        }

        /**
         * Returns the MD5 digest of the given text as lower-case hexadecimal.
         *
         * @param oldValue the text to hash
         * @return the digest as exactly 32 hexadecimal characters
         * @throws NoSuchAlgorithmException if no MD5 implementation is available
         */
        private String md5Hash(String oldValue) throws NoSuchAlgorithmException {
            MessageDigest md = MessageDigest.getInstance("MD5");
            // Fixed charset so the digest does not depend on the JVM default.
            byte[] digest = md.digest(oldValue.getBytes(StandardCharsets.UTF_8));
            // BigInteger.toString(16) drops leading zeros; pad to the full
            // 128-bit width so every hash has the same length.
            return String.format("%032x", new BigInteger(1, digest));
        }
    }
}
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
////////////////
///// original data set
////////////////
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+
|_c0| _c1| _c2| _c3|_c4| _c5| _c6| _c7| _c8| _c9|_c10|_c11|_c12| _c13| _c14|
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+
| 17| Private| 79682| 10th| 6| Never-married| Priv-house-serv|Other-relative| White| Male| 0| 0| 30|United-States|30000|
| 17|Self-emp-not-inc|368700| 11th| 7| Never-married| Farming-fishing| Own-child| White| Male| 0| 0| 10|United-States|40000|
| 17| Private|197850| 11th| 7| Never-married| Adm-clerical| Own-child|Asian-Pac-Islander|Female| 0| 0| 24|United-States|40000|
| 21| Private|161415|Some-college| 10| Never-married| Craft-repair| Not-in-family| White| Male| 0| 0| 50|United-States|50000|
| 21| Private|197050|Some-college| 10| Never-married| Other-service| Own-child| White|Female| 0| 0| 35|United-States|50000|
| 21| State-gov|142766|Some-college| 10| Never-married| Exec-managerial| Own-child| White| Male| 0| 0| 40|United-States|50000|
| 21| Private|145119|Some-college| 10| Never-married| Other-service| Own-child|Asian-Pac-Islander| Male| 0| 0| 20|United-States|50000|
| 21| Private|211391|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 27|United-States|50000|
| 22| Private|214731| 10th| 6|Married-civ-spouse|Machine-op-inspct| Wife| White|Female| 0| 0| 40|United-States|30000|
| 22| Private|214134| 10th| 6| Never-married| Transport-moving| Not-in-family|Amer-Indian-Eskimo| Male| 0| 0| 84|United-States|30000|
| 22| Private|160120| 10th| 6| Never-married| Transport-moving| Own-child|Asian-Pac-Islander| Male| 0| 0| 30|United-States|30001|
| 22| Private|217961|Some-college| 10| Never-married| Transport-moving| Own-child| White| Male| 0|1719| 30|United-States|50000|
| 23| Private|103064| Bachelors| 13| Never-married| Tech-support| Not-in-family| White|Female| 0| 0| 40|United-States|50000|
| 23| Private|181820|Some-college| 10| Never-married|Handlers-cleaners| Own-child| White| Male| 0| 0| 20|United-States|50000|
| 23| Private|288771|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 30|United-States|50000|
| 23| Private| 38707|Some-college| 10|Married-civ-spouse| Transport-moving| Husband| White| Male| 0| 0| 60|United-States|60000|
| 24| Local-gov|289886| 11th| 7| Never-married| Other-service| Not-in-family|Asian-Pac-Islander| Male| 0| 0| 45|United-States|40001|
| 24| Private|265567| 11th| 7| Never-married| Craft-repair| Own-child| White| Male| 0| 0| 35|United-States|40002|
| 27| Private|156516|Some-college| 10|Married-civ-spouse| Adm-clerical| Wife| Black|Female| 0|2377| 20|United-States|50000|
| 27| Private|158647|Some-college| 10| Never-married| Adm-clerical| Not-in-family| White|Female| 0| 0| 40|United-States|50000|
+---+----------------+------+------------+---+------------------+-----------------+--------------+------------------+------+----+----+----+-------------+-----+
only showing top 20 rows
///////////////////////
// schema /////////////
///////////////////////
root
|-- _c0: integer (nullable = true)
|-- _c1: string (nullable = true)
|-- _c2: integer (nullable = true)
|-- _c3: string (nullable = true)
|-- _c4: integer (nullable = true)
|-- _c5: string (nullable = true)
|-- _c6: string (nullable = true)
|-- _c7: string (nullable = true)
|-- _c8: string (nullable = true)
|-- _c9: string (nullable = true)
|-- _c10: integer (nullable = true)
|-- _c11: integer (nullable = true)
|-- _c12: integer (nullable = true)
|-- _c13: string (nullable = true)
|-- _c14: integer (nullable = true)
///////
///// updated DataFrame
//////////////////
+--------------------+----------------+------+------------+
| _c0| _c1| _c2| _c3|
+--------------------+----------------+------+------------+
|1_70efdf2ec9b0860...| Private| 79682| 10th|
|1_70efdf2ec9b0860...|Self-emp-not-inc|368700| 11th|
|1_70efdf2ec9b0860...| Private|197850| 11th|
|2_3c59dc048e88502...| Private|161415|Some-college|
|2_3c59dc048e88502...| Private|197050|Some-college|
|2_3c59dc048e88502...| State-gov|142766|Some-college|
|2_3c59dc048e88502...| Private|145119|Some-college|
|2_3c59dc048e88502...| Private|211391|Some-college|
|2_b6d767d2f8ed5d2...| Private|214731| 10th|
|2_b6d767d2f8ed5d2...| Private|214134| 10th|
|2_b6d767d2f8ed5d2...| Private|160120| 10th|
|2_b6d767d2f8ed5d2...| Private|217961|Some-college|
|2_37693cfc748049e...| Private|103064| Bachelors|
|2_37693cfc748049e...| Private|181820|Some-college|
|2_37693cfc748049e...| Private|288771|Some-college|
|2_37693cfc748049e...| Private| 38707|Some-college|
|2_1ff1de774005f8d...| Local-gov|289886| 11th|
|2_1ff1de774005f8d...| Private|265567| 11th|
|2_2e74f10e0327ad8...| Private|156516|Some-college|
|2_2e74f10e0327ad8...| Private|158647|Some-college|
+--------------------+----------------+------+------------+
only showing top 20 rows
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment