package main.spark.demo

// Purpose: Process observations dataset using Spark on Amazon EMR with Scala
// Author: Gary A. Stafford
// Date: 2022-03-06

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}

object Observations {

  def main(args: Array[String]): Unit = {
    val (spark: SparkSession, sc: SparkContext) = createSession
    performELT(spark, sc)
  }

  // Create a SparkSession that uses the AWS Glue Data Catalog as its Hive
  // metastore and enables dynamic partitioning for the partitioned write below
  private def createSession = {
    val spark: SparkSession = SparkSession.builder
      .appName("Observations ELT App")
      .config("hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
      .config("hive.exec.dynamic.partition", "true")
      .config("hive.exec.dynamic.partition.mode", "nonstrict")
      .config("hive.exec.max.dynamic.partitions", "10000")
      .config("hive.exec.max.dynamic.partitions.pernode", "10000")
      .enableHiveSupport()
      .getOrCreate()

    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("INFO")

    (spark, sc)
  }

  // Select distinct observations and write them back to the data lake as
  // partitioned, bucketed Parquet registered in the Glue Data Catalog
  private def performELT(spark: SparkSession, sc: SparkContext) = {
    // Target table name and S3 bucket are passed in as Spark config values
    val tableName: String = sc.getConf.get("spark.executorEnv.TABLE_NAME")
    val dataLakeBucket: String = sc.getConf.get("spark.executorEnv.DATA_LAKE_BUCKET")

    spark.sql("USE synthea_patient_big_data;")

    val sql_query_data: String =
      """
        SELECT DISTINCT
            patient,
            encounter,
            code,
            description,
            value,
            units,
            year(date) as year,
            month(date) as month,
            day(date) as day
        FROM observations
        WHERE date <> 'date';
      """

    val observationsDF: DataFrame = spark
      .sql(sql_query_data)

    observationsDF
      .coalesce(1)
      .write
      .partitionBy("year", "month", "day")
      .bucketBy(1, "patient")
      .sortBy("patient")
      .mode("overwrite")
      .format("parquet")
      .option("path", s"s3://${dataLakeBucket}/${tableName}/")
      .saveAsTable(tableName = tableName)

    spark.sql(s"ALTER TABLE ${tableName} SET TBLPROPERTIES ('classification'='parquet');")
  }
}
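The job reads its target table name and data lake bucket from the spark.executorEnv.TABLE_NAME and spark.executorEnv.DATA_LAKE_BUCKET configuration values. Below is a minimal sketch of how the job might be submitted on an EMR cluster; the fully qualified class name comes from the package and object above, while the jar path, bucket, and table values are placeholders, not values from the original gist.

# hypothetical spark-submit invocation; replace the --conf values and jar path with your own
spark-submit \
  --class main.spark.demo.Observations \
  --conf spark.executorEnv.TABLE_NAME=observations \
  --conf spark.executorEnv.DATA_LAKE_BUCKET=my-data-lake-bucket \
  s3://my-artifacts-bucket/observations-elt-assembly.jar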