garystafford/Observations.scala

## Observations.scala
package main.spark.demo

// Purpose: Process observations dataset using Spark on Amazon EMR with Scala
// Author:  Gary A. Stafford
// Date: 2022-03-06

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}

/** Spark job that rewrites the `observations` table as partitioned, bucketed Parquet.
  *
  * The job reads two values from the Spark configuration:
  *  - `spark.executorEnv.TABLE_NAME`: name of the table to create
  *  - `spark.executorEnv.DATA_LAKE_BUCKET`: S3 bucket that receives the Parquet files
  */
object Observations {

  /** Entry point: builds the session, runs the ELT step and always stops the session.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val (spark, sc) = createSession()

    // Stop the session even when the ELT step throws, so the application
    // releases its resources instead of relying on JVM shutdown.
    try performELT(spark, sc)
    finally spark.stop()
  }

  /** Creates (or reuses) a Hive-enabled session configured for dynamic partition writes.
    *
    * The Hive metastore client factory is set to the AWS Glue Data Catalog implementation.
    *
    * @return the session together with its underlying context
    */
  private def createSession(): (SparkSession, SparkContext) = {
    val spark: SparkSession = SparkSession.builder
      .appName("Observations ELT App")
      .config("hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
      .config("hive.exec.dynamic.partition", "true")
      .config("hive.exec.dynamic.partition.mode", "nonstrict")
      .config("hive.exec.max.dynamic.partitions", "10000")
      .config("hive.exec.max.dynamic.partitions.pernode", "10000")
      .enableHiveSupport()
      .getOrCreate()

    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("INFO")

    (spark, sc)
  }

  /** Looks up a mandatory Spark configuration value.
    *
    * @param sc  context whose configuration is consulted
    * @param key configuration key to read
    * @return the configured value
    * @throws NoSuchElementException if the key is not set; the message names the key
    *                                and shows how to supply it
    */
  private def requiredConf(sc: SparkContext, key: String): String =
    sc.getConf.getOption(key).getOrElse(
      throw new NoSuchElementException(
        s"Missing required Spark configuration '$key'; supply it with --conf $key=<value>")
    )

  /** Selects the distinct observations and saves them as a Parquet table on S3.
    *
    * The output is partitioned by year/month/day and bucketed and sorted by patient.
    * Existing data for the table is overwritten.
    *
    * @param spark session used to run the SQL statements
    * @param sc    context whose configuration supplies the table name and bucket
    * @throws NoSuchElementException if a required configuration key is missing
    */
  private def performELT(spark: SparkSession, sc: SparkContext): Unit = {
    val tableName: String = requiredConf(sc, "spark.executorEnv.TABLE_NAME")
    val dataLakeBucket: String = requiredConf(sc, "spark.executorEnv.DATA_LAKE_BUCKET")

    spark.sql("USE synthea_patient_big_data;")

    // The predicate drops rows whose date column holds the literal text 'date'
    // (presumably a CSV header row that was loaded as data - confirm against the source table).
    val selectObservations: String =
      """
        SELECT DISTINCT
            patient,
            encounter,
            code,
            description,
            value,
            units,
            year(date) as year,
            month(date) as month,
            day(date) as day
        FROM observations
        WHERE date <> 'date';
      """

    val observationsDF: DataFrame = spark.sql(selectObservations)

    observationsDF
      .coalesce(1) // single partition: the whole write runs in one task
      .write
      .partitionBy("year", "month", "day")
      .bucketBy(1, "patient")
      .sortBy("patient")
      .mode("overwrite")
      .format("parquet")
      .option("path", s"s3://${dataLakeBucket}/${tableName}/")
      .saveAsTable(tableName)

    // Tag the catalog entry with its storage format
    // (presumably consumed by AWS Glue tooling - confirm).
    spark.sql(s"ALTER TABLE ${tableName} SET TBLPROPERTIES ('classification'='parquet');")
  }
}
	package main.spark.demo

	// Purpose: Process observations dataset using Spark on Amazon EMR with Scala
	// Author: Gary A. Stafford
	// Date: 2022-03-06

	import org.apache.spark.SparkContext
	import org.apache.spark.sql.{DataFrame, SparkSession}

	/** Spark job that rewrites the `observations` table as partitioned, bucketed Parquet.
	  *
	  * The job reads two values from the Spark configuration:
	  *  - `spark.executorEnv.TABLE_NAME`: name of the table to create
	  *  - `spark.executorEnv.DATA_LAKE_BUCKET`: S3 bucket that receives the Parquet files
	  */
	object Observations {

	  /** Entry point: builds the session, runs the ELT step and always stops the session.
	    *
	    * @param args command-line arguments (unused)
	    */
	  def main(args: Array[String]): Unit = {
	    val (spark, sc) = createSession()

	    // Stop the session even when the ELT step throws, so the application
	    // releases its resources instead of relying on JVM shutdown.
	    try performELT(spark, sc)
	    finally spark.stop()
	  }

	  /** Creates (or reuses) a Hive-enabled session configured for dynamic partition writes.
	    *
	    * The Hive metastore client factory is set to the AWS Glue Data Catalog implementation.
	    *
	    * @return the session together with its underlying context
	    */
	  private def createSession(): (SparkSession, SparkContext) = {
	    val spark: SparkSession = SparkSession.builder
	      .appName("Observations ELT App")
	      .config("hive.metastore.client.factory.class",
	        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory")
	      .config("hive.exec.dynamic.partition", "true")
	      .config("hive.exec.dynamic.partition.mode", "nonstrict")
	      .config("hive.exec.max.dynamic.partitions", "10000")
	      .config("hive.exec.max.dynamic.partitions.pernode", "10000")
	      .enableHiveSupport()
	      .getOrCreate()

	    val sc: SparkContext = spark.sparkContext
	    sc.setLogLevel("INFO")

	    (spark, sc)
	  }

	  /** Looks up a mandatory Spark configuration value.
	    *
	    * @param sc  context whose configuration is consulted
	    * @param key configuration key to read
	    * @return the configured value
	    * @throws NoSuchElementException if the key is not set; the message names the key
	    *                                and shows how to supply it
	    */
	  private def requiredConf(sc: SparkContext, key: String): String =
	    sc.getConf.getOption(key).getOrElse(
	      throw new NoSuchElementException(
	        s"Missing required Spark configuration '$key'; supply it with --conf $key=<value>")
	    )

	  /** Selects the distinct observations and saves them as a Parquet table on S3.
	    *
	    * The output is partitioned by year/month/day and bucketed and sorted by patient.
	    * Existing data for the table is overwritten.
	    *
	    * @param spark session used to run the SQL statements
	    * @param sc    context whose configuration supplies the table name and bucket
	    * @throws NoSuchElementException if a required configuration key is missing
	    */
	  private def performELT(spark: SparkSession, sc: SparkContext): Unit = {
	    val tableName: String = requiredConf(sc, "spark.executorEnv.TABLE_NAME")
	    val dataLakeBucket: String = requiredConf(sc, "spark.executorEnv.DATA_LAKE_BUCKET")

	    spark.sql("USE synthea_patient_big_data;")

	    // The predicate drops rows whose date column holds the literal text 'date'
	    // (presumably a CSV header row that was loaded as data - confirm against the source table).
	    val selectObservations: String =
	      """
	        SELECT DISTINCT
	            patient,
	            encounter,
	            code,
	            description,
	            value,
	            units,
	            year(date) as year,
	            month(date) as month,
	            day(date) as day
	        FROM observations
	        WHERE date <> 'date';
	      """

	    val observationsDF: DataFrame = spark.sql(selectObservations)

	    observationsDF
	      .coalesce(1) // single partition: the whole write runs in one task
	      .write
	      .partitionBy("year", "month", "day")
	      .bucketBy(1, "patient")
	      .sortBy("patient")
	      .mode("overwrite")
	      .format("parquet")
	      .option("path", s"s3://${dataLakeBucket}/${tableName}/")
	      .saveAsTable(tableName)

	    // Tag the catalog entry with its storage format
	    // (presumably consumed by AWS Glue tooling - confirm).
	    spark.sql(s"ALTER TABLE ${tableName} SET TBLPROPERTIES ('classification'='parquet');")
	  }
	}