# garystafford/encounter_emr_spark.py

## encounter_emr_spark.py
# Purpose: Process encounters dataset using Amazon EMR and AWS Glue with PySpark
# Author:  Gary A. Stafford (January 2022)

from pyspark.sql import SparkSession

# Name used for the Spark application, the catalog table, and the S3 prefix.
table_name = "encounter_emr_spark"

# Hive-related session settings: use the AWS Glue Data Catalog client factory
# as the metastore client, and enable dynamic partitioning in "nonstrict"
# mode with both dynamic-partition limits set to 10000.
_hive_settings = {
    "hive.metastore.client.factory.class":
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
    "hive.exec.dynamic.partition": "true",
    "hive.exec.dynamic.partition.mode": "nonstrict",
    "hive.exec.max.dynamic.partitions": "10000",
    "hive.exec.max.dynamic.partitions.pernode": "10000",
}

# Apply the settings one by one, in the order listed above.
_builder = SparkSession.builder.appName(table_name)
for _key, _value in _hive_settings.items():
    _builder = _builder.config(_key, _value)

spark = _builder.enableHiveSupport().getOrCreate()

# Select the database that the unqualified table names below refer to.
spark.sql("USE synthea_patient_big_data;")

# Distinct 'Encounter for symptom' rows, with the encounter date broken out
# into year/month/day columns (used as partition columns when writing).
sql_query_data = """
    SELECT DISTINCT
        id,
        patient,
        code,
        description,
        reasoncode,
        reasondescription,
        year(date) as year,
        month(date) as month,
        day(date) as day
    FROM encounters
    WHERE description='Encounter for symptom';
"""

df_data = spark.sql(sql_query_data)

# Destination prefix for the Parquet files backing the table.
_output_path = f"s3://databrew-demo-111222333444-us-east-1/{table_name}/"

# coalesce(1) reduces the result to a single partition before the write
# (presumably to keep the number of output files small -- TODO confirm).
_writer = df_data.coalesce(1).write

# Overwrite the table: partitioned by date parts, one bucket on "patient",
# sorted by "patient" within the bucket, stored as Parquet at _output_path.
(
    _writer
    .partitionBy("year", "month", "day")
    .bucketBy(1, "patient")
    .sortBy("patient")
    .mode("overwrite")
    .format("parquet")
    .option("path", _output_path)
    .saveAsTable(name=table_name)
)

# update glue table: tag the table with a 'classification' property of 'parquet'
_alter_statement = f"ALTER TABLE {table_name} SET TBLPROPERTIES ('classification'='parquet');"
spark.sql(_alter_statement)
	# Purpose: Process encounters dataset using Amazon EMR and AWS Glue with PySpark
	# Author: Gary A. Stafford (January 2022)

# NOTE(review): this section repeats the job defined earlier in this file
# (same session settings, query, and write target). It looks like a duplicated
# paste -- confirm whether the job is really meant to run a second time.
# The statements were previously indented by one tab at module level, which
# Python rejects as an unexpected indent; they now start at column 0.

from pyspark.sql import SparkSession

# Name used for the Spark application, the catalog table, and the S3 prefix.
table_name = "encounter_emr_spark"

# Hive-enabled session that uses the AWS Glue Data Catalog client factory as
# its metastore client, with dynamic partitioning enabled in "nonstrict" mode
# and both dynamic-partition limits set to 10000.
spark = (
    SparkSession.builder
    .appName(table_name)
    .config(
        "hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
    )
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("hive.exec.max.dynamic.partitions", "10000")
    .config("hive.exec.max.dynamic.partitions.pernode", "10000")
    .enableHiveSupport()
    .getOrCreate()
)

# Select the database that the unqualified table names below refer to.
spark.sql("USE synthea_patient_big_data;")

# Distinct 'Encounter for symptom' rows, with the encounter date broken out
# into year/month/day columns (used as partition columns when writing).
# The literal's whitespace is left exactly as it was.
sql_query_data = """
	SELECT DISTINCT
	id,
	patient,
	code,
	description,
	reasoncode,
	reasondescription,
	year(date) as year,
	month(date) as month,
	day(date) as day
	FROM encounters
	WHERE description='Encounter for symptom';
	"""

df_data = spark.sql(sql_query_data)

# Overwrite the table: a single input partition (coalesce(1)), partitioned by
# date parts, one bucket on "patient" sorted by "patient", stored as Parquet
# under the S3 prefix named after the table.
(
    df_data
    .coalesce(1)
    .write
    .partitionBy("year", "month", "day")
    .bucketBy(1, "patient")
    .sortBy("patient")
    .mode("overwrite")
    .format("parquet")
    .option("path", f"s3://databrew-demo-111222333444-us-east-1/{table_name}/")
    .saveAsTable(name=table_name)
)

# update glue table: tag the table with a 'classification' property of 'parquet'
spark.sql(f"ALTER TABLE {table_name} SET TBLPROPERTIES ('classification'='parquet');")