Last active
March 8, 2022 02:31
-
-
Save garystafford/179aca5a7c47962305c4413c1ec4f3f6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Purpose: Process encounters dataset using either Amazon EMR or AWS Glue with PySpark
# Author: Gary A. Stafford (January 2022)
from pyspark.sql import SparkSession

# Name used both as the Spark application name and as the output table name.
table_name = "encounter_emr_spark"

# Build (or reuse) a Hive-enabled Spark session. The metastore client factory
# is set to the AWS Glue Data Catalog implementation, dynamic partitioning is
# enabled in non-strict mode, and both dynamic-partition limits are set to
# 10000.
spark = (
    SparkSession.builder
    .appName(table_name)
    .config(
        "hive.metastore.client.factory.class",
        "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory",
    )
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("hive.exec.max.dynamic.partitions", "10000")
    .config("hive.exec.max.dynamic.partitions.pernode", "10000")
    .enableHiveSupport()
    .getOrCreate()
)
# Switch the session's current database so the query below can refer to the
# `encounters` table without a database qualifier.
spark.sql("USE synthea_patient_big_data;")

# Distinct 'Encounter for symptom' rows, with the encounter date broken out
# into separate year / month / day columns (used later as partition columns).
sql_query_data = """
SELECT DISTINCT
id,
patient,
code,
description,
reasoncode,
reasondescription,
year(date) as year,
month(date) as month,
day(date) as day
FROM encounters
WHERE description='Encounter for symptom';
"""

# spark.sql returns a DataFrame; nothing is written until the save step.
df_data = spark.sql(sql_query_data)
# Persist the query result as a Parquet table registered under `table_name`,
# stored at an explicit S3 path:
#   - coalesce(1) reduces the DataFrame to a single partition before writing
#   - output is partitioned by year/month/day
#   - rows are placed in one bucket on `patient` and sorted by `patient`
#   - mode("overwrite") replaces any existing data for the table
(
    df_data
    .coalesce(1)
    .write
    .partitionBy("year", "month", "day")
    .bucketBy(1, "patient")
    .sortBy("patient")
    .mode("overwrite")
    .format("parquet")
    .option("path", f"s3://databrew-demo-111222333444-us-east-1/{table_name}/")
    .saveAsTable(name=table_name)
)
# Update the Glue table: set its 'classification' table property to 'parquet'.
spark.sql(f"ALTER TABLE {table_name} SET TBLPROPERTIES ('classification'='parquet');")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment