Last active
March 8, 2022 02:31
-
-
Save garystafford/a560b17247fbd80d0dc2d88524dbfc04 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Purpose: Process data for sinusitis study using either Amazon EMR and AWS Glue with PySpark | |
# Author: Gary A. Stafford (January 2022) | |
from pyspark.sql import SparkSession | |
table_name = "sinusitis_emr_spark" | |
spark = SparkSession \ | |
.builder \ | |
.appName(table_name) \ | |
.config("hive.metastore.client.factory.class", | |
"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory") \ | |
.enableHiveSupport() \ | |
.getOrCreate() | |
spark.sql("USE synthea_patient_big_data;") | |
sql_query_data = """ | |
SELECT DISTINCT | |
patient, | |
code, | |
description, | |
datediff( | |
date(substr(start, 1, 10)), | |
date(substr(birthdate, 1, 10)) | |
) as condition_age, | |
marital, | |
race, | |
ethnicity, | |
gender | |
FROM conditions as c, patients as p | |
WHERE c.patient = p.id | |
AND gender <> '' | |
AND ethnicity <> '' | |
AND race <> '' | |
AND marital <> '' | |
AND description LIKE '%sinusitis%'; | |
""" | |
df_data = spark.sql(sql_query_data) | |
df_data \ | |
.coalesce(1) \ | |
.write \ | |
.bucketBy(1, "patient") \ | |
.sortBy("patient", "code") \ | |
.mode("overwrite") \ | |
.format("parquet") \ | |
.option("path", f"s3://databrew-demo-111222333444-us-east-1/{table_name}/") \ | |
.saveAsTable(name=table_name) | |
# update glue table | |
spark.sql(f"ALTER TABLE {table_name} SET TBLPROPERTIES ('classification'='parquet');") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment