garystafford/sinusitis-emr-spark.py

## sinusitis-emr-spark.py
# Purpose: Process data for sinusitis study using either Amazon EMR and AWS Glue with PySpark
# Author:  Gary A. Stafford (January 2022)

from pyspark.sql import SparkSession

table_name = "sinusitis_emr_spark"

spark = SparkSession \
    .builder \
    .appName(table_name) \
    .config("hive.metastore.client.factory.class",
            "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory") \
    .enableHiveSupport() \
    .getOrCreate()

spark.sql("USE synthea_patient_big_data;")

sql_query_data = """
    SELECT DISTINCT
        patient,
        code,
        description,
        datediff(
            date(substr(start, 1, 10)),
            date(substr(birthdate, 1, 10))
        ) as condition_age,
        marital,
        race,
        ethnicity,
        gender
    FROM conditions as c, patients as p
    WHERE c.patient = p.id
        AND gender <> ''
        AND ethnicity <> ''
        AND race <> ''
        AND marital <> ''
        AND description LIKE '%sinusitis%';
"""

df_data = spark.sql(sql_query_data)

df_data \
    .coalesce(1) \
    .write \
    .bucketBy(1, "patient") \
    .sortBy("patient", "code") \
    .mode("overwrite") \
    .format("parquet") \
    .option("path", f"s3://databrew-demo-111222333444-us-east-1/{table_name}/") \
    .saveAsTable(name=table_name)

# update glue table
spark.sql(f"ALTER TABLE {table_name} SET TBLPROPERTIES ('classification'='parquet');")
	# Purpose: Process data for sinusitis study using either Amazon EMR and AWS Glue with PySpark
	# Author: Gary A. Stafford (January 2022)

	from pyspark.sql import SparkSession

	table_name = "sinusitis_emr_spark"

	spark = SparkSession \
	.builder \
	.appName(table_name) \
	.config("hive.metastore.client.factory.class",
	"com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory") \
	.enableHiveSupport() \
	.getOrCreate()

	spark.sql("USE synthea_patient_big_data;")

	sql_query_data = """
	SELECT DISTINCT
	patient,
	code,
	description,
	datediff(
	date(substr(start, 1, 10)),
	date(substr(birthdate, 1, 10))
	) as condition_age,
	marital,
	race,
	ethnicity,
	gender
	FROM conditions as c, patients as p
	WHERE c.patient = p.id
	AND gender <> ''
	AND ethnicity <> ''
	AND race <> ''
	AND marital <> ''
	AND description LIKE '%sinusitis%';
	"""

	df_data = spark.sql(sql_query_data)

	df_data \
	.coalesce(1) \
	.write \
	.bucketBy(1, "patient") \
	.sortBy("patient", "code") \
	.mode("overwrite") \
	.format("parquet") \
	.option("path", f"s3://databrew-demo-111222333444-us-east-1/{table_name}/") \
	.saveAsTable(name=table_name)

	# update glue table
	spark.sql(f"ALTER TABLE {table_name} SET TBLPROPERTIES ('classification'='parquet');")