An AWS Glue script for remote debugging example
# For https://support.wharton.upenn.edu/help/glue-debugging
import datetime
import sys
import uuid

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Make the pydevd egg (shipped with PyCharm Professional) importable, then
# connect back to the IDE's debug server.
sys.path.append("pydevd-pycharm.egg")
import pydevd_pycharm
pydevd_pycharm.settrace('127.0.0.1', port=12345, stdoutToServer=True, stderrToServer=True)
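# NOTE (assumption, per the guide linked above): a PyCharm "Python Debug
# Server" run configuration must already be listening on 127.0.0.1:12345
# before this script starts, otherwise settrace() will fail to connect.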
# Timestamp string for naming runs, e.g. "2021-01-06__18_56_00_"
# (fixed: %m is month; the original used %M, which is minutes).
NOW_STR = f"{datetime.datetime.now():%Y-%m-%d__%H_%M_%S_}"
# Need a unique directory for each run: Spark's default save mode errors out
# if the output path already exists, so datasets can't simply be overwritten.
path_ext = str(uuid.uuid4())
SOURCE_ROOT = "file:///PATH/TO/PROJECT/input"
OUTPUT_ROOT = "file:///PATH/TO/PROJECT/output/"
s3_output_path = OUTPUT_ROOT + path_ext + "/"
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
schema = StructType([
    StructField("YEAR", IntegerType(), False),
    StructField("GoogleKnowlege_Occupation", StringType(), False),
    StructField("Show", StringType(), True),
    StructField("Group", StringType(), True),
    StructField("Raw_Guest_List", StringType(), True),
])
# .csv() selects the built-in CSV source, so the legacy
# format("com.databricks.spark.csv") call was redundant and has been dropped.
df = spark.read \
    .option("header", "true") \
    .option("quote", '"') \
    .option("encoding", "UTF-8") \
    .option("multiLine", "true") \
    .option("escape", "\"") \
    .option("columnNameOfCorruptRecord", "_corrupt_column") \
    .option("mode", "PERMISSIVE") \
    .schema(schema) \
    .csv(SOURCE_ROOT)
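# NOTE: in PERMISSIVE mode Spark only populates the corrupt-record column if
# the user-defined schema contains a StringType field with that exact name
# ("_corrupt_column" here). The schema above has no such field, so malformed
# values simply become nulls.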
df.printSchema()
df.write \
    .parquet(s3_output_path)
# Partitioning not working on limited Mac tests:
# df.write \
#     .partitionBy("YEAR", "Group") \
#     .parquet(s3_output_path)
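
# Glue jobs conventionally end with a commit so job bookmarks advance;
# it is harmless when running locally.
job.commit()

# Example local invocation (hypothetical file name; assumes aws-glue-libs is
# on the classpath, and note Glue's argument parser expects --JOB_NAME):
#   spark-submit glue_debug_example.py --JOB_NAME local_debug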