Skip to content

Instantly share code, notes, and snippets.

Last active January 6, 2021 18:56
Show Gist options
  • Save douglascodes/0c9cea7a50f614dca659c25710f29e34 to your computer and use it in GitHub Desktop.
Save douglascodes/0c9cea7a50f614dca659c25710f29e34 to your computer and use it in GitHub Desktop.
An AWS Glue script: remote debugging example
# For remote debugging of an AWS Glue job through pydevd_pycharm (see settrace below)
import uuid
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, IntegerType, StructField, StructType
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
import sys
from pyspark.context import SparkContext
import datetime
import sys
import pydevd_pycharm
# Connect to the PyCharm remote debug server before any job code runs, so
# breakpoints set in the rest of this script can be hit.
# NOTE(review): the host argument is an empty string here; it presumably has
# to be the address of the machine running the debug server (e.g. 'localhost')
# -- confirm before running.
pydevd_pycharm.settrace('', port=12345, stdoutToServer=True, stderrToServer=True)

# Timestamp taken once at start-up.
# NOTE(review): the original replacement field was empty (`f"{}"`), which is a
# SyntaxError. The current time is assumed from the constant's name and the
# `datetime` import -- confirm the intended format.
NOW_STR = f"{datetime.datetime.now().isoformat()}"

# Need a unique directory for each run. S3 can't overwrite datasets.
path_ext = str(uuid.uuid4())
OUTPUT_ROOT = "file:///PATH/TO/PROJECT/output/"
s3_output_path = OUTPUT_ROOT + path_ext + "/"

## @params: [JOB_NAME]
# Resolve the job arguments from the command line, then build the Glue context
# on top of a fresh SparkContext and initialise the job with those arguments.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Explicit schema applied to the CSV read further down: one integer column
# followed by four string columns. The third argument of each StructField is
# its `nullable` flag, so YEAR and GoogleKnowlege_Occupation are declared
# non-nullable.
# The spelling "GoogleKnowlege_Occupation" is kept exactly as written;
# presumably it mirrors the header of the input data -- verify before renaming.
schema = StructType([
    StructField("YEAR", IntegerType(), False),
    StructField("GoogleKnowlege_Occupation", StringType(), False),
    StructField("Show", StringType(), True),
    StructField("Group", StringType(), True),
    StructField("Raw_Guest_List", StringType(), True),
])
# Location of the input CSV.
# TODO: the original input path was lost from this script; this value is a
# placeholder in the same style as OUTPUT_ROOT and must be replaced with the
# real location before running.
input_path = "file:///PATH/TO/PROJECT/input/"

# Read the CSV with the explicit `schema` instead of inferring one.
# - header/quote/escape/multiLine: the file has a header row, and quoted
#   fields may contain escaped quotes and embedded newlines.
# - mode=PERMISSIVE: malformed rows do not abort the read.
# NOTE(review): per the Spark CSV documentation the corrupt-record column is
# only populated when the schema contains a string field with that name;
# `schema` has no "_corrupt_column" field -- confirm whether one is wanted.
df = (
    spark.read.format("com.databricks.spark.csv")
    .option("header", "true")
    .option('quote', '"')
    .option("encoding", "UTF-8")
    .option("multiLine", "true")
    .option("escape", "\"")
    .option("columnNameOfCorruptRecord", "_corrupt_column")
    .option("mode", "PERMISSIVE")
    .schema(schema)
    .load(input_path)
)

# Write the frame as Parquet into this run's unique output directory.
df.write.parquet(s3_output_path)

# Partitioning not working on limited mac tests
# df.write \
# .partitionBy("YEAR", "Group") \
# .parquet(s3_output_path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment