Skip to content

Instantly share code, notes, and snippets.

@luizamboni
Last active July 27, 2018 21:56
Show Gist options
  • Save luizamboni/efe85ada85b9f51a87892f22297ad9d4 to your computer and use it in GitHub Desktop.
Save luizamboni/efe85ada85b9f51a87892f22297ad9d4 to your computer and use it in GitHub Desktop.
glue snippets
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# additional imports
from pyspark.sql.functions import sum
from awsglue.dynamicframe import DynamicFrame
# Glue job script: read the "clicks" catalog table, sum "cpc" per
# "account_id" into a "cost" column, and write the result to S3 as JSON.

# JOB_NAME is read from the command line; it is needed by job.init() below.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# getOrCreate() reuses a SparkContext that already exists in the session and
# creates one otherwise, so `sc` is always defined before GlueContext uses it.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# The job object must exist before job.commit() is called at the end of the
# script; previously it was never created and commit() raised NameError.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Load the source table from the catalog as a DynamicFrame.
ddf = glueContext.create_dynamic_frame.from_catalog(
    database="xxx",
    table_name="clicks",
    transformation_ctx="datasource0",
)

# Convert to a Spark DataFrame to make more complex transformations.
df = ddf.toDF()

# NOTE: `sum` is pyspark.sql.functions.sum (see the imports), which shadows
# the Python builtin of the same name in this module.
cost_per_account = df.groupBy("account_id").agg(sum("cpc").alias("cost"))

# Convert back to a DynamicFrame so it can be written with the Glue writer.
dyf = DynamicFrame.fromDF(
    dataframe=cost_per_account,
    name="test2",
    glue_ctx=glueContext,
)

# Write the aggregated frame to S3 in JSON format.
datasink4 = glueContext.write_dynamic_frame.from_options(
    frame=dyf,
    connection_type="s3",
    connection_options={
        "path": "s3://{xxx}/glue/clicks-cost-json",
    },
    format="json",
    transformation_ctx="datasink4",
)

job.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment