Skip to content

Instantly share code, notes, and snippets.

@trungpv1601
Created December 11, 2023 02:46
Show Gist options
  • Save trungpv1601/c5ef566a992abcc2745693742adddba0 to your computer and use it in GitHub Desktop.
Save trungpv1601/c5ef566a992abcc2745693742adddba0 to your computer and use it in GitHub Desktop.
Test partition job.py
import sys
from datetime import datetime, timezone

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
# Resolve the JOB_NAME argument from the command line (sys.argv).
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
# Build the Spark context, wrap it in a GlueContext, and expose its Spark session.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Create the Glue Job handle and initialise it with the job name and the
# resolved arguments; the matching job.commit() is at the end of the script.
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
# Source node: read the JSON data under s3://xxx/a/ into a DynamicFrame.
# "recurse": True is passed so nested prefixes are presumably included as
# well -- verify against the bucket layout.
AmazonS3_node1702009815377 = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://xxx/a/"], "recurse": True},
    format="json",
    format_options={"multiline": False},
    transformation_ctx="AmazonS3_node1702009815377",
)
def AddDate(rec):
    """Add UTC ``year``/``month``/``day`` fields derived from the record's timestamp.

    Args:
        rec: Mutable mapping with a ``creation_timestamp`` entry. The value is
            assumed to be whole seconds since the Unix epoch, given as an int
            or anything ``int()`` accepts (e.g. a digit string).

    Returns:
        The same ``rec`` object, updated in place with the string fields
        ``year`` (``%Y``), ``month`` (``%m``) and ``day`` (``%d``).

    Raises:
        KeyError: if ``rec`` is a dict without ``creation_timestamp``.
        TypeError, ValueError: if the value cannot be converted by ``int()``.
    """
    ts = int(rec["creation_timestamp"])
    # Timezone-aware UTC conversion. It yields the same calendar date as
    # datetime.utcfromtimestamp(), which is deprecated since Python 3.12, and
    # does not depend on the worker's local timezone.
    dt_object = datetime.fromtimestamp(ts, tz=timezone.utc)
    # Format once and split, instead of three separate strftime() calls.
    year, month, day = dt_object.strftime("%Y-%m-%d").split("-")
    rec["year"] = year
    rec["month"] = month
    rec["day"] = day
    return rec
# Pass the source frame through AddDate via the Glue Map transform so the
# result carries the year/month/day fields used as partition keys on write.
Mapped_dyF = Map.apply(
    frame=AmazonS3_node1702009815377,
    f=AddDate,
)
# Sink node: write the mapped frame to s3://xxx/b/ in the "glueparquet"
# format with snappy compression, partitioned by year, month and day.
AmazonS3_node1702010667689 = glueContext.write_dynamic_frame.from_options(
    frame=Mapped_dyF,
    connection_type="s3",
    connection_options={
        "path": "s3://xxx/b/",
        "partitionKeys": ["year", "month", "day"],
    },
    format="glueparquet",
    format_options={"compression": "snappy"},
    transformation_ctx="AmazonS3_node1702010667689",
)
# Close out the job run started by job.init().
# NOTE(review): per the AWS Glue docs this call is also what persists
# job-bookmark state for the transformation_ctx values above -- confirm
# bookmarks are enabled if incremental reads are expected.
job.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment