@gnomezgrave · Created October 19, 2020
Simple script to convert Parquet files to JSON using AWS Glue
import sys

from awsglue.context import GlueContext
from pyspark.context import SparkContext
from awsglue.utils import getResolvedOptions


def load(context, bucket, prefix=""):
    # Read all Parquet files under the given S3 prefix into a DynamicFrame.
    # groupFiles/groupSize coalesce many small input files into ~128 MB
    # groups per partition, which reduces Spark task overhead.
    dynamic_frame = context.create_dynamic_frame_from_options(
        "s3",
        {
            'paths': ['s3://{}/{}'.format(bucket, prefix)],
            'recurse': True,
            'groupFiles': 'inPartition',
            'groupSize': 134217728  # 128 MB
        },
        format='parquet'
    )
    return dynamic_frame


def main(context, args):
    input_bucket_name = args['INPUT_BUCKET_NAME']
    input_bucket_prefix = args['INPUT_BUCKET_PREFIX']
    output_bucket_name = args['OUTPUT_BUCKET_NAME']
    output_bucket_prefix = args['OUTPUT_BUCKET_PREFIX']

    df = load(context, input_bucket_name, input_bucket_prefix)
    # Convert the DynamicFrame to a Spark DataFrame and write it out as JSON.
    df.toDF().write.json(f's3://{output_bucket_name}/{output_bucket_prefix}')


context = GlueContext(SparkContext.getOrCreate())
job_arguments = getResolvedOptions(
    sys.argv,
    [
        'INPUT_BUCKET_NAME',
        'INPUT_BUCKET_PREFIX',
        'OUTPUT_BUCKET_NAME',
        'OUTPUT_BUCKET_PREFIX'
    ]
)
main(context, job_arguments)
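
To run this as a Glue job, the four parameters must be supplied as job arguments: getResolvedOptions reads them from sys.argv as "--KEY value" pairs, which is why the argument keys below carry a leading "--". Here is a minimal sketch of triggering the job with boto3; the job name parquet-to-json and the bucket/prefix values are placeholder assumptions, not part of the gist.

    import boto3

    glue = boto3.client('glue')

    # Argument keys must include the leading dashes so that
    # getResolvedOptions can pick them up from sys.argv.
    glue.start_job_run(
        JobName='parquet-to-json',  # hypothetical job name
        Arguments={
            '--INPUT_BUCKET_NAME': 'my-input-bucket',
            '--INPUT_BUCKET_PREFIX': 'data/parquet/',
            '--OUTPUT_BUCKET_NAME': 'my-output-bucket',
            '--OUTPUT_BUCKET_PREFIX': 'data/json/'
        }
    )

Note that df.write.json() produces JSON Lines output (one JSON object per line), split into one part file per partition under the output prefix, rather than a single .json file.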