Skip to content

Instantly share code, notes, and snippets.

@vvgsrk
Created January 30, 2019 21:14
Show Gist options
  • Save vvgsrk/b861d914e8e6d350d108e4e1b579b939 to your computer and use it in GitHub Desktop.
Save vvgsrk/b861d914e8e6d350d108e4e1b579b939 to your computer and use it in GitHub Desktop.
Example AWS Glue gluepyspark (PySpark shell) commands for reading data from the Glue Data Catalog and from S3 (Parquet and Avro).
# Invoke the AWS Glue PySpark REPL (gluepyspark) in verbose mode (-v),
# loading Spark properties from the given file and pulling in the
# Databricks spark-avro package (Scala 2.11 build, v4.0.0) — presumably
# needed for the Avro read further below; confirm against your Glue/Spark
# version before changing the coordinate.
$ gluepyspark -v --properties-file /home/glue/glue_spark_shell.properties --packages com.databricks:spark-avro_2.11:4.0.0
# Import required classes
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame, DynamicFrameReader, DynamicFrameWriter, DynamicFrameCollection
from awsglue.job import Job
from pyspark.sql import SparkSession
# Build a GlueContext on top of the current (or a fresh) SparkContext.
glueContext = GlueContext(SparkContext.getOrCreate())

# Pull the "emp" table of the "hr" database out of the Glue Data Catalog
# as a DynamicFrame and inspect its inferred schema.
emp_dynamicframe = glueContext.create_dynamic_frame.from_catalog(
    database="hr",
    table_name="emp",
)
emp_dynamicframe.printSchema()

# Convert the DynamicFrame to a Spark DataFrame and preview its rows.
emp_df = emp_dynamicframe.toDF()
emp_df.show()
# Read partitioned Parquet data from S3 into a DynamicFrame.
# NOTE(fix): `connection_type` names the storage backend ("s3", "mysql",
# ...), not the file format — `connection_type="parquet"` is invalid.
# The on-disk format is passed via the separate `format` argument.
emp_ddf = glueContext.create_dynamic_frame.from_options(
    connection_type="s3",
    connection_options={"paths": ["s3://dev-datalake/hr/emp/yyyy=2018/mm=08/dd=30"]},
    format="parquet",
)
# Read Avro data from S3 using glueContext.
# Open an S3 data source pointed at one day's partition of the feed.
emp_ds = glueContext.getSource(
    "s3",
    paths=["s3://dev-inbound-hr/emp/yyyy=2018/mm=09/dd=24"],
)
# Declare the on-disk format so the source parses the files as Avro.
emp_ds.setFormat("avro")
# Materialize the DynamicFrame and count its records.
emp_ddf = emp_ds.getFrame()
emp_ddf.count()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment