Skip to content

Instantly share code, notes, and snippets.

@vvgsrk
Last active August 23, 2019 02:33
Show Gist options
  • Save vvgsrk/1ea766ea63cba863023da07317588824 to your computer and use it in GitHub Desktop.
Save vvgsrk/1ea766ea63cba863023da07317588824 to your computer and use it in GitHub Desktop.
AWS Glue spark-shell scala commands
// Invoke Spark Shell
$ glue-spark-shell -v --properties-file /home/glue/glue_spark_shell.properties --packages com.databricks:spark-avro_2.11:4.0.0
// Import Required Classes
import org.apache.spark.SparkContext
import com.amazonaws.services.glue.GlueContext
import com.amazonaws.services.glue.DynamicFrame
import com.amazonaws.services.glue.DynamicRecord
import com.amazonaws.services.glue.MappingSpec
import com.amazonaws.services.glue.errors.CallSite
import com.amazonaws.services.glue.ChoiceOption
import com.amazonaws.services.glue.ResolveSpec
import com.amazonaws.services.glue.util.GlueArgParser
import com.amazonaws.services.glue.util.Job
import com.amazonaws.services.glue.util.JsonOptions
import com.amazonaws.services.glue.util.GlueExceptionWrapper
import com.amazonaws.services.glue.types._
// Initiate spark and glue context
// Obtain the shell's SparkContext via getOrCreate(). Note that despite its name,
// `spark` is a SparkContext (see the type annotation), not a SparkSession.
@transient val spark: SparkContext = SparkContext.getOrCreate()
// Wrap the SparkContext in a GlueContext; all reads and writes below go through it.
val glueContext: GlueContext = new GlueContext(spark)
// Read parquet data from S3 using glueContext.
// The helper values are scoped inside the block so that only `emp_ddf` is bound in the shell session.
val emp_ddf = {
  val parquetPaths = Set("s3://dev-datalake/hr/emp/yyyy=2018/mm=08/dd=30")
  val sourceOptions = JsonOptions(Map("paths" -> parquetPaths))
  glueContext.getSource("parquet", sourceOptions).getDynamicFrame()
}
// Another way of reading parquet data from S3 using glueContext:
// keep the location in a named value and pass it as a single "path" option.
// NOTE(review): the previous example uses the key "paths" with a Set of locations;
// confirm that the singular "path" key is accepted by the "parquet" source - TODO verify.
val emp_path = "s3://dev-datalake/hr/emp/yyyy=2018/mm=08/dd=30"
val emp_ddf = glueContext.getSource("parquet", JsonOptions(Map("path" -> emp_path))).getDynamicFrame()
// Convert the DynamicFrame to a Spark DataFrame so the DataFrame API can be used on it.
val emp_df = emp_ddf.toDF()
// Explicit call syntax instead of the postfix form `emp_df show`: postfix operators need
// scala.language.postfixOps (feature warning otherwise) and are fragile under semicolon inference.
emp_df.show()
// Read table "emp" from Glue Catalog database "hr" using glueContext.
val hr_emp_dynamic_frame = {
  val empCatalogSource = glueContext.getCatalogSource(database = "hr", tableName = "emp")
  empCatalogSource.getDynamicFrame()
}
// Print the schema of the frame that was just loaded.
hr_emp_dynamic_frame.printSchema()
// Read a table from Glue Catalog Database using glueContext with a different set of parameters:
// the same "hr"/"emp" lookup, additionally passing redshiftTmpDir and transformationContext.
val datasource0 = {
  val empCatalogSource = glueContext.getCatalogSource(
    database = "hr",
    tableName = "emp",
    redshiftTmpDir = "",
    transformationContext = "datasource0"
  )
  empCatalogSource.getDynamicFrame()
}
// Print the schema of the frame that was just loaded.
datasource0.printSchema()
// Read avro data from S3 using glueContext: an "s3" connection with format = "avro".
// The helper values are scoped inside the block so that only `emp_ddf` is bound in the shell session.
val emp_ddf = {
  val avroPaths = Set("s3://dev-inbound-hr/emp/yyyy=2018/mm=09/dd=24")
  val s3Options = JsonOptions(Map("paths" -> avroPaths))
  glueContext.getSourceWithFormat("s3", s3Options, format = "avro").getDynamicFrame()
}
// Write data to s3 using glueContext.
// Step 1: load table "dept" from Glue Catalog database "hr" as a DynamicFrame.
val hr_dept_dynamic_frame = {
  val deptCatalogSource = glueContext.getCatalogSource(database = "hr", tableName = "dept")
  deptCatalogSource.getDynamicFrame()
}
// Step 2: build an S3 sink with format "avro" and write the frame to the configured path.
glueContext.getSinkWithFormat(
  connectionType = "s3",
  options = JsonOptions("""{"path": "s3://data-vvgsrk/hr_avro/dept"}"""),
  format = "avro"
).writeDynamicFrame(hr_dept_dynamic_frame)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment