Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save kangks/70eb7c884021e4ee8d6e1c3ac34630e2 to your computer and use it in GitHub Desktop.
Save kangks/70eb7c884021e4ee8d6e1c3ac34630e2 to your computer and use it in GitHub Desktop.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import json
import requests
from pyspark import sql
from uuid import UUID
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql import functions as F
# Connection settings for the Elasticsearch domain. Both values are
# placeholders and must be replaced before the job is run.
esNodes = "<ElasticSearch HTTPS endpoint>"
esResource = "<index>"

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Report the deploy mode (client or cluster). The bare expression used
# previously discarded the value, so nothing was shown when this ran as a
# script; printing it makes the check visible in the job output.
print("spark.submit.deployMode:", sc.getConf().get("spark.submit.deployMode"))

# Per the original author's note: AWS ElasticSearch in a VPC uses a
# self-signed certificate, so both es.net.ssl and
# es.net.ssl.cert.allow.self.signed are set to true.
df = (
    spark.read.format("org.elasticsearch.spark.sql")
    .option("es.nodes", esNodes)
    .option("es.port", 443)
    .option("es.resource", esResource)
    .option("es.nodes.wan.only", True)
    .option("es.net.ssl.cert.allow.self.signed", "true")
    .option("es.net.ssl", "true")
    .load()
)

# Print the schema of the DataFrame loaded from the index, then mark the
# Glue job run as complete.
df.printSchema()
job.commit()
@agodet
Copy link

agodet commented Apr 17, 2020

Did you ever test your script on AWS?
What kind of IAM role do you have to use for your Glue job, and which access policies are needed on your Elasticsearch domain?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment