Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save sandrodesouza/e00a800e94d4a7585c06142b483c54e2 to your computer and use it in GitHub Desktop.
Save sandrodesouza/e00a800e94d4a7585c06142b483c54e2 to your computer and use it in GitHub Desktop.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import json
import requests
from pyspark import sql
from uuid import UUID
from pyspark.sql.types import StringType, ArrayType
from pyspark.sql import functions as F
# Connection settings for the Elasticsearch domain. Both values are
# placeholders and must be replaced before the job is run.
esNodes = "<ElasticSearch HTTPS endpoint>"
esResource = "<index>"

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Create the Spark and Glue contexts, then initialise the Glue job with the
# resolved arguments.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Check the deploy mode (client or cluster). The value is printed because a
# bare expression in a script is evaluated and then silently discarded.
# %-formatting keeps the output identical on Python 2 and Python 3 runtimes.
deployMode = sc.getConf().get("spark.submit.deployMode")
print("spark.submit.deployMode: %s" % deployMode)

# AWS Elasticsearch in a VPC uses a self-signed certificate, so both
# es.net.ssl and es.net.ssl.cert.allow.self.signed must be set to "true".
df = (
    spark.read.format("org.elasticsearch.spark.sql")
    .option("es.nodes", esNodes)
    .option("es.port", 443)
    .option("es.resource", esResource)
    .option("es.nodes.wan.only", True)
    .option("es.net.ssl.cert.allow.self.signed", "true")
    .option("es.net.ssl", "true")
    .load()
)

# Print the schema of the DataFrame loaded from the index.
df.printSchema()

job.commit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment