Basic setup for reading from and writing to Redshift in a PySpark environment
# Configure the Redshift JDBC driver and the spark-redshift connector JARs
# (Jupyter/Livy %%configure magic, e.g. in an EMR notebook)
%%configure
{
    "conf": {
        "spark.jars": "https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.36.1060/RedshiftJDBC42-no-awssdk-1.2.36.1060.jar",
        "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.2,io.github.spark-redshift-community:spark-redshift_2.11:4.0.1"
    }
}
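# Optional sanity check (a minimal sketch, assuming the session is already up):
# confirm the connector packages were actually registered with the session.
print(spark.sparkContext.getConf().get("spark.jars.packages", "not set"))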
# Define the Redshift connection info (placeholders -- fill in your own)
username = "UN"
passw = "PW"
url = "jdbc:redshift://CLUSTER_URL"
# JDBC URLs take credentials as query parameters after a "?"
path = url + "?user=" + username + "&password=" + passw
tempdir = "TEMP_DIR"  # an S3 path the connector uses for staging, e.g. s3://bucket/prefix/
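# A safer pattern than hardcoding credentials is reading them from the
# environment. Minimal sketch -- the variable names below are assumptions,
# not part of this gist.
import os
username = os.environ["REDSHIFT_USER"]
passw = os.environ["REDSHIFT_PASSWORD"]
path = url + "?user=" + username + "&password=" + passw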
# Read via a SELECT statement (set "query" OR "dbtable", not both)
query = " "  # your SELECT statement
df = (
    spark.read
    .format("io.github.spark_redshift_community.spark.redshift")
    .option("url", path)  # the URL must carry the user/password query parameters
    # .option("dbtable", "schema.table")  # alternative to "query"
    .option("query", query)
    .option("forward_spark_s3_credentials", "true")
    .option("tempdir", tempdir)
    .load()
)
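# Quick check that the read worked (nothing Redshift-specific here)
df.printSchema()
df.show(5)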
# Write df to the table "test.test"; mode("error") fails if the table already exists
schema_table = "test.test"
df.write \
    .format("io.github.spark_redshift_community.spark.redshift") \
    .option("url", path) \
    .option("dbtable", schema_table) \
    .option("forward_spark_s3_credentials", "true") \
    .option("tempdir", tempdir) \
    .mode("error") \
    .save()
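# Alternative sketch: if your cluster has an attached IAM role, the connector's
# README documents an "aws_iam_role" option instead of forwarding the notebook's
# S3 credentials. The role ARN below is a placeholder.
df.write \
    .format("io.github.spark_redshift_community.spark.redshift") \
    .option("url", path) \
    .option("dbtable", schema_table) \
    .option("aws_iam_role", "arn:aws:iam::123456789012:role/my-redshift-role") \
    .option("tempdir", tempdir) \
    .mode("append") \
    .save()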