Basic setup for reading from and writing to Redshift in a PySpark environment
# Configure the Redshift JDBC driver and the spark-redshift connector JARs
# (Jupyter/Livy %%configure magic, e.g. in an EMR notebook)
%%configure
{
    "conf": {
        "spark.jars": "https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/1.2.36.1060/RedshiftJDBC42-no-awssdk-1.2.36.1060.jar",
        "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.2,io.github.spark-redshift-community:spark-redshift_2.11:4.0.1"
    }
}
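# Optional sanity check (a minimal sketch, assuming the session is already up):
# confirm the connector packages were actually registered with the session.
print(spark.sparkContext.getConf().get("spark.jars.packages", "not set"))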
# Define the Redshift connection info (placeholders -- fill in your own)
username = "UN"
passw = "PW"
url = "jdbc:redshift://CLUSTER_URL"
# JDBC URLs take credentials as query parameters after a "?"
path = url + "?user=" + username + "&password=" + passw
tempdir = "TEMP_DIR"  # an S3 path the connector uses for staging, e.g. s3://bucket/prefix/
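# A safer pattern than hardcoding credentials is reading them from the
# environment. Minimal sketch -- the variable names below are assumptions,
# not part of this gist.
import os
username = os.environ["REDSHIFT_USER"]
passw = os.environ["REDSHIFT_PASSWORD"]
path = url + "?user=" + username + "&password=" + passw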
# Read via a SELECT statement (set "query" OR "dbtable", not both)
query = " "  # your SELECT statement
df = (
    spark.read
    .format("io.github.spark_redshift_community.spark.redshift")
    .option("url", path)  # the URL must carry the user/password query parameters
    # .option("dbtable", "schema.table")  # alternative to "query"
    .option("query", query)
    .option("forward_spark_s3_credentials", "true")
    .option("tempdir", tempdir)
    .load()
)
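# Quick check that the read worked (nothing Redshift-specific here)
df.printSchema()
df.show(5)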
# Write df to the table "test.test"; mode("error") fails if the table already exists
schema_table = "test.test"
df.write \
    .format("io.github.spark_redshift_community.spark.redshift") \
    .option("url", path) \
    .option("dbtable", schema_table) \
    .option("forward_spark_s3_credentials", "true") \
    .option("tempdir", tempdir) \
    .mode("error") \
    .save()
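# Alternative sketch: if your cluster has an attached IAM role, the connector's
# README documents an "aws_iam_role" option instead of forwarding the notebook's
# S3 credentials. The role ARN below is a placeholder.
df.write \
    .format("io.github.spark_redshift_community.spark.redshift") \
    .option("url", path) \
    .option("dbtable", schema_table) \
    .option("aws_iam_role", "arn:aws:iam::123456789012:role/my-redshift-role") \
    .option("tempdir", tempdir) \
    .mode("append") \
    .save()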