PySpark script for downloading a single parquet file from Amazon S3 via the s3a protocol. It also reads the credentials from "~/.aws/credentials", so we don't need to hardcode them. See also https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html
#
# Some constants
#
aws_profile = "your_profile"
aws_region = "your_region"
s3_bucket = "your_bucket"
#
# Reading environment variables from aws credential file
#
import os
import configparser
config = configparser.ConfigParser()
config.read(os.path.expanduser("~/.aws/credentials"))
access_id = config.get(aws_profile, "aws_access_key_id")
access_key = config.get(aws_profile, "aws_secret_access_key")
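#
# Alternative (sketch, assumes boto3 is installed): let the AWS SDK resolve the
# profile itself, which also covers the default profile and other credential sources.
#
# import boto3
# credentials = boto3.Session(profile_name=aws_profile).get_credentials()
# access_id, access_key = credentials.access_key, credentials.secret_key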
#
# Configuring pyspark
#
# see https://github.com/jupyter/docker-stacks/issues/127#issuecomment-214594895
# and https://github.com/radanalyticsio/pyspark-s3-notebook/blob/master/s3-source-example.ipynb
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"
# If this doesn't work you might have to delete your ~/.ivy2 directory to reset your package cache.
# (see https://github.com/databricks/spark-redshift/issues/244#issuecomment-239950148)
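# Note: the hadoop-aws version should match the Hadoop version your Spark
# distribution was built against (2.7.3 here; adjust it for your installation).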
import pyspark
sc = pyspark.SparkContext()
# see https://github.com/databricks/spark-redshift/issues/298#issuecomment-271834485
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")
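# V4 request signing is mandatory for newer AWS regions (e.g. eu-central-1, us-east-2),
# so this flag is required when the bucket lives in one of them.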
# see https://stackoverflow.com/questions/28844631/how-to-set-hadoop-configuration-values-from-pyspark
hadoop_conf = sc._jsc.hadoopConfiguration()
# see https://stackoverflow.com/questions/43454117/how-do-you-use-s3a-with-spark-2-1-0-on-aws-us-east-2
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.access.key", access_id)
hadoop_conf.set("fs.s3a.secret.key", access_key)
# see http://blog.encomiabile.it/2015/10/29/apache-spark-amazon-s3-and-apache-mesos/
hadoop_conf.set("fs.s3a.connection.maximum", "100000")
# see https://docs.aws.amazon.com/general/latest/gr/rande.html#s3_region
hadoop_conf.set("fs.s3a.endpoint", "s3." + aws_region + ".amazonaws.com")
#
# Downloading the parquet file
#
sql = pyspark.sql.SparkSession(sc)
path = s3_bucket + "/your_path"
dataS3 = sql.read.parquet("s3a://" + path)
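#
# Quick sanity check of what was read; the local output path below is just a
# placeholder if you actually want to keep a copy on disk.
#
dataS3.printSchema()
dataS3.show(5)
# dataS3.write.parquet("/tmp/local_copy.parquet")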
asmaier commented Jun 21, 2019

Try to use "s3a" and not "s3".
