Skip to content

Instantly share code, notes, and snippets.

Created May 10, 2019 00:10
Show Gist options
  • Save pomadchin/f091ca74f65ffa536cb4b51ba641a81e to your computer and use it in GitHub Desktop.
Save pomadchin/f091ca74f65ffa536cb4b51ba641a81e to your computer and use it in GitHub Desktop.
Spark S3 credentials tip
import geotrellis.spark.util._
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.fs.FileUtil
import org.apache.hadoop.fs.Path
import org.apache.hadoop.conf.Configuration
import org.apache.spark._
import org.apache.spark.sql._
// Spark configuration for this example; only the application name is set here.
// Named `sparkConf` so that it does not clash with the Hadoop `conf` defined below
// (two `val conf` definitions in the same scope do not compile outside the REPL).
val sparkConf = new SparkConf()
  .setAppName("Test Context")
// NOTE: despite its name, `sparkContext` holds a SparkSession; the underlying
// SparkContext is reachable through `ssc.sparkContext`.
val sparkContext: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
// Explicit type on the implicit so that implicit resolution does not depend on inference.
implicit val ssc: SparkSession = sparkContext
// Hadoop configuration attached to the session's SparkContext; the S3 settings below are written into it.
val conf = ssc.sparkContext.hadoopConfiguration
// the best way to do it: let S3A resolve credentials through the AWS default provider chain
// Map the plain `s3` scheme onto the S3A filesystem implementation.
conf.set("fs.s3.impl", classOf[org.apache.hadoop.fs.s3a.S3AFileSystem].getName)
// The provider class has to be registered under the S3A credentials-provider key;
// an empty key would leave the setting unused.
conf.set("fs.s3a.aws.credentials.provider", classOf[com.amazonaws.auth.DefaultAWSCredentialsProviderChain].getName)
// NOTE(review): the endpoint value is empty in this snippet — presumably it should be the
// bucket's regional endpoint (e.g. "s3.eu-west-1.amazonaws.com"); confirm before use.
conf.set("fs.s3a.endpoint", "")
// it is also possible to specify the keys explicitly this way
// Fully qualified because the file only imports Spark/Hadoop/GeoTrellis names.
val credentialsProviderChain = new com.amazonaws.auth.DefaultAWSCredentialsProviderChain
// Resolve the credentials once so the access key and the secret key are guaranteed to
// come from the same credential set (each getCredentials call may re-resolve the chain).
val awsCredentials = credentialsProviderChain.getCredentials
// Map the plain `s3` scheme onto the S3A filesystem implementation.
conf.set("fs.s3.impl", classOf[org.apache.hadoop.fs.s3a.S3AFileSystem].getName)
conf.set("fs.s3a.access.key", awsCredentials.getAWSAccessKeyId)
conf.set("fs.s3a.secret.key", awsCredentials.getAWSSecretKey)
// NOTE(review): the endpoint value is empty in this snippet — presumably it should be the
// bucket's regional endpoint (e.g. "s3.eu-west-1.amazonaws.com"); confirm before use.
conf.set("fs.s3a.endpoint", "")
// s3 is deprecated and s3n won't allow specifying the endpoint, so the only really good enough
// protocol is s3a, and it's the recommended way to interact with S3 using the Hadoop API.
// Read a CSV through the session, treating the first row as a header and inferring column types.
val df = ssc.read.options(Map("header" -> "true", "inferSchema" -> "true")).csv("s3a://path/to.csv")
Copy link

Are you running this on your laptop or on AWS EMR?

Copy link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment