@hakanilter
Created September 14, 2020 07:46
Default PySpark Settings
import os

from pyspark.sql import SparkSession


def get_spark(app_name):
    """
    Creates a Spark session with default parameters.

    SPARK_WAREHOUSE and DERBY_SYSTEM_HOME are expected to be defined
    at module level.
    """
    spark = SparkSession.builder \
        .master(os.environ.get("SPARK_MASTER", "local[*]")) \
        .appName(app_name) \
        .config("spark.default.parallelism", 16) \
        .config("spark.sql.adaptive.enabled", True) \
        .config("spark.sql.warehouse.dir", SPARK_WAREHOUSE) \
        .config("spark.sql.catalogImplementation", "hive") \
        .config("spark.hadoop.fs.AbstractFileSystem.s3.impl", "org.apache.hadoop.fs.s3a.S3A") \
        .config("spark.hadoop.fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3A") \
        .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider",
                "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
        .config("spark.hadoop.fs.s3a.multipart.size", "104857600") \
        .config("spark.driver.extraJavaOptions", " ".join([
            "-Dcom.amazonaws.services.s3.enableV4=true",
            f"-Dderby.system.home={DERBY_SYSTEM_HOME}"])) \
        .config("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true") \
        .config("spark.executor.extraLibraryPath", "/usr/lib64/libsnappy.so") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.speculation", "false") \
        .config("spark.worker.cleanup.enabled", "true") \
        .config("spark.worker.cleanup.interval", "60") \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.0") \
        .enableHiveSupport() \
        .getOrCreate()

    # Additional Hadoop settings applied directly on the underlying Hadoop configuration
    hadoop_config = spark.sparkContext._jsc.hadoopConfiguration()
    hadoop_config.set("mapreduce.fileoutputcommitter.algorithm.version", "2")
    hadoop_config.set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    hadoop_config.set("parquet.enable.summary-metadata", "false")
    hadoop_config.set("dfs.client.read.shortcircuit.skip.checksum", "true")

    return spark
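
For reference, a minimal usage sketch. The constant values, app name, and S3 path below are illustrative assumptions and are not part of the original gist; SPARK_MASTER and AWS credentials are picked up from the environment by the settings above.

# Illustrative usage only; these values are assumptions.
SPARK_WAREHOUSE = "/tmp/spark-warehouse"
DERBY_SYSTEM_HOME = "/tmp/derby"

spark = get_spark("example-app")
df = spark.read.parquet("s3a://example-bucket/example-prefix/")  # hypothetical S3 path
df.show()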