Miscellaneous tidbits on working with (Py)Spark

Add packages to the Spark session from within Python by setting PYSPARK_SUBMIT_ARGS before the session is created. They'll be downloaded automatically if they're not already available locally.

import os
from pyspark.sql import SparkSession

# Using the jar that provides `JsonSerDe` as an example. Note that `--packages`
# takes Maven coordinates (group:artifact:version), not class names; the
# version here is illustrative.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    '--packages "org.apache.hive.hcatalog:hive-hcatalog-core:2.3.4" pyspark-shell'
)
spark = SparkSession.builder.getOrCreate()
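
The environment variable must be set before the first session (and its JVM) is created; once getOrCreate() has run, changing it has no effect. Multiple packages are passed as a comma-separated list of Maven coordinates. A minimal sketch, assuming the S3A connector is also wanted (the coordinates and versions below are example values, not pinned recommendations):

# Sketch only: package coordinates/versions here are examples.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages "
    "org.apache.hive.hcatalog:hive-hcatalog-core:2.3.4,"
    "org.apache.hadoop:hadoop-aws:2.7.3 "
    "pyspark-shell"
)
spark = SparkSession.builder.getOrCreate()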

Initializing a Spark session for testing purposes, with S3 and Hive support enabled, pointing at a mock S3 server.

import os

import pytest
from pyspark import SparkConf
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    # Hostname of the mock S3 (moto) server.
    s3_endpoint = "moto"
    
    # https://stackoverflow.com/questions/50183915/how-can-i-read-from-s3-in-pyspark-running-in-local-mode
    os.environ["PYSPARK_SUBMIT_ARGS"] = (
        '--packages "org.apache.hadoop:hadoop-aws:2.7.3" '
        'pyspark-shell'
    )
    
    conf = (
      SparkConf()
      .setMaster("local[*]")
      .set("spark.sql.catalogImplementation", "in-memory")
    )

    s = (
      SparkSession
      .builder
      .config(conf=conf)
      .enableHiveSupport()
      .getOrCreate()
    )

    hadoop_conf = s.sparkContext._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Access key/secret don't matter, just as long as they're set.
    hadoop_conf.set("fs.s3.access.key", "mock")
    hadoop_conf.set("fs.s3.scret.key", "mock")
    hadoop_conf.set("fs.s3a.endpoint", s3_endpoint)
    
    yield s
    s.stop()
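
A hedged usage sketch: the test below uses the fixture above and assumes the moto server is reachable at the configured endpoint and that a bucket (here the made-up name "test-bucket") already exists, e.g. created via boto3 in another fixture.

def test_round_trips_through_mock_s3(spark):
    # "test-bucket" is a hypothetical bucket assumed to exist on the mock server.
    path = "s3://test-bucket/people.parquet"
    df = spark.createDataFrame([("a", 1), ("b", 2)], ["name", "value"])

    df.write.mode("overwrite").parquet(path)
    round_tripped = spark.read.parquet(path)

    assert round_tripped.count() == df.count()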