- Ensure you have Java installed (in my case it is version 19), but you can install, for example, version 17:
# If `java` is already on PATH this just prints the version (exit 0 short-circuits
# the ||); only when the binary is missing does Homebrew install the Zulu 17 JDK.
$ java -version || brew install --cask zulu17
openjdk version "19.0.1" 2022-10-18
OpenJDK Runtime Environment (build 19.0.1+10-21)
OpenJDK 64-Bit Server VM (build 19.0.1+10-21, mixed mode, sharing)
- Download and extract Spark prebuilt with Hadoop:
# Download Spark 3.4.0 (prebuilt for Hadoop 3) and unpack into ~/spark-3.4.
# -p lets the command be re-run if the directory already exists; curl's -f makes
# it fail on an HTTP error instead of piping an HTML error page into tar.
$ mkdir -p ~/spark-3.4 && curl -fSL https://dlcdn.apache.org/spark/spark-3.4.0/spark-3.4.0-bin-hadoop3.tgz | tar xvzf - -C ~/spark-3.4
- Clean the Maven/Gradle/Ivy caches (this is important, as stale cache entries can sometimes cause dependency download errors):
# Remove stale dependency caches; corrupt entries here are a common cause of
# spark-submit package-resolution failures.
# NOTE: the Ivy cache Spark uses lives under ~/.ivy2 (not ~/.ivy).
rm -frv ~/.ivy2
rm -frv ~/.m2
- Configure Spark:
# Write spark-defaults.conf. Use absolute paths (the tarball was extracted to
# ~/spark-3.4 above) so this works from any working directory, and -p because
# the Spark distribution already ships a conf/ directory.
$ mkdir -p ~/spark-3.4/spark-3.4.0-bin-hadoop3/conf/
# Quoted 'EOF' keeps the heredoc literal (no variable/command expansion).
# Settings: read AWS credentials from the ~/.aws profile, keep the Spark UI on,
# and pull the hadoop-aws/hadoop-common jars needed for s3a:// access.
$ cat << 'EOF' > ~/spark-3.4/spark-3.4.0-bin-hadoop3/conf/spark-defaults.conf
spark.hadoop.fs.s3a.aws.credentials.provider com.amazonaws.auth.profile.ProfileCredentialsProvider
spark.ui.enabled true
spark.jars.packages org.apache.hadoop:hadoop-aws:3.3.4,org.apache.hadoop:hadoop-common:3.3.4
EOF
- Add the following environment variables to your ~/.bashrc, ~/.zshrc, etc.:
# Put Spark on PATH and pin PySpark to Python 3.11 for both the driver and the
# workers (a driver/worker Python version mismatch fails at runtime).
# Expansions are quoted so paths with spaces survive; 'command -v' is the
# portable, shell-builtin replacement for 'which'.
export SPARK_HOME="$HOME/spark-3.4/spark-3.4.0-bin-hadoop3"
export PATH="$SPARK_HOME/bin:$PATH"
export PYSPARK_PYTHON="$(command -v python3.11)"
export PYSPARK_DRIVER_PYTHON="$PYSPARK_PYTHON"