Last active
April 6, 2020 14:36
-
-
Save BrendanSchell/2fe2eb2b4f5cc9b68b344e8b9366aa6f to your computer and use it in GitHub Desktop.
Launches pyspark with the currently activated Python environment and distributes that environment (zipped) to the Spark cluster's executors.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Launch pyspark using the currently activated Python virtualenv.
#
# The venv that owns the `python` found on PATH is zipped next to itself and
# handed to pyspark via --archives, so executors run the shipped copy while
# the driver keeps using the local one.
#
# Usage: <this-script> [driver_python [driver_python_opts]]
#   driver_python       executable used for the driver (default: the python
#                       on PATH); pass "jupyter" to drive from Jupyter instead
#   driver_python_opts  options for the driver executable; e.g. when
#                       driver_python is jupyter: "notebook" or "lab"
#
# Requires: an activated virtualenv, plus `zip` and `pyspark` on PATH.
# Returns non-zero if python cannot be located or the venv cannot be zipped;
# otherwise the status is that of pyspark.

# set spark home to EMR default
export SPARK_HOME='/usr/lib/spark/'

# Zip the active venv and start pyspark with it.
# Arguments: $1 - driver python executable (optional)
#            $2 - driver python options (optional)
run_pyspark_with_venv() {
  local python_bin driver_python driver_python_opts
  local venv_path venv_name env_parent_path zip_status

  # The venv is assumed to be activated, so its python is first on PATH.
  python_bin=$(command -v python) || {
    printf 'error: no python found on PATH; activate a virtualenv first\n' >&2
    return 1
  }

  # Fall back to the venv python / no options when the args are absent or empty.
  driver_python=${1:-$python_bin}
  driver_python_opts=${2:-}

  # <parent>/<venv>/bin/python  ->  <parent>/<venv>, <venv>, <parent>
  venv_path=$(dirname "$(dirname "$python_bin")")
  venv_name=$(basename "$venv_path")
  env_parent_path=$(dirname "$venv_path")

  # Zip from the venv's parent so archive entries start with "<venv>/".
  # Running in a subshell leaves the caller's working directory untouched.
  # -FS syncs an existing archive with the directory instead of rebuilding it.
  (
    cd "$env_parent_path" || exit 1
    zip -FSr "${venv_name}.zip" "${venv_name}/"
  )
  zip_status=$?
  case "$zip_status" in
    0 | 12) ;; # 12 = "nothing to do": the existing archive is already current
    *)
      printf 'error: failed to zip %s (status %s)\n' \
        "$venv_path" "$zip_status" >&2
      return 1
      ;;
  esac

  # Local venv for the driver, shipped venv for the executors. The "#<name>"
  # suffix on --archives is the directory name that spark.pyspark.python
  # refers to below.
  PYSPARK_DRIVER_PYTHON="$driver_python" \
  PYSPARK_DRIVER_PYTHON_OPTS="$driver_python_opts" \
    pyspark --conf "spark.pyspark.driver.python=${driver_python}" \
      --conf "spark.pyspark.python=./${venv_name}_zip/${venv_name}/bin/python" \
      --archives "${env_parent_path}/${venv_name}.zip#${venv_name}_zip"
}

run_pyspark_with_venv "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment