AWS EMR bootstraps to install Jupyter (R, SparkR, Python 2, Python 3, PySpark)

Use these bootstraps if you want to run Jupyter notebooks at scale using Spark, or if you just want Jupyter on Amazon EMR. The default bootstrap installs the following kernels:

  • Python 2
  • PySpark (Python 2)
  • Python 3
  • PySpark (Python 3)

If you want R and SparkR, you'll have to use the second bootstrap as well.
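
For reference, here is how the bootstraps can be attached when creating a cluster with the AWS CLI. This is a minimal sketch: the bucket, script names, key name, subnet and release label are placeholders you would replace with your own values.

aws emr create-cluster \
    --name "jupyter-cluster" \
    --release-label emr-5.12.0 \
    --applications Name=Spark \
    --instance-type m4.xlarge \
    --instance-count 3 \
    --use-default-roles \
    --ec2-attributes KeyName=my-key,SubnetId=subnet-00000000 \
    --bootstrap-actions \
        Path=s3://my-bucket/install-jupyter.sh,Args=[--notebook-dir,/mnt/my-notebooks/] \
        Path=s3://my-bucket/install-sparkr.sh,Args=[--spark,2.3.0]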

Warning

This bootstrap starts a Jupyter server without any kind of authentication. Do not use it to create EMR clusters outside a secure VPC.
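
If the master node sits in a private subnet, one way to reach the notebook server is an SSH tunnel to the master; a sketch, assuming the default Jupyter port 8888 and the standard hadoop user (the key and hostname are placeholders):

ssh -i ~/my-key.pem -N -L 8888:localhost:8888 hadoop@ec2-xx-xx-xx-xx.compute-1.amazonaws.com

Jupyter is then available at http://localhost:8888 on your machine.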

#!/bin/bash
MINICONDA_VERSION="4.3.21"
PANDAS_VERSION="0.20.3"
SCIKIT_VERSION="0.19.0"
while [[ $# -gt 1 ]]; do
    key="$1"
    case $key in
        # Where the notebooks are located.
        # This path needs to exist on the instance.
        # Eg: --notebook-dir /mnt/my-notebooks/
        --notebook-dir)
            NOTEBOOK_DIR="$2"
            shift
            ;;
        --miniconda-version)
            MINICONDA_VERSION="$2"
            shift
            ;;
        --pandas-version)
            PANDAS_VERSION="$2"
            shift
            ;;
        --scikit-version)
            SCIKIT_VERSION="$2"
            shift
            ;;
        *)
            echo "Unknown option: ${key}"
            exit 1
            ;;
    esac
    shift
done
# Install conda and friends
wget https://repo.continuum.io/miniconda/Miniconda3-$MINICONDA_VERSION-Linux-x86_64.sh -O /mnt/miniconda.sh
/bin/bash /mnt/miniconda.sh -b -p /mnt/conda
rm /mnt/miniconda.sh
echo -e '\nexport PATH=/mnt/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc
conda config --set always_yes yes --set changeps1 no
conda config -f --add channels conda-forge
conda config -f --add channels defaults
conda install hdfs3 findspark ujson jsonschema toolz boto3 py4j numpy pandas==$PANDAS_VERSION conda=$MINICONDA_VERSION jupyterlab scikit-learn==$SCIKIT_VERSION
conda install matplotlib plotly bokeh seaborn ipywidgets ipyvolume jupyter_contrib_nbextensions
conda install libgcc opencv fastparquet h5py
# The EMR-provided file /mnt/var/lib/info/instance.json says whether this node
# is the master; the Jupyter server only needs to run there.
if grep -Fq '"isMaster": true' /mnt/var/lib/info/instance.json
then
pip install --upgrade --ignore-installed mxnet
pip install --upgrade --ignore-installed mxnet-cu91
pip install --upgrade --ignore-installed tensorflow==1.5.0
pip install --upgrade --ignore-installed tensorflow-gpu==1.5.0
pip install --upgrade --ignore-installed keras
# Jupyter configuration, overriding the defaults.
mkdir -p ~/.jupyter
touch ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.token = ''" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.open_browser = False" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.ip = '*'" >> ~/.jupyter/jupyter_notebook_config.py
echo "c.NotebookApp.notebook_dir = '$NOTEBOOK_DIR'" >> ~/.jupyter/jupyter_notebook_config.py
# This will not work properly and there is no documentation for it
# echo "c.ContentsManager.checkpoints_kwargs = {'root_dir': '.checkpoints'}" >> ~/.jupyter/jupyter_notebook_config.py
# PySpark for Python 3
mkdir -p /mnt/conda/share/jupyter/kernels/python3-pyspark
cat <<EOF > /mnt/conda/share/jupyter/kernels/python3-pyspark/kernel.json
{
  "argv": [
    "/mnt/conda/bin/python3.6",
    "-m",
    "ipykernel_launcher",
    "-f",
    "{connection_file}"
  ],
  "display_name": "Python 3 - PySpark",
  "language": "python",
  "env": {
    "SPARK_HOME": "/usr/lib/spark",
    "PYSPARK_PYTHON": "/mnt/conda/bin/python3.6",
    "PYTHONPATH": "/usr/lib/spark/python:/usr/lib/spark/python/lib/py4j-0.10.4-src.zip",
    "PYTHONSTARTUP": "/usr/lib/spark/python/pyspark/shell.py",
    "PYSPARK_SUBMIT_ARGS": "--master yarn-client pyspark-shell"
  }
}
EOF
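# Optional sanity check (not part of the original bootstrap): kernels dropped
# into /mnt/conda/share/jupyter/kernels should now be listed by:
#   /mnt/conda/bin/jupyter kernelspec list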
# Install the kernel for Python 2
/mnt/conda/bin/conda create -n ipykernel_py2 python=2 ipykernel
source activate ipykernel_py2
python -m ipykernel install --user
# Install the kernel for PySpark with Python 2
mkdir -p /mnt/conda/share/jupyter/kernels/python2-pyspark
cat <<EOF > /mnt/conda/share/jupyter/kernels/python2-pyspark/kernel.json
{
  "argv": [
    "/mnt/conda/envs/ipykernel_py2/bin/python",
    "-m",
    "ipykernel_launcher",
    "-f",
    "{connection_file}"
  ],
  "display_name": "Python 2 - PySpark",
  "language": "python",
  "env": {
    "SPARK_HOME": "/usr/lib/spark",
    "PYSPARK_PYTHON": "/mnt/conda/envs/ipykernel_py2/bin/python",
    "PYTHONPATH": "/usr/lib/spark/python:/usr/lib/spark/python/lib/py4j-0.10.4-src.zip",
    "PYTHONSTARTUP": "/usr/lib/spark/python/pyspark/shell.py",
    "PYSPARK_SUBMIT_ARGS": "--master yarn-client pyspark-shell"
  }
}
EOF
# Set up the Jupyter daemon as an Upstart job (the EMR AMI uses Upstart).
# The config is written to the home directory first (the redirection runs as
# the calling user regardless of sudo) and moved into /etc/init below.
cat <<EOF > ~/jupyter.conf
description "Jupyter"
author "Cosmin Catalin Sanda"
start on runlevel [2345]
stop on runlevel [016]
respawn
respawn limit 0 10
console output
chdir $NOTEBOOK_DIR
exec start-stop-daemon -v --start -c hadoop --exec /mnt/conda/bin/jupyter notebook
EOF
jupyter nbextension enable execute_time/ExecuteTime
jupyter nbextension enable freeze/main
jupyter nbextension enable hide_input/main
jupyter nbextension enable table_beautifier/main
jupyter nbextension enable spellchecker/main
jupyter nbextension enable python-markdown/main
jupyter nbextension enable scratchpad/main
jupyter nbextension enable varInspector/main
sudo mv ~/jupyter.conf /etc/init/
sudo chown root:root /etc/init/jupyter.conf
sudo initctl reload-configuration
# Start Jupyter daemon
sudo initctl start jupyter
fi
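
The second bootstrap installs R, the IRkernel and SparkR, building the SparkR package from the Apache Spark sources matching the requested Spark version:
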
#!/bin/bash
SPARK="2.3.0"
while [[ $# -gt 1 ]]; do
    key="$1"
    case $key in
        # The version of Spark to install for SparkR
        --spark)
            SPARK="$2"
            shift
            ;;
        *)
            echo "Unknown option: ${key}"
            exit 1
            ;;
    esac
    shift
done
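# Note: the value passed via --spark should match the Spark version shipped
# with the chosen EMR release, since SparkR is built from the corresponding
# apache/spark tag below.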
source ~/.bashrc
# This is the user lib
mkdir -p ~/R/library
sudo yum install -y libcurl-devel openssl-devel openssl libssh2-devel
sudo R -e "install.packages('devtools', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('git2r', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('repr', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('IRdisplay', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('crayon', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('pbdZMQ', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "install.packages('tidyverse', repos='https://cran.rstudio.com/')" 1>&2
sudo R -e "devtools::install_github('apache/spark@v${SPARK}', subdir='R/pkg')" 1>&2
sudo R -e "devtools::install_github('IRkernel/IRkernel')" 1>&2
if grep -Fq '"isMaster": true' /mnt/var/lib/info/instance.json
then
# There is actually no point in making another kernel
# R -e "IRkernel::installspec()" 1>&2
mkdir -p /mnt/conda/share/jupyter/kernels/r-sparkr
cat <<EOF > /mnt/conda/share/jupyter/kernels/r-sparkr/kernel.json
{
  "argv": ["/usr/lib64/R/bin/R", "--slave", "-e", "IRkernel::main()", "--args", "{connection_file}"],
  "display_name": "R - SparkR",
  "language": "R",
  "env": {
    "SPARK_HOME": "/usr/lib/spark",
    "R_LIBS_USER": "~/R/library"
  }
}
EOF
fi