#!/bin/bash
#
# SparkLab build script
#
# Created by M. Massenzio, 2015-01-22
#
# Usage: run as the `ubuntu` user, from its home directory, on the target
# AWS instance (e.g. after: ssh -p port ubuntu@host).
#
# NOTE: the shebang must be the very first line of the file; the original
# had a stray blank line above it.

# Root directory for all Python virtualenvs (lives on the /data volume).
declare -r VENVS="/data/virtualenvs"
# Name of the SciPy (Python 3) virtualenv created below.
declare -r SCIPY_VENV="scipy3"
# Full path of the SciPy virtualenv.
declare -r SCIPY_DIR="${VENVS}/${SCIPY_VENV}"
echo "[INFO] This will install all binaries and libraries for the SciPy Python environment,
[INFO] Cassandra and Spark on an AWS Ubuntu AMI instance.
[INFO] This assumes you are running this script as described in the README documentation.
"
# -r prevents backslash interpretation (safe default for all reads).
read -r -p "Press a key when ready or Ctrl-C to abort"

sudo apt-get update && sudo apt-get -y upgrade

# There are a bunch of packages that will be needed later.
# Install them all now:
sudo apt-get install -y pkg-config gcc g++ libreadline-dev git git-sh maven

# Creating a /data directory for DB files and other assorted data files.
# Making it also a bit more friendly (if less secure) to use:
# add the current user to the `users` group, which will own /data below.
sudo adduser "${USER}" users
# Attach EBS block device
# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-using-volumes.html
read -r -p "If this is being configured on a AWS new AMI instance, you may need
to attach, format and mount an external EBS disk to the instance.
Do you wish to do this? [y/N] " ebs
if [[ $ebs == 'y' ]]; then
  lsblk
  #
  # Should emit something like the below:
  #
  # NAME    MAJ:MIN RM  SIZE RO TYPE MOUNTPOINT
  # xvda    202:0    0    8G  0 disk
  # └─xvda1 202:1    0    8G  0 part /
  # xvdb    202:16   0  100G  0 disk
  echo "[INFO] The listing above should show at least an entry of the form
xvdb    202:16   0  100G  0 disk"
  read -r -p "Please enter the name of the EBS device you wish to format and mount on this instance
(or enter None to skip): " disk

  # BUG FIX: the prompt offers "None" to skip, but the original script never
  # checked for it and would have attempted `mkfs /dev/None`.
  if [[ -n "$disk" && "$disk" != "None" ]]; then
    # Verify it's an empty, unformatted device: `file -s` reports
    # "/dev/<disk>: data" for a raw block device with no filesystem.
    IS_EMPTY=$(sudo file -s "/dev/$disk" | grep -E "/dev/$disk: data")
    SKIP='n'
    if [[ -z ${IS_EMPTY} ]]; then
      sudo file -s "/dev/$disk"
      echo "[WARN] It seems that the device is already formatted, are you sure you want to progress."
      echo "[WARN] All data on the device will be lost; you've been warned!"
      echo "[WARN] To force re-format, please enter 'n' at the next prompt."
      SKIP='y'
      read -r -p "Do you want to skip this step (Ctrl-C to abort) [Y/n]? " SKIP
    fi
    # Create an ext4 filesystem on it; only an explicit 'n' proceeds.
    if [[ ${SKIP} == 'n' ]]; then
      sudo mkfs -t ext4 "/dev/$disk"
      sudo mkdir -p /mnt/data

      # Add the mount point to fstab; `sudo echo ... > /etc/fstab` would fail
      # (the redirection runs in the unprivileged shell), so edit a local
      # copy and `sudo cp` it back into place.
      cp /etc/fstab ./
      echo "/dev/$disk /mnt/data ext4 defaults,auto,nofail,nobootwait 0 0" >> fstab
      sudo cp ./fstab /etc/fstab
      sudo mount -a
    fi

    echo "[INFO] /dev/$disk should now be mounted on /mnt/data"
    IS_MOUNTED=$(mount | grep "$disk")
    if [[ -z "${IS_MOUNTED}" ]]; then
      echo "[WARN] If seems that the secondary EBS disk is not mounted:"
      mount
      read -r -p "Press a key to continue or Ctrl-C to abort"
    fi
  fi
  sudo ln -snf /mnt/data /data
else
  sudo mkdir -p /data
fi

if [[ ! -d "/data" ]]; then
  echo "[ERROR] Missing data directory (/data) please correct and run this script again"
  exit 1
fi
# Group-owned by `users` (see adduser above); 2775 sets the setgid bit so
# new files keep the group ownership.
sudo chown "${USER}":users /data
sudo chmod 2775 /data
# Setup the Python environment
#
# Install matplotlib (pip install matplotlib will fail)
# and the Python dev package (the headers are required by numpy and friends).
# Make sure the python versions match the one installed.
# libzmq is required by pyzmq (or its install will fail).
#
echo "[INFO] Installing Python 3, virtual environment support and SciPy packages"
read -r -p "Press a key when ready or Ctrl-C to abort"

sudo apt-get install -y python-setuptools python3-matplotlib python3.4-dev libzmq-dev
sudo easy_install pip
# Test the command's exit status directly instead of inspecting $? afterwards.
if ! sudo pip install virtualenvwrapper; then
  echo "[ERROR] Could not install virtualenvwrapper, please check pip logs"
  exit 1
fi

mkdir -p "${VENVS}"

# Add the virtualenvwrapper env for .bashrc
# (assumes the script is run from the user's home directory).
#
echo "export WORKON_HOME=\"${VENVS}\"
source /usr/local/bin/virtualenvwrapper.sh" >> .bashrc
source .bashrc
# Create the virtual environment.
# The use of --system-site-packages is necessary to make matplotlib work in Python 3.
#
if [[ ! -d "${SCIPY_DIR}" ]]; then
  # $(...) instead of legacy backticks; check the exit status directly.
  if ! mkvirtualenv -p "$(which python3)" --system-site-packages "${SCIPY_VENV}"; then
    echo "[ERROR] Could not create a virtualenv, please check the error message, if any"
    exit 1
  fi
  workon "${SCIPY_VENV}"
fi

# Sanity check: the active `python` should now be a 3.x interpreter.
# NOTE: Python < 3.4 prints --version to stderr, hence the 2>&1.
IS_PY3=$(python --version 2>&1 | grep -E "3\.[0-9]+")
if [[ -z "${IS_PY3}" ]]; then
  echo "[WARN] You may not have Python 3.x enabled, double check the virtualenv"
  echo "[WARN] Python 3 is here: [$(which python3)]; your default python is: [$(which python)]"
  python --version
fi
# Add all the necessary packages (see the SparkLab-requirements.txt file).
# NOTE - this will take a long time to run, with no output to stdout.
if [[ ! -f SparkLab-requirements.txt ]]; then
  echo "[ERROR] Missing SparkLab-requirements.txt file, please copy it to the ubuntu user home dir"
  exit 1
fi
echo "[INFO] Installing SciPy packages, this will take forever: go grab a book..."
read -r -p "Press a key when ready or Ctrl-C to abort"
# Deliberately non-fatal: a partial install is still usable, so only warn.
if ! pip install -r SparkLab-requirements.txt; then
  echo "[ERROR] Your virtual environment may miss critical packages"
fi
read -r -p "Press a key when ready or Ctrl-C to abort"
## Add Java JDK 8 & Install Cassandra
# See: http://tecadmin.net/install-oracle-java-8-jdk-8-ubuntu-via-ppa/
sudo add-apt-repository ppa:webupd8team/java
sudo apt-get update && sudo apt-get install -y oracle-java8-installer

# `java -version` writes to stderr, hence the 2>&1 redirect.
IS_JAVA8=$(java -version 2>&1 | grep -E "1\.8\.0_[0-9]+")
if [[ -z "${IS_JAVA8}" ]]; then
  echo "[WARN] Your installed java may not support the latest features:"
  java -version
  echo "[WARN] Make sure to fix this after setup completes. This is not critical for progress"
  read -r -p "Press a key when ready"
fi
## Install Cassandra
# See: http://www.datastax.com/documentation/cassandra/1.2/cassandra/install/installDeb_t.html

# This would FAIL (the redirection is performed by the unprivileged shell,
# not by sudo):
#   sudo echo "deb http://debian.datastax.com/community stable main" > /etc/apt/sources.list.d/cassandra.sources.list
#
# We need to write the file elsewhere and then `sudo cp` it to the correct directory.
echo "deb http://debian.datastax.com/community stable main" > ~/cassandra.sources.list
# BUG FIX: copy the file that was actually written (~/cassandra.sources.list),
# not one relative to the current directory, which may not be $HOME.
sudo cp ~/cassandra.sources.list /etc/apt/sources.list.d/

# Import the DataStax repository signing key, then install pinned versions.
curl -L http://debian.datastax.com/debian/repo_key | sudo apt-key add -
sudo apt-get update && sudo apt-get install -y dsc20=2.0.11-1 cassandra=2.0.11
# Cassandra configuration is fiddly and does not work well with AWS EC2 instances.
#
# 1. you need to edit the -Xss Java option (stack size)
# See: http://stackoverflow.com/questions/22470628/cassandrathe-stack-size-specified-is-too-small-specify-at-least-228k
# The only solution is to manually edit the environment settings and change the following line:
#
#   JVM_OPTS="$JVM_OPTS -Xss180k"
# to
#   JVM_OPTS="$JVM_OPTS -Xss256k"
#
echo "Change the line JVM_OPTS=\"\$JVM_OPTS -Xss180k\" to a value >= 256k"
read -r -p "Press a key when ready"
sudo vim /etc/cassandra/cassandra-env.sh
# 2. The host location and data directories need to be configured.
# While you are at it, add the host IP address to the list of `seeds`
# in /etc/cassandra/cassandra.yaml
#
# Look for a line that says `- seeds` and change it to something like:
#
#   - seeds: "ip1,ip2"
#
# and the directories where Cassandra should store data on disk. Cassandra
# will spread data evenly across them, subject to the granularity of
# the configured compaction strategy.
#   data_file_directories:
#     - /data/cassandra
#
sudo mkdir -p /data/cassandra

# Extract this host's first non-loopback IPv4 address from ifconfig output.
# NOTE(review): the "inet addr:" pattern assumes the net-tools ifconfig
# format used on Ubuntu 14.04-era systems.
HOST_IP=$(ifconfig | grep -Po "(?<=inet addr:)\s*[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}" | grep -Ev "127\.0\.0\.1")
echo "Change the 'data_file_directories' to point to /data/cassandra,"
echo "the 'seeds' and 'listen_address' to point to $HOST_IP, at the very least"
#
# These are the manual changes necessary to make this all work:
#   seed_provider:
#     - seeds: "ip1,ip2"
#   rpc_address: 0.0.0.0
#   listen_address: ip1
#   data_file_directories:
#     - /data/cassandra
#
read -r -p "Press a key when ready"
sudo vim /etc/cassandra/cassandra.yaml

# 3. In a VPC with no public nodes, your hostname won't resolve and Cassandra
# will terminate the node.
#
# As with fstab above, edit a local copy and `sudo cp` it into place
# (a plain `sudo echo >> /etc/hosts` would fail on the redirection).
cp /etc/hosts ./
echo "$HOST_IP $(hostname)" >> hosts
sudo cp ./hosts /etc/
echo "[INFO] Please verify that the last line added to your /etc/hosts file is correct"
cat /etc/hosts
read -r -p "Press a key when ready"
# The above MAY fail; the alternative is to change the
#   listen_address: ip1
# to whatever the IP address of the host is.
# Verify that you can run Cassandra: the line below starts it in foreground mode (so you can see
# the logs); then open another shell and run `cqlsh` to start a cassandra shell.
# You will have to terminate this session via Ctrl-C:
#   sudo cassandra -f
#
# Make sure the `cassandra` user owns its data and log directories before startup.
sudo chown -R cassandra: /var/{lib,log}/cassandra /data/cassandra
sudo service cassandra start
sudo service cassandra status
echo "[INFO] Cassandra installation complete, please verify that you can connect
to the server using: cqlsh"
read -r -p "Press a key when ready"
## Install Apache Spark
# See: http://spark.apache.org/docs/1.2.0/spark-standalone.html
#
echo "[INFO] Installing Apache Spark"
read -r -p "Press a key when ready"

#
# Ubuntu repositories only have Scala 2.9 - we need to get and install 2.11 manually.
#
wget http://downloads.typesafe.com/scala/2.11.5/scala-2.11.5.deb
sudo dpkg -i scala-2.11.5.deb
rm scala*
# BUG FIX: `scalac -version` prints to stderr; without 2>&1 the check below
# always came up empty and (incorrectly) reported a failed install.
IS_SCALA=$(scalac -version 2>&1 | grep "2\.11")
if [[ -z "${IS_SCALA}" ]]; then
  echo "[ERROR] Scala 2.11 did not get properly installed, the installed (if any) scalac is: $(scalac -version 2>&1)"
  read -r -p "Hit Ctrl-C to quit now or any key to continue "
fi

wget http://apache.petsads.us/spark/spark-1.2.0/spark-1.2.0-bin-hadoop2.4.tgz
tar xfz spark-1.2.0-bin-hadoop2.4.tgz
sudo mv spark-1.2.0-bin-hadoop2.4 /usr/local/
sudo ln -snf spark-1.2.0-bin-hadoop2.4 /usr/local/spark
echo "export SPARK_HOME=/usr/local/spark
export PATH=\${PATH}:\${SPARK_HOME}/sbin:\${SPARK_HOME}/bin
" >> .bashrc

# BUG FIX: SPARK_HOME is only exported via .bashrc (for future shells) and was
# never set in this script's environment, so the original echo printed an
# empty path here.
echo "[INFO] Apache Spark installed at /usr/local/spark"
echo "[INFO] To start Spark Master, run: sudo start-master.sh"

# Adding some useful aliases.
# BUG FIX: the backticks were inside double quotes, so `which python3` was
# expanded when THIS script ran (baking in whatever python3 was active);
# escape them so the substitution happens when the alias is actually used.
echo "alias mkvenv3='mkvirtualenv -p \`which python3\` --system-site-packages'
alias upgrade='sudo apt-get update && sudo apt-get -y upgrade'
alias notebook='ipython notebook --profile=nbserver --pylab=inline'" >> .bash_aliases
source .bashrc
# Finally create a "durable" development environment: group-writable by
# everyone in `users` (the setgid bit keeps group ownership on new files).
sudo mkdir -p /data/dev
sudo chown -R :users /data/dev
sudo chmod -R 2775 /data/dev

echo "[SUCCESS] Done."