Skip to content

Instantly share code, notes, and snippets.

@GEOFBOT
Last active July 17, 2016 20:06
Show Gist options
  • Save GEOFBOT/3ffc9b21214174ae750cc3fdb2625b71 to your computer and use it in GitHub Desktop.
Save GEOFBOT/3ffc9b21214174ae750cc3fdb2625b71 to your computer and use it in GitHub Desktop.
Guide to setting up a BlueData CentOS 6.7 / AWS Ubuntu 14.04 cluster for running Flink jobs
# Launch quicksetup.sh on every worker node listed in the gist's "slaves" file.
# (fish shell syntax: command substitution is written `(...)`, blocks close with `end`.)
cd ~
wget https://gist.github.com/GEOFBOT/3ffc9b21214174ae750cc3fdb2625b71/raw/slaves
# One backgrounded ssh session per worker, so the nodes are set up concurrently.
for worker in (cat slaves)
    ssh bluedata@$worker "curl -L https://gist.github.com/GEOFBOT/3ffc9b21214174ae750cc3fdb2625b71/raw/quicksetup.sh | sh" &
end
# The loop has already read the list, so the downloaded copy is no longer needed.
rm slaves

Make sure to allow all traffic between nodes in the cluster security group so that data can be sent between them. Set up an SSH key on the master node. Use only AWS internal IPs when setting up the list of worker nodes (slaves).

#!/bin/bash
# Convert a CDH BlueData (RedHat-flavoured) node to CentOS so that the stock
# CentOS package repositories become available.
#
# Must run as root. The script is normally piped into `sudo bash`, so no
# command in here may prompt on stdin: a prompt would swallow the remaining
# script text as its answer.
#
# Environment:
#   CENTOS_MIRROR  Base URL of the CentOS 6.8 x86_64 "os" tree.
#                  Default: http://mirror.centos.org/centos/6.8/os/x86_64
# NOTE(review): CentOS 6 is end-of-life, so the default mirror path may no
# longer serve these files -- confirm, or override CENTOS_MIRROR.

set -euo pipefail

readonly CENTOS_MIRROR="${CENTOS_MIRROR:-http://mirror.centos.org/centos/6.8/os/x86_64}"

# Packages that replace the RedHat release/yum stack with the CentOS one.
readonly -a PACKAGES=(
  centos-release-6-8.el6.centos.12.3.x86_64.rpm
  yum-3.2.29-73.el6.centos.noarch.rpm
  yum-utils-1.1.30-37.el6.noarch.rpm
  yum-plugin-fastestmirror-1.1.30-37.el6.noarch.rpm
)

yum clean all

# Work in a private scratch directory so that the `*.rpm` glob below can only
# ever match the files downloaded by this run.
workdir=$(mktemp -d)
cleanup() {
  cd / || true
  rm -rf -- "${workdir:?}"
}
trap cleanup EXIT
cd "$workdir"

# Download everything BEFORE touching the installed release package: with
# `set -e` a failed download aborts here and leaves the system unchanged.
wget "${CENTOS_MIRROR}/RPM-GPG-KEY-CentOS-6"
for pkg in "${PACKAGES[@]}"; do
  wget "${CENTOS_MIRROR}/Packages/${pkg}"
done

rpm --import RPM-GPG-KEY-CentOS-6

# Only remove the RedHat release package when it is actually installed, so
# re-running on an already converted node does not abort.
if rpm -q redhat-release-server >/dev/null 2>&1; then
  rpm -e --nodeps redhat-release-server
fi

rpm -Uhv --force --nodeps ./*.rpm
yum upgrade -y

# -y is required: without it yum asks for confirmation on stdin.
if rpm -q subscription-manager >/dev/null 2>&1; then
  yum remove -y subscription-manager
fi
172.17.77.20:8081
#!/bin/sh
### Setup script that automates commands listed in the Markdown file
### ON EACH MASTER AND WORKER NODE:
###
### Installs git/bzip2 (plus wget/unzip, which this script itself needs),
### the Oracle JDK, Anaconda Python 3 and Apache Maven -- each only when it is
### not already present -- then unpacks a prebuilt Flink into ~/flink and
### downloads the cluster configuration files from the gist.
### Uses POSIX sh constructs only, because it is run through `sh`.

GIST_RAW=https://gist.github.com/geofbot/3ffc9b21214174ae750cc3fdb2625b71/raw

JDK_VER_MAJ=8
JDK_VER_MIN=92
# Build tag that is part of Oracle's download path for this JDK update.
JDK_BUILD=b14
JDK_RPM="jdk-${JDK_VER_MAJ}u${JDK_VER_MIN}-linux-x64.rpm"
JDK_HOME="/usr/java/jdk1.${JDK_VER_MAJ}.0_${JDK_VER_MIN}"

ANACONDA_SH=Anaconda3-4.1.1-Linux-x86_64.sh
ANACONDA_PREFIX=/opt/anaconda3

MAVEN_VER=3.3.9
MAVEN_ZIP="apache-maven-${MAVEN_VER}-bin.zip"

# Print a message on stderr and abort the setup.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Download one Flink config file from the gist into the current directory.
# $1 - file name, both in the gist and locally
fetch_conf() {
  wget -O "$1" "${GIST_RAW}/$1" || die "download of $1 failed"
  echo >> "$1" # No trailing newline from wget?
}

# All downloads below land in the home directory.
cd ~ || die "cannot cd to home directory"

## Enable CentOS repositories if this node is still RedHat
if rpm -q redhat-release-server; then
  # -f: do not pipe an HTTP error page into a root shell.
  curl -fL "${GIST_RAW}/convert_to_CentOS.sh" | sudo bash
fi

## Install git and bzip2, plus the wget/unzip tools used further down
sudo yum install git bzip2 wget unzip -y

## Install JDK 8 and Anaconda Python
if [ ! -d "$JDK_HOME" ]; then
  wget --no-cookies --no-check-certificate --header "Cookie: gpw_e24=http%3A%2F%2Fwww.oracle.com%2F; oraclelicense=accept-securebackup-cookie" "http://download.oracle.com/otn-pub/java/jdk/${JDK_VER_MAJ}u${JDK_VER_MIN}-${JDK_BUILD}/${JDK_RPM}" \
    || die "JDK download failed"
  sudo yum localinstall "$JDK_RPM" -y
  rm -f "$JDK_RPM"
  sudo alternatives --set java "${JDK_HOME}/jre/bin/java"
fi

if ! command -v python3 >/dev/null 2>&1; then
  wget "http://repo.continuum.io/archive/${ANACONDA_SH}" \
    || die "Anaconda download failed"
  sudo bash "$ANACONDA_SH" -b -p "$ANACONDA_PREFIX"
  # Persist the PATH change for future login shells ...
  echo "export PATH=${ANACONDA_PREFIX}/bin:\$PATH" >> ~/.bashrc
  # ... and apply it to this shell directly. `source` is not available in
  # POSIX sh, so ~/.bashrc is not re-read here.
  PATH="${ANACONDA_PREFIX}/bin:${PATH}"
  export PATH
  rm -f "$ANACONDA_SH"
fi

## Setup Apache Maven
if [ ! -d "/opt/maven" ]; then
  wget "http://apache.osuosl.org/maven/maven-3/${MAVEN_VER}/binaries/${MAVEN_ZIP}" \
    || die "Maven download failed"
  unzip "$MAVEN_ZIP" || die "cannot unpack ${MAVEN_ZIP}"
  rm -f "$MAVEN_ZIP"
  sudo mv "apache-maven-${MAVEN_VER}/" /opt/maven
  # -f: replace a stale link left behind by an earlier, partial run.
  sudo ln -sf /opt/maven/bin/mvn /usr/bin/mvn
fi

## Build Apache Flink
# git clone https://github.com/GEOFBOT/flink flink-src
# cd flink-src
# # BlueData uses CDH 5.4.3 which has Hadoop 2.3.0+. We don't need to specify Hadoop version
# # because Flink uses Hadoop 2.3.0+ by default anyways.
# mvn clean install -DskipTests
# ln -s ~/flink-src/build-target ~/flink
cd ~ || die "cannot cd to home directory"
if [ ! -d "flink" ]; then
  wget https://github.com/GEOFBOT/flink/releases/download/iteration/flink-bulkiterations.tgz \
    || die "Flink download failed"
  tar xzvf flink-bulkiterations.tgz || die "cannot unpack flink-bulkiterations.tgz"
  rm -f flink-bulkiterations.tgz
fi

## Set up Flink config files
mkdir -p ~/flink/tmp
# Abort rather than scatter the config files into the wrong directory.
cd ~/flink/conf/ || die "$HOME/flink/conf is missing; Flink was not unpacked correctly"
# Modified configuration file
fetch_conf flink-conf.yaml
# List of worker node IPs
fetch_conf masters
fetch_conf slaves

## Repeat process for each worker
echo Repeat this process on the master node and on each worker node.
## Set up private keys
echo Remember to set up .ssh/config on the master node so that it can control the workers.
## Start Flink on the master node
# ~/flink/bin/start-cluster.sh
172.17.77.21
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment