Setting up HDFS on AWS

On each node:

Set up packages and install Hadoop:

#!/bin/bash

# Install Java 8 and basic tools
sudo yum install java-1.8.0-openjdk-devel wget git bzip2 -y

# Point JAVA_HOME at the system JDK so Hadoop can find it
echo 'export JAVA_HOME=/usr/lib/jvm/java' >> ~/.bashrc
source ~/.bashrc

# Download and unpack Hadoop 2.7.2 into the home directory
cd ~
wget http://www.gtlib.gatech.edu/pub/apache/hadoop/core/hadoop-2.7.2/hadoop-2.7.2.tar.gz
tar zxf hadoop-2.7.2.tar.gz
rm hadoop-2.7.2.tar.gz
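
A quick sanity check on each node (assuming the install locations above):

java -version
~/hadoop-2.7.2/bin/hadoop version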

On master node:

Set up hadoop-2.7.2/etc/hadoop/ conf files:

core-site.xml (fs.defaultFS should point at the master node's private IP):

<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://172.31.14.106:9000/</value>
  </property>
</configuration>
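
Workers must be able to reach the NameNode on port 9000, so the security group needs to allow intra-cluster traffic on that port. A quick connectivity check from a worker (assuming nc is available; sudo yum install nmap-ncat -y if not):

nc -zv 172.31.14.106 9000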

hdfs-site.xml:

<configuration>
  <property>
    <name>dfs.replication</name>
    <!-- http://docs.aws.amazon.com/ElasticMapReduce/latest/DeveloperGuide/HDFSConfig_H2.html -->
    <value>1</value>  <!-- per AWS recommendations (sort of) -->
  </property>
</configuration>
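
By default HDFS keeps NameNode metadata and DataNode blocks under /tmp, which does not survive a reboot. Optionally, hdfs-site.xml can pin those directories somewhere durable; the paths below are illustrative, not part of the original setup:

<property>
  <name>dfs.namenode.name.dir</name>
  <value>file:///home/ec2-user/hdfs/name</value>
</property>
<property>
  <name>dfs.datanode.data.dir</name>
  <value>file:///home/ec2-user/hdfs/data</value>
</property>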

slaves (one worker private IP per line):

172.31.46.51
172.31.46.52
172.31.46.53
172.31.46.54

Set up SSH access from the master node to the workers by pointing ~/.ssh/config at the cluster key:

Host *
    IdentityFile ~/.ssh/key.pem
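
Optionally (an assumption, not from the original steps), adding StrictHostKeyChecking no to the same Host * block avoids interactive host-key prompts the first time start-dfs.sh connects to each worker:

Host *
    IdentityFile ~/.ssh/key.pem
    StrictHostKeyChecking no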

Copy the config files to the worker machines (run from ~/hadoop-2.7.2/etc/hadoop/ so that * matches the conf files and the slaves file is in the current directory):

while read slave; do scp * ${slave}:~/hadoop-2.7.2/etc/hadoop/; done < slaves

Format the NameNode and start HDFS (run from ~/hadoop-2.7.2/):

bin/hdfs namenode -format
sbin/start-dfs.sh
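
To verify the cluster came up (a quick check with standard JDK/Hadoop tools, not part of the original steps), jps on the master should show a NameNode, and the dfsadmin report should list each worker as a live DataNode:

jps
bin/hdfs dfsadmin -report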