Setup HDFS on Mesos, Run Spark Cluster dispatcher via Marathon

Setup Mesos-DNS

Scripts for setting it up:

sudo mkdir /etc/mesos-dns
sudo vi /etc/mesos-dns/config.json

config.json

# replace 10.16.67.153 in the zk URL below with the IP of your Mesos master
{
  "zk": "zk://10.16.67.153:2181/mesos",
  "refreshSeconds": 60,
  "ttl": 60,
  "domain": "mesos",
  "port": 53,
  "resolvers": ["169.254.169.254","10.0.0.1"],
  "timeout": 5,
  "email": "root.mesos-dns.mesos"
}
sudo docker pull mesosphere/mesos-dns
sudo docker run --net=host -d -v "/etc/mesos-dns/config.json:/config.json" mesosphere/mesos-dns /mesos-dns -config=/config.json
OR
docker run -d --name mesos-dns -p 53:53/udp -v /etc/mesos-dns/config.json:/config.json mesosphere/mesos-dns /mesos-dns -v 2 -config=/config.json



# point the host's resolver at the local Mesos-DNS instance, and keep the
# change across DHCP lease renewals
sudo sed -i "1s/^/nameserver $(hostname -i)\n/" /etc/resolv.conf
sudo sed -i "1s/^/prepend domain-name-servers $(hostname -i);\n/" /etc/dhcp/dhclient.conf

### Testing

sudo docker run --net=host tutum/dnsutils dig google.com
sudo docker run --net=host tutum/dnsutils dig master.mesos

Build HDFS

Clone the project: git clone https://github.com/mesosphere/hdfs

There are a couple of ways of making the configuration changes we need.

copy all of the XML files from hdfs/example-conf/mesosphere-dcos/ to hdfs/conf

modify conf/mesos-site.xml

  • set mesos.hdfs.native-hadoop-binaries to false
  • set mesos.native.library to /usr/local/lib/libmesos.so

example:

<property>
  <name>mesos.hdfs.native-hadoop-binaries</name>
  <value>false</value>
  <description>True if you have pre-installed Hadoop binaries</description>
</property>
<property>
  <name>mesos.native.library</name>
  <value>/usr/local/lib/libmesos.so</value>
</property>

From the hdfs directory, build: ./bin/build-hdfs

Copy the tarball to the master: scp hdfs-mesos-0.1.1.tgz root@$MESOS_MASTER:~

SSH to the master: ssh root@$MESOS_MASTER

Untar it: tar zxvf hdfs-mesos-*.tgz

Start HDFS: cd hdfs-mesos-0.1.1 && ./bin/hdfs-mesos
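Once the scheduler is up, you can sanity-check that the framework registered with the Mesos master (the state.json endpoint is standard; the grep pattern is just illustrative):

# look for the hdfs framework in the master's state
curl -s http://$MESOS_MASTER:5050/master/state.json | grep -o '"name":"[^"]*hdfs[^"]*"'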

### Purge pre-installed Hadoop binaries on the slaves and the master, if necessary

sudo aptitude purge hadoop hadoop-yarn hadoop-hdfs hadoop-hdfs-namenode hadoop-hdfs-datanode hadoop-0.20-mapreduce hadoop-0.20-mapreduce-jobtracker hadoop-0.20-mapreduce-tasktracker hadoop-mapreduce

### Delete the Hadoop directories on the slaves

sudo rm -rf /etc/hadoop /mnt/hdfs /var/lib/hadoop* /var/log/hadoop

### Check http://<active_namenode_ip>:50070 to see if it's running

Try:

hadoop fs -ls hdfs://hdfs/
hadoop fs -put file.txt hdfs://hdfs/

Load in some data with hadoop fs.
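For example, to push some test data in (the paths here are arbitrary; hdfs://hdfs/ is the nameservice this framework sets up):

# generate a 100 MB file of random bytes and load it into HDFS
dd if=/dev/urandom of=sample.bin bs=1M count=100
hadoop fs -mkdir -p hdfs://hdfs/user/test
hadoop fs -put sample.bin hdfs://hdfs/user/test/
hadoop fs -ls hdfs://hdfs/user/test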

Run spark jobs.

Spark terasort

https://github.com/ehiggs/spark-terasort/
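A sketch of running it in client mode against the Mesos master (class names and arguments follow the spark-terasort README; the jar name depends on your build, and the ZooKeeper URL assumes the Mesos-DNS setup above):

# generate 1 GB of input, then sort it, reading and writing HDFS
spark-submit \
  --class com.github.ehiggs.spark.terasort.TeraGen \
  --master mesos://zk://master.mesos:2181/mesos \
  spark-terasort-1.0-jar-with-dependencies.jar 1g hdfs://hdfs/terasort-in

spark-submit \
  --class com.github.ehiggs.spark.terasort.TeraSort \
  --master mesos://zk://master.mesos:2181/mesos \
  spark-terasort-1.0-jar-with-dependencies.jar hdfs://hdfs/terasort-in hdfs://hdfs/terasort-out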

### Testing for failover

### Run DFSIO first to exercise I/O (needs MapReduce installed)

### Run HiBench: https://github.com/intel-hadoop/HiBench

### Push it further

#### Constraints with HDFS ...

#### Resource reservation for the HDFS framework

#### Constraints with Spark ...

#### Launch both (Spark + HDFS) within the same rack. Use Marathon here; it makes more sense. A possible constraints stanza is sketched below.
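A minimal sketch of pinning a Marathon app to one rack, assuming the agents carry a rack_id attribute (the attribute name and value are assumptions; match them to your cluster):

{
  "id": "spark-dispatcher",
  "constraints": [["rack_id", "CLUSTER", "rack-1"]]
}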

#### Launch TPC-H via SparkSQL + TPC-DS via Impala: https://github.com/cloudera/impala-tpcds-kit

#!/bin/bash
# POST a Marathon app definition (JSON) to the Marathon API
if [ "$#" -ne 1 ]; then
  echo "Script needs a json file as argument"
  exit 1
fi
curl -X POST -H "Content-Type: application/json" 10.20.30.35:8080/v2/apps -d @"$1"
# http POST mesos.master:8080/v2/apps < "$1"
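Usage, assuming the script is saved as marathon-deploy.sh (the name is arbitrary):

chmod +x marathon-deploy.sh
./marathon-deploy.sh spark-dispatcher.json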
Marathon app definition for the Spark dispatcher, pointed at a fixed master IP (update the image to a released 1.5 tag when it's out):

{
  "id": "spark-dispatcher",
  "cpus": 2,
  "mem": 2048,
  "instances": 1,
  "cmd": "mv /mnt/mesos/sandbox/log4j.properties conf/log4j.properties && ./bin/spark-class org.apache.spark.deploy.mesos.MesosClusterDispatcher --port $PORT0 --webui-port $PORT1 --master mesos://10.20.30.7:5050 --zk 10.20.30.7:2181 --host $HOST --name spark",
  "uris": [
    "http://downloads.mesosphere.com.s3.amazonaws.com/assets/spark/log4j.properties"
  ],
  "ports": [0, 0],
  "container": {
    "type": "DOCKER",
    "docker": {
      "image": "mesosphere/spark:1.5.0-rc2-hdfs",
      "network": "HOST"
    }
  },
  "healthChecks": [
    {
      "path": "/",
      "portIndex": 1,
      "protocol": "HTTP",
      "gracePeriodSeconds": 5,
      "intervalSeconds": 60,
      "timeoutSeconds": 10,
      "maxConsecutiveFailures": 3
    }
  ]
}
The same dispatcher, resolving the master through Mesos-DNS instead of a hard-coded IP:

{
  "id": "spark-mesos-dispatcher",
  "cpus": 2,
  "mem": 2048,
  "instances": 1,
  "cmd": "mv /mnt/mesos/sandbox/log4j.properties conf/log4j.properties && ./bin/spark-class org.apache.spark.deploy.mesos.MesosClusterDispatcher --port $PORT0 --webui-port $PORT1 --master mesos://zk://master.mesos:2181/mesos --zk master.mesos:2181 --host $HOST --name spark",
  "uris": [
    "http://downloads.mesosphere.com.s3.amazonaws.com/assets/spark/log4j.properties"
  ],
  "ports": [0, 0],
  "container": {
    "type": "DOCKER",
    "docker": {
      "image": "mesosphere/spark:1.5.0-rc2-hdfs",
      "network": "HOST"
    }
  },
  "healthChecks": [
    {
      "path": "/",
      "portIndex": 1,
      "protocol": "HTTP",
      "gracePeriodSeconds": 5,
      "intervalSeconds": 60,
      "timeoutSeconds": 10,
      "maxConsecutiveFailures": 3
    }
  ]
}
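With the dispatcher registered, jobs can be submitted against it in cluster mode. A sketch (the host follows Mesos-DNS naming for the Marathon app above; replace 7077 with the port Marathon actually assigned, and the jar name with your build's):

# make the examples jar reachable from every node, then submit SparkPi
hadoop fs -put spark-examples-1.5.0-hadoop2.6.0.jar hdfs://hdfs/tmp/
spark-submit \
  --class org.apache.spark.examples.SparkPi \
  --master mesos://spark-mesos-dispatcher.marathon.mesos:7077 \
  --deploy-mode cluster \
  hdfs://hdfs/tmp/spark-examples-1.5.0-hadoop2.6.0.jar 100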
And a spark-notebook app for interactive work against the same cluster:

{
  "id": "spark-notebook",
  "cpus": 0.5,
  "mem": 3500,
  "instances": 1,
  "container": {
    "type": "DOCKER",
    "docker": {
      "image": "andypetrella/spark-notebook:0.6.1-scala-2.10.4-spark-1.5.0-hadoop-2.6.0-cdh5.4.4-with-hive-with-parquet",
      "network": "HOST",
      "privileged": true
    }
  },
  "healthChecks": [
    {
      "protocol": "HTTP",
      "portIndex": 0,
      "path": "/",
      "gracePeriodSeconds": 5,
      "intervalSeconds": 20,
      "maxConsecutiveFailures": 3
    }
  ],
  "ports": [0, 0]
}