Tanzir Musabbir tmusabbir

## livy-example.sh
# Baseline: submit the SparkPi sample application directly with spark-submit.
spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/examples/jars/spark-examples.jar

# Submit the same job over REST from the EMR master node by POSTing a batch
# definition to port 8998 (assumes the jar file is in the /test folder).
curl -X POST \
  -H "Content-Type: application/json" \
  --data '{"file": "/test/spark-examples.jar", "className": "org.apache.spark.examples.SparkPi"}' \
  "localhost:8998/batches"

# The previous example targets localhost because the job was submitted from
# the same host. To submit from a remote location, export EMR_MASTER_DNS with
# the DNS name of your EMR master node first.
# The host is read from a variable (and the URL is quoted) because a literal
# <<placeholder>> would be parsed by the shell as a here-document followed by
# an append redirection, so curl would never receive a URL.
curl -X POST \
  -H "Content-Type: application/json" \
  --data '{"file": "/test/spark-examples.jar", "className": "org.apache.spark.examples.SparkPi"}' \
  "${EMR_MASTER_DNS:?set EMR_MASTER_DNS to the DNS name of your EMR master node}:8998/batches"

# Now assume the jar file is in S3 location, in that case, you can follow this:

## capacity-scheduler.json
{
        "Classification": "capacity-scheduler",
        "Properties": {
          "yarn.scheduler.capacity.root.queues": "default,dev,qa",
          "yarn.scheduler.capacity.root.default.capacity": "20",
          "yarn.scheduler.capacity.root.default.maximum-capacity": "50",
          "yarn.scheduler.capacity.root.dev.capacity": "40",
          "yarn.scheduler.capacity.root.dev.maximum-capacity": "100",
          "yarn.scheduler.capacity.root.qa.capacity": "40",
          "yarn.scheduler.capacity.root.qa.maximum-capacity": "80"

## create-spark-cluster.sh
aws emr create-cluster --auto-scaling-role EMR_AutoScaling_DefaultRole --termination-protected --applications Name=Hadoop Name=Hive Name=Spark --ebs-root-volume-size 10 --ec2-attributes '{"InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-xxxx","EmrManagedSlaveSecurityGroup":"sg-xxxxx","EmrManagedMasterSecurityGroup":"sg-xxxxx"}' --service-role EMR_DefaultRole --enable-debugging --release-label emr-5.12.0 --log-uri 's3n://aws-logs-xxxx/elasticmapreduce/' --name 'spark-cluster' --instance-groups '[{"InstanceCount":2,"BidPrice":"0.30","AutoScalingPolicy":{"Constraints":{"MinCapacity":0,"MaxCapacity":20},"Rules":[{"Action":{"SimpleScalingPolicyConfiguration":{"ScalingAdjustment":2,"CoolDown":300,"AdjustmentType":"CHANGE_IN_CAPACITY"}},"Description":"","Trigger":{"CloudWatchAlarmDefinition":{"MetricName":"YARNMemoryAvailablePercentage","ComparisonOperator":"LESS_THAN","Statistic":"AVERAGE","Period":300,"Dimensions":[{"Value":"${emr.clusterId}","Key":"JobFlowId"}],"EvaluationPeriods":1,"Unit":"PERCENT","Na

## Hive archive with Oozie
Hive Archiving/Maintenance with the help of Oozie

## a#Cassandra Performance Tuning
Cassandra Performance Tuning

## a#Cassandra Stress Test
Cassandra Stress Test

## a:Install Kafka in CentOS
Install Kafka in CentOS

## a:Install Replicated ZooKeeper in CentOS
Install Replicated ZooKeeper in CentOS

## a:Install Opscenter in CentOS environment
Install Opscenter in CentOS environment

## a:Setup a Storm cluster on Amazon EC2
Setup a Storm cluster on Amazon EC2
	# Baseline: submit the SparkPi sample application directly with spark-submit.
	spark-submit --class org.apache.spark.examples.SparkPi /usr/lib/spark/examples/jars/spark-examples.jar

	# Submit the same job over REST from the EMR master node by POSTing a batch
	# definition to port 8998 (assumes the jar file is in the /test folder).
	curl -X POST \
	  -H "Content-Type: application/json" \
	  --data '{"file": "/test/spark-examples.jar", "className": "org.apache.spark.examples.SparkPi"}' \
	  "localhost:8998/batches"

	# The previous example targets localhost because the job was submitted from
	# the same host. To submit from a remote location, export EMR_MASTER_DNS with
	# the DNS name of your EMR master node first.
	# The host is read from a variable (and the URL is quoted) because a literal
	# <<placeholder>> would be parsed by the shell as a here-document followed by
	# an append redirection, so curl would never receive a URL.
	curl -X POST \
	  -H "Content-Type: application/json" \
	  --data '{"file": "/test/spark-examples.jar", "className": "org.apache.spark.examples.SparkPi"}' \
	  "${EMR_MASTER_DNS:?set EMR_MASTER_DNS to the DNS name of your EMR master node}:8998/batches"

	# Now assume the jar file is in S3 location, in that case, you can follow this:
	{
	"Classification": "capacity-scheduler",
	"Properties": {
	"yarn.scheduler.capacity.root.queues": "default,dev,qa",
	"yarn.scheduler.capacity.root.default.capacity": "20",
	"yarn.scheduler.capacity.root.default.maximum-capacity": "50",
	"yarn.scheduler.capacity.root.dev.capacity": "40",
	"yarn.scheduler.capacity.root.dev.maximum-capacity": "100",
	"yarn.scheduler.capacity.root.qa.capacity": "40",
	"yarn.scheduler.capacity.root.qa.maximum-capacity": "80"