- Start an EMR cluster
Provision enough disk space to build Docker images, and use a single core node so debugging stays simple.
Also, let's configure the basics of our environment.
# Identity first — fill these in; the derived values below depend on them.
# NOTE: REGION must match the region the cluster is created in.
ACCOUNT_ID="<ACCOUNT_ID>"
REGION="<REGION>"
# SSH keypair name and subnet for the cluster's EC2 instances.
KEYPAIR="<ssh_keypair>"
SUBNET_ID="<subnet_id>"
# Derived: default EMR log bucket, and the step script that raises Livy timeouts.
LOG_BUCKET="aws-logs-${ACCOUNT_ID}-${REGION}"
LIVY_STEP_SCRIPT="s3://<BUCKET>/artifacts/steps/change_livy_timeout.sh"
# Launch an EMR 6.3 cluster set up for Docker-on-YARN Spark:
# - 100 GB root volume plus extra EBS on the core node for image builds
# - Spark, Livy, and Jupyter Enterprise Gateway installed
# - ECR registered as a trusted (and privileged) Docker registry
# - a script-runner step that raises Livy's session timeout to 16h
# Uses ${REGION} consistently (the original hardcoded us-west-2 for --region)
# and quotes every shell expansion, including the ones spliced into JSON.
aws emr create-cluster --name "emr-docker-spark" \
--region "${REGION}" \
--release-label emr-6.3.0 \
--ebs-root-volume-size 100 \
--enable-debugging \
--log-uri "s3n://${LOG_BUCKET}/elasticmapreduce/" \
--applications Name=Spark Name=Livy Name=JupyterEnterpriseGateway \
--ec2-attributes "KeyName=${KEYPAIR},SubnetId=${SUBNET_ID}" \
--use-default-roles \
--steps '[{"Type":"CUSTOM_JAR","Name":"IncreaseLivySessionTimeout","ActionOnFailure":"CONTINUE","Jar":"s3://'"${REGION}"'.elasticmapreduce/libs/script-runner/script-runner.jar","Args":["'"${LIVY_STEP_SCRIPT}"'"]}]' \
--instance-groups '[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":75,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"m5.xlarge","Name":"CORE"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"m5.xlarge","Name":"MASTER"}]' \
--configurations '[
{
"Classification": "container-executor",
"Configurations": [
{
"Classification": "docker",
"Properties": {
"docker.trusted.registries": "local,'"${ACCOUNT_ID}"'.dkr.ecr.'"${REGION}"'.amazonaws.com",
"docker.privileged-containers.registries": "local,'"${ACCOUNT_ID}"'.dkr.ecr.'"${REGION}"'.amazonaws.com"
}
}
]
},
{
"Classification":"livy-conf",
"Properties":{
"livy.spark.master":"yarn",
"livy.server.session.timeout":"16h",
"livy.server.yarn.app-lookup-timeout": "600s",
"livy.rsc.server.connect.timeout": "600s"
}
},
{
"Classification":"spark-defaults",
"Properties":{
"spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE":"docker",
"spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE":"docker",
"spark.executor.instances":"2",
"spark.pyspark.virtualenv.enabled": "false"
}
}
]'
- SSH into a core node and build/push our Python 3.6 image
# Build the Python 3.6 TF image locally, tag it for ECR, authenticate, and push.
# The ECR URI is hoisted into one variable instead of being repeated four times,
# and the login region matches the <REGION> placeholder (was hardcoded us-west-2).
ECR_REGISTRY="<ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com"
ECR_IMAGE="${ECR_REGISTRY}/emr-docker-nvidia-tf1:nvidia-tf1-example-py36"
sudo docker build -t local/nvidia-tf1-example-py36 -f nvidia-py36.dockerfile .
sudo docker tag local/nvidia-tf1-example-py36 "${ECR_IMAGE}"
aws ecr get-login-password --region "<REGION>" | sudo docker login --username AWS --password-stdin "${ECR_REGISTRY}"
sudo docker push "${ECR_IMAGE}"
- Configure our Notebook
%%configure -f
{
  "conf": {
    "spark.submit.deployMode": "cluster",
    "spark.yarn.am.waitTime": "300s",
    "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE": "<ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com/emr-docker-nvidia-tf1:nvidia-tf1-example-py36",
    "spark.yarn.appMasterEnv.JAVA_HOME": "/usr",
    "spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE": "<ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com/emr-docker-nvidia-tf1:nvidia-tf1-example-py36",
    "spark.executorEnv.JAVA_HOME": "/usr"
  }
}
- Try to import tensorflow
# Smoke test: confirm the TensorFlow build baked into the Docker image is
# importable from the Spark session and report its version.
import tensorflow as tf
print(tf.__version__)
- Run as a job
From the primary node:
# Submit imports.py on YARN in cluster mode; both the application master and
# the executors run inside the ECR Docker image. JAVA_HOME is overridden to
# /usr for the containerized runtime (presumably where the image provides
# Java — confirm against the Dockerfile). All expansions are quoted.
DOCKER_IMAGE_NAME="<ACCOUNT_ID>.dkr.ecr.<REGION>.amazonaws.com/emr-docker-nvidia-tf1:nvidia-tf1-example-py36"
spark-submit --master yarn \
--deploy-mode cluster \
--conf spark.executorEnv.YARN_CONTAINER_RUNTIME_TYPE=docker \
--conf "spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=${DOCKER_IMAGE_NAME}" \
--conf spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_TYPE=docker \
--conf "spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=${DOCKER_IMAGE_NAME}" \
--conf spark.executorEnv.JAVA_HOME=/usr \
--conf spark.yarn.appMasterEnv.JAVA_HOME=/usr \
--num-executors 2 \
imports.py -v