Skip to content

Instantly share code, notes, and snippets.

@sameerz
Created October 17, 2020 01:38
Show Gist options
  • Save sameerz/d655ff262a295e66b3384fcbd15b67dd to your computer and use it in GitHub Desktop.
GCP Dataproc spark-rapids initialization script
#!/bin/bash
# Create a GCP Dataproc cluster for RAPIDS-accelerated Spark, with an
# NVIDIA T4 GPU attached to the master node AND to each worker node.
# The Dataproc-provided initialization actions install the GPU driver
# and the RAPIDS runtime; Jupyter/Zeppelin are enabled via Component Gateway.
set -euxo pipefail

# --- user-supplied settings (edit before running) ---------------------------
export REGION=europe-west4
export ZONE=europe-west4-c
# Placeholders MUST be quoted: unquoted, the spaces in the bracketed text
# would make `export` treat each word as a separate variable name.
export GCS_BUCKET="[BUCKET CONTAINING MORTGAGE ETL DATA]"  # e.g. my-etl-bucket
export CLUSTER_NAME="[CLUSTER NAME]"                       # e.g. rapids-cluster
export NUM_GPUS=1     # T4 count per node (master and each worker)
export NUM_WORKERS=2  # number of worker nodes

# All expansions are quoted defensively (ShellCheck SC2086).
gcloud dataproc clusters create "${CLUSTER_NAME}" \
  --region "${REGION}" \
  --zone "${ZONE}" \
  --image-version=preview-ubuntu18 \
  --master-machine-type n1-standard-8 \
  --master-accelerator "type=nvidia-tesla-t4,count=${NUM_GPUS}" \
  --num-workers "${NUM_WORKERS}" \
  --worker-accelerator "type=nvidia-tesla-t4,count=${NUM_GPUS}" \
  --worker-machine-type n1-standard-8 \
  --num-worker-local-ssds 4 \
  --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh,gs://goog-dataproc-initialization-actions-${REGION}/rapids/rapids.sh" \
  --optional-components=JUPYTER,ZEPPELIN \
  --metadata gpu-driver-provider="NVIDIA" \
  --metadata rapids-runtime=SPARK \
  --bucket "${GCS_BUCKET}" \
  --enable-component-gateway \
  --properties="^#^spark:spark.yarn.unmanagedAM.enabled=false"
#!/bin/bash
# Create a GCP Dataproc cluster for RAPIDS-accelerated Spark with a
# CPU-only master (n1-standard-4, no accelerator) and T4 GPUs on each
# worker. Dataproc init actions install the GPU driver and RAPIDS runtime;
# Jupyter/Zeppelin are enabled via Component Gateway.
set -euxo pipefail

# --- user-supplied settings (edit before running) ---------------------------
export REGION=europe-west4
export ZONE=europe-west4-c
# Placeholders MUST be quoted: unquoted, the spaces in the bracketed text
# would make `export` treat each word as a separate variable name.
export GCS_BUCKET="[BUCKET CONTAINING MORTGAGE ETL DATA]"  # e.g. my-etl-bucket
export CLUSTER_NAME="[CLUSTER NAME]"                       # e.g. rapids-cluster
export NUM_GPUS=1     # T4 count per worker node
export NUM_WORKERS=2  # number of worker nodes

# All expansions are quoted defensively (ShellCheck SC2086).
gcloud dataproc clusters create "${CLUSTER_NAME}" \
  --region "${REGION}" \
  --zone "${ZONE}" \
  --image-version=preview-ubuntu18 \
  --master-machine-type n1-standard-4 \
  --num-workers "${NUM_WORKERS}" \
  --worker-accelerator "type=nvidia-tesla-t4,count=${NUM_GPUS}" \
  --worker-machine-type n1-standard-8 \
  --num-worker-local-ssds 4 \
  --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh,gs://goog-dataproc-initialization-actions-${REGION}/rapids/rapids.sh" \
  --optional-components=JUPYTER,ZEPPELIN \
  --metadata gpu-driver-provider="NVIDIA" \
  --metadata rapids-runtime=SPARK \
  --bucket "${GCS_BUCKET}" \
  --enable-component-gateway \
  --properties="^#^spark:spark.yarn.unmanagedAM.enabled=false"
#!/bin/bash
# Create a GCP Dataproc cluster for RAPIDS-accelerated Spark with T4 GPUs
# on the master and each worker, pinning CUDA 11.0 via the rapids-runtime
# metadata. Uses concrete bucket/cluster names rather than placeholders.
set -euxo pipefail

# --- settings ---------------------------------------------------------------
export REGION=europe-west4
export ZONE=europe-west4-c
export GCS_BUCKET=sraheja-test   # staging bucket for the cluster
export CLUSTER_NAME=sr-test9
export NUM_GPUS=1     # T4 count per node (master and each worker)
export NUM_WORKERS=2  # number of worker nodes

# All expansions are quoted defensively (ShellCheck SC2086); the current
# values contain no whitespace, so behavior is unchanged.
gcloud dataproc clusters create "${CLUSTER_NAME}" \
  --region "${REGION}" \
  --zone "${ZONE}" \
  --image-version=preview-ubuntu18 \
  --master-machine-type n1-standard-8 \
  --master-accelerator "type=nvidia-tesla-t4,count=${NUM_GPUS}" \
  --num-workers "${NUM_WORKERS}" \
  --worker-accelerator "type=nvidia-tesla-t4,count=${NUM_GPUS}" \
  --worker-machine-type n1-standard-8 \
  --num-worker-local-ssds 4 \
  --initialization-actions "gs://goog-dataproc-initialization-actions-${REGION}/gpu/install_gpu_driver.sh,gs://goog-dataproc-initialization-actions-${REGION}/rapids/rapids.sh" \
  --optional-components=JUPYTER,ZEPPELIN \
  --metadata gpu-driver-provider="NVIDIA" \
  --metadata rapids-runtime=SPARK,cuda-version=11.0 \
  --bucket "${GCS_BUCKET}" \
  --enable-component-gateway \
  --properties="^#^spark:spark.yarn.unmanagedAM.enabled=false"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment