Skip to content

Instantly share code, notes, and snippets.

@pottava
Last active June 5, 2023 16:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pottava/a20da127883d2bc785c5c80788309926 to your computer and use it in GitHub Desktop.
Save pottava/a20da127883d2bc785c5c80788309926 to your computer and use it in GitHub Desktop.
# Blueprint header and deployment-wide variables.
blueprint_name: gpus

vars:
  # Required: fill in both values below before deploying. As written, the
  # trailing "## ... ##" text is a YAML comment, so each key parses as null.
  project_id:  ## Set Google Cloud project ID Here ##
  deployment_name:  ## Set a deployment name Here ##
  region: asia-northeast1
  zone: asia-northeast1-a
  # Referenced as $(vars.gpu_zones) by the partition modules in this blueprint.
  gpu_zones: [asia-northeast1-a, asia-northeast1-c]
  # NOTE(review): presumably picked up by the Slurm modules as same-named
  # input variables - confirm against the module documentation.
  enable_reconfigure: true
  enable_cleanup_compute: true
  labels:
    project: gpu-poc
# Single deployment group: one VPC, three A100 node-group/partition pairs
# (1, 2 and 4 GPUs per node), a Slurm controller, a login node and a
# monitoring dashboard.
deployment_groups:
- group: primary
  modules:
  - id: network
    source: modules/network/vpc

  # --- Partition "a1": a2-highgpu-1g nodes, one A100 each (default) ---
  - id: node_group_a100_1
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      name: nga1
      machine_type: a2-highgpu-1g
      bandwidth_tier: gvnic_enabled
      gpu:
        type: nvidia-tesla-a100
        count: 1
      node_count_dynamic_max: 8
      disk_type: pd-ssd
      disk_size_gb: 100

  - id: compute_partition_a100_1
    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
    use:
    - network
    - node_group_a100_1
    settings:
      partition_name: a1
      # Only this partition sets is_default.
      is_default: true
      zones: $(vars.gpu_zones)

  # --- Partition "a2": a2-highgpu-2g nodes, two A100s each ---
  - id: node_group_a100_2
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      name: nga2
      machine_type: a2-highgpu-2g
      bandwidth_tier: gvnic_enabled
      gpu:
        type: nvidia-tesla-a100
        count: 2
      node_count_dynamic_max: 8
      disk_type: pd-ssd
      disk_size_gb: 100

  - id: compute_partition_a100_2
    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
    use:
    - network
    - node_group_a100_2
    settings:
      partition_name: a2
      zones: $(vars.gpu_zones)

  # --- Partition "a4": a2-highgpu-4g nodes, four A100s each ---
  - id: node_group_a100_4
    source: community/modules/compute/schedmd-slurm-gcp-v5-node-group
    settings:
      name: nga4
      machine_type: a2-highgpu-4g
      bandwidth_tier: gvnic_enabled
      gpu:
        type: nvidia-tesla-a100
        count: 4
      node_count_dynamic_max: 8
      disk_type: pd-ssd
      disk_size_gb: 100

  - id: compute_partition_a100_4
    source: community/modules/compute/schedmd-slurm-gcp-v5-partition
    use:
    - network
    - node_group_a100_4
    settings:
      partition_name: a4
      zones: $(vars.gpu_zones)

  # --- Slurm controller: wired to the network and all three partitions ---
  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
    use:
    - network
    - compute_partition_a100_1
    - compute_partition_a100_2
    - compute_partition_a100_4
    settings:
      machine_type: n2-standard-8
      disk_size_gb: 500
      # https://slurm.schedmd.com/elastic_computing.html
      # https://slurm.schedmd.com/power_save.html
      # NOTE(review): per the power-save page linked above, a resume/suspend
      # rate of 0 should mean "no rate limit" - confirm for the Slurm version
      # in use.
      cloud_parameters:
        resume_rate: 0
        resume_timeout: 600
        suspend_rate: 0
        suspend_timeout: 300
        no_comma_params: false
      disable_controller_public_ips: true

  # --- Login node: unlike the controller, public IPs are not disabled ---
  - id: slurm_login
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-login
    use:
    - network
    - slurm_controller
    settings:
      machine_type: n2-standard-4
      disable_login_public_ips: false

  - id: hpc_dashboard
    source: modules/monitoring/dashboard
    outputs: [instructions]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment