Skip to content

Instantly share code, notes, and snippets.

@rom1504
Last active December 4, 2024 04:39
Show Gist options
  • Save rom1504/474f97a95a526d40ae44a3fc3c657a2e to your computer and use it in GitHub Desktop.
Save rom1504/474f97a95a526d40ae44a3fc3c657a2e to your computer and use it in GitHub Desktop.
distributed dalle2 laion
#!/usr/bin/env bash
# Per-node launcher for the DALLE2 decoder training run.
#
# Prints environment diagnostics, derives this node's machine rank from its
# position in HOSTNAMES, activates the Python virtualenv and starts
# `accelerate launch` on train_decoder.py.
#
# Required environment (HOSTNAMES MASTER_ADDR MASTER_PORT COUNT_NODE are
# coming from the main script):
#   HOSTNAMES    whitespace-separated (spaces or newlines) list of node names
#   MASTER_ADDR  passed to accelerate as --main_process_ip
#   MASTER_PORT  passed to accelerate as --main_process_port
#   COUNT_NODE   number of nodes, passed as --num_machines
# Optional environment:
#   GPUS_PER_NODE  processes started per node (default: 8)
#
# Exits non-zero with a message on stderr when a prerequisite is missing.
#sleep 30
#fi_info -p efa -t FI_EP_RDM

# Print an error message to stderr and abort the launcher.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

this_host=$(hostname)

# Split HOSTNAMES on any whitespace into an array. `read -d ''` returns
# non-zero at end of input even on success, hence the `|| true`.
node_list=()
read -r -d '' -a node_list <<< "${HOSTNAMES:-}" || true

# Diagnostics, printed before validation so they are available in the job log
# even when the launcher aborts below.
echo "myuser=$(whoami)"
echo "COUNT_NODE=${COUNT_NODE:-}"
echo "LD_LIBRARY_PATH = ${LD_LIBRARY_PATH:-}"
echo "PATH = $PATH"
echo "which mpicc $(command -v mpicc)"
echo "HOSTNAMES = ${node_list[*]}"
echo "hostname = $this_host"
echo "MASTER_ADDR= ${MASTER_ADDR:-}"
echo "MASTER_PORT= ${MASTER_PORT:-}"

# Validate the inputs that end up on the accelerate command line; an empty
# value would otherwise silently shift the following arguments.
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
[[ "${COUNT_NODE:-}" =~ ^[1-9][0-9]*$ ]] \
  || die "COUNT_NODE must be a positive integer, got '${COUNT_NODE:-}'"
[[ "$GPUS_PER_NODE" =~ ^[1-9][0-9]*$ ]] \
  || die "GPUS_PER_NODE must be a positive integer, got '$GPUS_PER_NODE'"
[[ -n "${MASTER_ADDR:-}" ]] || die "MASTER_ADDR is not set"
[[ -n "${MASTER_PORT:-}" ]] || die "MASTER_PORT is not set"

# The machine rank is the zero-based index of this host in HOSTNAMES.
THEID=
for idx in "${!node_list[@]}"; do
  if [[ "${node_list[idx]}" == "$this_host" ]]; then
    THEID=$idx
    break
  fi
done
echo "THEID=$THEID"
[[ -n "$THEID" ]] \
  || die "host '$this_host' not found in HOSTNAMES (${node_list[*]}); cannot determine machine rank"

# shellcheck disable=SC1091 # the virtualenv only exists on the cluster
source /fsx/dalle2/.dalle_env_38/bin/activate \
  || die "cannot activate virtualenv /fsx/dalle2/.dalle_env_38"
echo "python3 version = $(python3 --version)"

# Fail early, with a clear message, if torch is not importable in this env.
python -c "import torch" || die "python cannot import torch"

accelerate launch \
  --num_processes "$(( GPUS_PER_NODE * COUNT_NODE ))" \
  --num_machines "$COUNT_NODE" \
  --multi_gpu \
  --mixed_precision fp16 \
  --machine_rank "$THEID" \
  --main_process_ip "$MASTER_ADDR" \
  --main_process_port "$MASTER_PORT" \
  /fsx/dalle2/DALLE2-pytorch/train_decoder.py \
  --config_file /fsx/dalle2/testing_config.json
#!/bin/bash
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=intelmpi_test
#SBATCH --nodes 4
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"
#SBATCH --exclusive
#
# Main Slurm batch script.
#
# Sets up the Intel MPI / EFA / NCCL environment, derives the node list, master
# address and node count from SLURM_JOB_NODELIST, exports them for the sub
# script, and starts /fsx/dalle2/start_in_container.sh once per node through
# mpirun.
#
# Exported for the sub script: HOSTNAMES MASTER_ADDR MASTER_PORT COUNT_NODE
# Exits non-zero with a message on stderr if the node list cannot be resolved
# or mpirun is not available.

# Print an error message to stderr and abort the job script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Either of these may provide mpirun; neither failure is fatal on its own,
# mpirun availability is checked right before it is used.
module load intelmpi
# shellcheck disable=SC1091 # only present on the cluster
source /opt/intel/mpi/latest/env/vars.sh

# Prepend the EFA / NCCL library directories. The previous value is appended
# only when non-empty: a trailing ':' would add an empty entry, which the
# dynamic loader treats as the current working directory.
export LD_LIBRARY_PATH="/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export NCCL_PROTO=simple
export PATH="/opt/amazon/efa/bin:$PATH"
export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # use for p4dn
#export NCCL_ALGO=ring
export NCCL_DEBUG=info
#export NCCL_DEBUG_SUBSYS=INIT,ENV,GRAPH,COLL
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
#export NCCL_P2P_DISABLE=1
#export NCCL_IBEXT_DISABLE=1
#export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"

# sent to sub script
[[ -n "${SLURM_JOB_NODELIST:-}" ]] \
  || die "SLURM_JOB_NODELIST is not set; submit this script with sbatch"

# Resolve the node list once. Assignment is kept separate from `export` so a
# scontrol failure is not masked by export's own exit status.
HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST") \
  || die "scontrol could not expand node list '$SLURM_JOB_NODELIST'"
[[ -n "$HOSTNAMES" ]] || die "scontrol returned an empty host list"

# One hostname per line; the first node acts as master.
mapfile -t node_list <<< "$HOSTNAMES"
MASTER_ADDR=${node_list[0]}
COUNT_NODE=${#node_list[@]}

export HOSTNAMES
export MASTER_ADDR
export MASTER_PORT=12802
export COUNT_NODE

echo "go $COUNT_NODE"
echo "${node_list[*]}"

command -v mpirun > /dev/null \
  || die "mpirun not found in PATH after loading the Intel MPI environment"
mpirun -n "$COUNT_NODE" -perhost 1 /fsx/dalle2/start_in_container.sh
{
"seed": 1,
"decoder": {
"unets": [
{
"dim": 256,
"cond_dim": 512,
"image_embed_dim": 768,
"text_embed_dim": 768,
"cond_on_text_encodings": true,
"channels": 3,
"dim_mults": [1, 2, 3, 4],
"num_resnet_blocks": 4,
"attn_heads": 8,
"attn_dim_head": 64,
"sparse_attn": true,
"memory_efficient": true,
"self_attn": [false, true, true, true]
}
],
"clip": {
"make": "openai",
"model": "ViT-L/14"
},
"image_sizes": [64],
"channels": 3,
"timesteps": 1000,
"loss_type": "l2",
"beta_schedule": ["cosine"],
"learned_variance": true
},
"data": {
"webdataset_base_url": "pipe:aws s3 cp --quiet s3://s-datasets/laion-aesthetic/data/laion2B-en-aesthetic/{}.tar -",
"embeddings_url": "s3://s-datasets/laion-aesthetic/ordered_embeddings/",
"num_workers": 12,
"batch_size": 60,
"start_shard": 0,
"end_shard": 5247,
"shard_width": 5,
"index_width": 4,
"splits": {
"train": 0.75,
"val": 0.15,
"test": 0.1
},
"shuffle_train": false,
"resample_train": true,
"preprocessing": {
"RandomResizedCrop": {
"size": [64, 64],
"scale": [0.75, 1.0],
"ratio": [1.0, 1.0]
},
"ToTensor": true
}
},
"train": {
"epochs": 1000,
"lr": 1e-4,
"wd": 0.01,
"max_grad_norm": 0.5,
"save_every_n_samples": 2000000,
"n_sample_images": 10,
"device": "cuda:0",
"epoch_samples": 8000000,
"validation_samples": 100000,
"use_ema": true,
"ema_beta": 0.99,
"save_all": false,
"save_latest": true,
"save_best": true,
"unet_training_mask": [true]
},
"evaluate": {
"n_evaluation_samples": 30,
"FID": {
"feature": 64
},
"LPIPS": {
"net_type": "vgg",
"reduction": "mean"
}
},
"tracker": {
"data_path": ".tracker-testing",
"overwrite_data_path": true,
"log": {
"log_type": "wandb",
"wandb_entity": "rom1504",
"wandb_project": "dalle2_train_decoder",
"wandb_resume": false,
"verbose": true
},
"load": {
"resume": false,
"load_from": null
},
"save": [{
"save_to": "wandb"
}]
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment