distributed dalle2 laion: SLURM launch script, per-node start script, and decoder training config for DALLE2-pytorch on LAION data.
#!/usr/bin/env bash
# start_in_container.sh: runs on every node (one copy per node, started by mpirun
# from the sbatch script below) and launches accelerate with this node's rank.
#sleep 30
#fi_info -p efa -t FI_EP_RDM
# HOSTNAMES, MASTER_ADDR, MASTER_PORT and COUNT_NODE are exported by the sbatch script
# dump the environment this node sees, for debugging
echo myuser=`whoami`
echo COUNT_NODE=$COUNT_NODE
echo LD_LIBRARY_PATH=$LD_LIBRARY_PATH
echo PATH=$PATH
echo mpicc=`which mpicc`
echo HOSTNAMES=$HOSTNAMES
echo hostname=`hostname`
echo MASTER_ADDR=$MASTER_ADDR
echo MASTER_PORT=$MASTER_PORT
H=`hostname`
# find this node's index in the HOSTNAMES list; that index is the machine rank
# passed to accelerate as --machine_rank
THEID=`echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]"`
echo THEID=$THEID
source /fsx/dalle2/.dalle_env_38/bin/activate
echo python3 version=`python3 --version`
# quick sanity check that torch imports inside the activated virtualenv
python3 -c "import torch"
accelerate launch --num_processes $(( 8 * $COUNT_NODE )) --num_machines $COUNT_NODE --multi_gpu --mixed_precision fp16 --machine_rank $THEID --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT /fsx/dalle2/DALLE2-pytorch/train_decoder.py --config_file /fsx/dalle2/testing_config.json
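As a quick sanity check (not part of the original gist), the rank lookup above can be exercised locally with fabricated hostnames; the node names below are placeholders:

# sketch only: verify the rank lookup with made-up hostnames
HOSTNAMES="node-001 node-002 node-003 node-004"
H="node-003"
echo -e $HOSTNAMES | python3 -c "import sys;[sys.stdout.write(str(i)) for i,line in enumerate(next(sys.stdin).split(' ')) if line.strip() == '$H'.strip()]"
# prints 2, the zero-based machine rank that accelerate receives via --machine_rank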
#!/bin/bash
# sbatch launcher: allocates 4 GPU nodes, sets up the EFA/NCCL environment,
# then runs start_in_container.sh once per node via mpirun
#SBATCH --partition=compute-od-gpu
#SBATCH --job-name=intelmpi_test
#SBATCH --nodes 4
#SBATCH --ntasks-per-node 1
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:8
#SBATCH --output=%x_%j.out
#SBATCH --comment "Key=Monitoring,Value=ON"
#SBATCH --exclusive
module load intelmpi
source /opt/intel/mpi/latest/env/vars.sh
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.0/efa/lib:/usr/local/cuda-11.0/lib:/usr/local/cuda-11.0/lib64:/usr/local/cuda-11.0:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
export NCCL_PROTO=simple
export PATH=/opt/amazon/efa/bin:$PATH
export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
export FI_EFA_FORK_SAFE=1
export FI_LOG_LEVEL=1
export FI_EFA_USE_DEVICE_RDMA=1 # GPUDirect RDMA over EFA; needed on p4d instances
#export NCCL_ALGO=ring
export NCCL_DEBUG=info
#export NCCL_DEBUG_SUBSYS=INIT,ENV,GRAPH,COLL
export PYTHONFAULTHANDLER=1
export CUDA_LAUNCH_BLOCKING=0
export OMPI_MCA_mtl_base_verbose=1
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64
export NCCL_TREE_THRESHOLD=0
#export NCCL_P2P_DISABLE=1
#export NCCL_IBEXT_DISABLE=1
#export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
# exported for start_in_container.sh (the per-node script above)
export HOSTNAMES=`scontrol show hostnames "$SLURM_JOB_NODELIST"`
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=`scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l`
echo go $COUNT_NODE
echo $HOSTNAMES
mpirun -n $COUNT_NODE -perhost 1 /fsx/dalle2/start_in_container.sh
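Assuming the sbatch script above is saved as train_decoder.sbatch (the filename is not given in the gist), submitting and following the run looks roughly like this; the job name and output pattern come from the #SBATCH directives above:

# sketch only: submit the job and follow its output
sbatch train_decoder.sbatch        # hypothetical filename for the sbatch script above
squeue -u $USER                    # check that the 4-node job is queued or running
tail -f intelmpi_test_<jobid>.out  # --output=%x_%j.out expands to <job-name>_<job-id>.out

The JSON below is the decoder training configuration passed to train_decoder.py as /fsx/dalle2/testing_config.json.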
{
  "seed": 1,
  "decoder": {
    "unets": [
      {
        "dim": 256,
        "cond_dim": 512,
        "image_embed_dim": 768,
        "text_embed_dim": 768,
        "cond_on_text_encodings": true,
        "channels": 3,
        "dim_mults": [1, 2, 3, 4],
        "num_resnet_blocks": 4,
        "attn_heads": 8,
        "attn_dim_head": 64,
        "sparse_attn": true,
        "memory_efficient": true,
        "self_attn": [false, true, true, true]
      }
    ],
    "clip": {
      "make": "openai",
      "model": "ViT-L/14"
    },
    "image_sizes": [64],
    "channels": 3,
    "timesteps": 1000,
    "loss_type": "l2",
    "beta_schedule": ["cosine"],
    "learned_variance": true
  },
  "data": {
    "webdataset_base_url": "pipe:aws s3 cp --quiet s3://s-datasets/laion-aesthetic/data/laion2B-en-aesthetic/{}.tar -",
    "embeddings_url": "s3://s-datasets/laion-aesthetic/ordered_embeddings/",
    "num_workers": 12,
    "batch_size": 60,
    "start_shard": 0,
    "end_shard": 5247,
    "shard_width": 5,
    "index_width": 4,
    "splits": {
      "train": 0.75,
      "val": 0.15,
      "test": 0.1
    },
    "shuffle_train": false,
    "resample_train": true,
    "preprocessing": {
      "RandomResizedCrop": {
        "size": [64, 64],
        "scale": [0.75, 1.0],
        "ratio": [1.0, 1.0]
      },
      "ToTensor": true
    }
  },
  "train": {
    "epochs": 1000,
    "lr": 1e-4,
    "wd": 0.01,
    "max_grad_norm": 0.5,
    "save_every_n_samples": 2000000,
    "n_sample_images": 10,
    "device": "cuda:0",
    "epoch_samples": 8000000,
    "validation_samples": 100000,
    "use_ema": true,
    "ema_beta": 0.99,
    "save_all": false,
    "save_latest": true,
    "save_best": true,
    "unet_training_mask": [true]
  },
  "evaluate": {
    "n_evaluation_samples": 30,
    "FID": {
      "feature": 64
    },
    "LPIPS": {
      "net_type": "vgg",
      "reduction": "mean"
    }
  },
  "tracker": {
    "data_path": ".tracker-testing",
    "overwrite_data_path": true,
    "log": {
      "log_type": "wandb",
      "wandb_entity": "rom1504",
      "wandb_project": "dalle2_train_decoder",
      "wandb_resume": false,
      "verbose": true
    },
    "load": {
      "resume": false,
      "load_from": null
    },
    "save": [
      {
        "save_to": "wandb"
      }
    ]
  }
}
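A malformed config only fails after the multi-node allocation has already started, so it can be worth checking that the file parses on the login node before calling sbatch; a minimal sketch, assuming the config sits at the path used in the accelerate command above:

# sketch only: fail fast on a config that is not valid JSON
python3 -m json.tool /fsx/dalle2/testing_config.json > /dev/null \
  && echo "config parses" \
  || echo "config is not valid JSON"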