Skip to content

Instantly share code, notes, and snippets.

@mehdidc
Created February 29, 2024 16:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mehdidc/d82da9f097f5718df381cc9f9f298eb2 to your computer and use it in GitHub Desktop.
Save mehdidc/d82da9f097f5718df381cc9f9f298eb2 to your computer and use it in GitHub Desktop.
#!/bin/bash -x
#SBATCH --account={account}
#SBATCH --nodes={nodes}
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=24
#SBATCH --time=06:00:00
#SBATCH --partition={partition}
#SBATCH --output={output_file}
# SLURM launcher template for distributed OpenCLIP training
# (src/training/main.py). {placeholder} tokens are substituted by a
# job-generation script before submission with sbatch.
# One task per GPU: 4 tasks/node x 4 GPUs/node.
echo "Job Id:$SLURM_JOB_ID"
# Drop any modules inherited from the submitting shell.
ml purge
# Use a job-local HF cache and stay offline — compute nodes presumably have
# no internet access (common on this cluster); verify if that changes.
export TRANSFORMERS_CACHE=cache
export TRANSFORMERS_OFFLINE=1
# Activate the conda/mamba environment shipped with the project.
source /p/project/ccstdl/laion/mamba/bin/activate experimental-torch-nightly
export CUDA_VISIBLE_DEVICES=0,1,2,3
# Rendezvous endpoint for torch.distributed: first node in the allocation.
export MASTER_PORT=12802
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# The trailing "i" selects the InfiniBand interface hostname — NOTE(review):
# this is a site-specific JSC naming convention; confirm on other clusters.
export MASTER_ADDR="${master_addr}i"
echo "MASTER_ADDR=$MASTER_ADDR"
export PYTHONPATH="$PYTHONPATH:$PWD/src"
# --cpu_bind=none,v / --accel-bind=gn: let the framework handle pinning,
# bind each task to its nearest GPU and NIC.
# NB: every continuation line needs a space before the backslash —
# "word\<newline>next" would fuse the two arguments into one token.
srun --cpu_bind=none,v --accel-bind=gn python -u src/training/main.py \
--save-frequency 1 \
--zeroshot-frequency 1 \
--train-data="{train_data}" --dataset-type webdataset --dataset-resampled \
--train-num-samples={train_num_samples} \
--batch-size {batch_size} \
--report-to=tensorboard \
--epochs {epochs} \
--workers=8 \
--model {model} \
--name {name} \
--logs {logs} \
--seed 0 \
--local-loss \
--gather-with-grad \
--lr {lr} \
--beta1 {beta1} \
--beta2 {beta2} \
--wd {wd} \
--warmup {warmup} \
--grad-clip-norm {grad_clip_norm} \
--save-most-recent \
--ddp-static-graph \
--precision amp_bfloat16 \
--grad-checkpoint \
--resume latest
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment