@mehdidc
Created September 27, 2022 10:17
Content of the files: the SLURM batch script, followed by the training log (logs/example/ViT-B-32-20M/out.log).
#!/bin/bash -x
#SBATCH --account=cstdl
#SBATCH --nodes=8
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --wait-all-nodes=1
#SBATCH --time=00:30:00
#SBATCH --partition=batch
#SBATCH --job-name=open_clip
# load low-level libraries
ml purge
ml use $OTHERSTAGES
ml Stages/2022
ml GCC/11.2.0
ml OpenMPI/4.1.2
ml CUDA/11.5
ml cuDNN/8.3.1.22-CUDA-11.5
ml NCCL/2.12.7-1-CUDA-11.5
ml PyTorch/1.11-CUDA-11.5
ml torchvision/0.12.0
source envs/hdfml/bin/activate
#export NCCL_DEBUG=INFO
#export NCCL_DEBUG_SUBSYS=ALL
# Expose all four GPUs per node and pick a free port for the distributed rendezvous
export CUDA_VISIBLE_DEVICES=0,1,2,3
export MASTER_PORT=12802
### Use the first node in the allocation as the master address
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# Append "i" so the address resolves to the node's high-speed interconnect
# interface (a site-specific hostname convention on this cluster)
export MASTER_ADDR="${master_addr}i"
echo "MASTER_ADDR=$MASTER_ADDR"
export PYTHONPATH="$PYTHONPATH:$PWD/src"
LOGS=logs/example
NAME=ViT-B-32-20M
CKPT="$LOGS/$NAME/checkpoints/epoch_latest.pt"
# Resume from the latest checkpoint if a previous run left one behind
if test -f "$CKPT"; then
    RESUME="--resume $CKPT"
else
    RESUME=""
fi
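
# Note: with the 30-minute wall time above, a full 32-epoch run depends on
# resubmitting this job and resuming from epoch_latest.pt via the block above
# (the restarts are visible in the log below). A minimal, hypothetical way to
# chain runs (not part of the original script) would be:
#   sbatch --dependency=afterany:$SLURM_JOB_ID this_script.sbatch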
srun --cpu_bind=v --accel-bind=gn python -u src/training/main.py \
--save-frequency 1 \
--zeroshot-frequency 1 \
--train-data="/p/scratch/ccstdl/katta1/LAION-400M/laion400m-dat-release/{00000..02033}.tar" \
--imagenet-val="/p/scratch/ccstdl/gordon2/imagenet_val" \
--train-num-samples=20003822 \
--warmup 2000 \
--batch-size=256 \
--epochs=32 \
--workers=8 \
--report-to=tensorboard \
--model ViT-B-32 \
--name $NAME \
--logs $LOGS \
--seed 0 \
--lr 5.0e-4 \
--ddp-static-graph \
--local-loss \
--gather-with-grad \
--save-most-recent \
$RESUME
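
With 8 nodes × 4 GPUs and a per-GPU batch size of 256, each optimizer step
consumes 8192 samples, which matches the sample counts in the log below.

logs/example/ViT-B-32-20M/out.log: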
2022-09-27,10:52:02 | INFO | Added key: store_based_barrier_key:1 to store for rank: 0
2022-09-27,10:52:02 | INFO | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 32 nodes.
2022-09-27,10:52:02 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 32.
2022-09-27,10:52:02 | INFO | Loading ViT-B-32 model config.
2022-09-27,10:52:05 | INFO | Model:
2022-09-27,10:52:05 | INFO | CLIP(
  (visual): VisualTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0): ResidualAttentionBlock(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_attn): Identity()
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (ln): Identity()
            (gelu): GELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
        )
        (1-11): 11 x ResidualAttentionBlock(...), identical to (0)
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (transformer): Transformer(
    (resblocks): ModuleList(
      (0): ResidualAttentionBlock(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_attn): Identity()
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (ln): Identity()
          (gelu): GELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
      )
      (1-11): 11 x ResidualAttentionBlock(...), identical to (0)
    )
  )
  (token_embedding): Embedding(49408, 512)
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)
2022-09-27,10:52:05 | INFO | Params:
2022-09-27,10:52:05 | INFO | batch_size: 256
2022-09-27,10:52:05 | INFO | beta1: 0.9
2022-09-27,10:52:05 | INFO | beta2: 0.98
2022-09-27,10:52:05 | INFO | checkpoint_path: logs/example/ViT-B-32-20M/checkpoints
2022-09-27,10:52:05 | INFO | copy_codebase: False
2022-09-27,10:52:05 | INFO | csv_caption_key: title
2022-09-27,10:52:05 | INFO | csv_img_key: filepath
2022-09-27,10:52:05 | INFO | csv_separator:
2022-09-27,10:52:05 | INFO | dataset_resampled: False
2022-09-27,10:52:05 | INFO | dataset_type: auto
2022-09-27,10:52:05 | INFO | ddp_static_graph: True
2022-09-27,10:52:05 | INFO | debug: False
2022-09-27,10:52:05 | INFO | device: cuda:0
2022-09-27,10:52:05 | INFO | dist_backend: nccl
2022-09-27,10:52:05 | INFO | dist_url: env://
2022-09-27,10:52:05 | INFO | distributed: True
2022-09-27,10:52:05 | INFO | epochs: 32
2022-09-27,10:52:05 | INFO | eps: 1e-06
2022-09-27,10:52:05 | INFO | force_quick_gelu: False
2022-09-27,10:52:05 | INFO | gather_with_grad: True
2022-09-27,10:52:05 | INFO | grad_checkpointing: False
2022-09-27,10:52:05 | INFO | horovod: False
2022-09-27,10:52:05 | INFO | image_mean: None
2022-09-27,10:52:05 | INFO | image_std: None
2022-09-27,10:52:05 | INFO | imagenet_v2: None
2022-09-27,10:52:05 | INFO | imagenet_val: /p/scratch/ccstdl/gordon2/imagenet_val
2022-09-27,10:52:05 | INFO | local_loss: True
2022-09-27,10:52:05 | INFO | local_rank: 0
2022-09-27,10:52:05 | INFO | lock_image: False
2022-09-27,10:52:05 | INFO | lock_image_freeze_bn_stats: False
2022-09-27,10:52:05 | INFO | lock_image_unlocked_groups: 0
2022-09-27,10:52:05 | INFO | log_level: 20
2022-09-27,10:52:05 | INFO | log_local: False
2022-09-27,10:52:05 | INFO | log_path: logs/example/ViT-B-32-20M/out.log
2022-09-27,10:52:05 | INFO | logs: logs/example
2022-09-27,10:52:05 | INFO | lr: 0.0005
2022-09-27,10:52:05 | INFO | model: ViT-B-32
2022-09-27,10:52:05 | INFO | name: ViT-B-32-20M
2022-09-27,10:52:05 | INFO | no_set_device_rank: False
2022-09-27,10:52:05 | INFO | norm_gradient_clip: None
2022-09-27,10:52:05 | INFO | precision: amp
2022-09-27,10:52:05 | INFO | pretrained:
2022-09-27,10:52:05 | INFO | pretrained_image: False
2022-09-27,10:52:05 | INFO | rank: 0
2022-09-27,10:52:05 | INFO | report_to: tensorboard
2022-09-27,10:52:05 | INFO | resume: None
2022-09-27,10:52:05 | INFO | save_frequency: 1
2022-09-27,10:52:05 | INFO | save_most_recent: True
2022-09-27,10:52:05 | INFO | seed: 0
2022-09-27,10:52:05 | INFO | skip_scheduler: False
2022-09-27,10:52:05 | INFO | tensorboard: True
2022-09-27,10:52:05 | INFO | tensorboard_path: logs/example/ViT-B-32-20M/tensorboard
2022-09-27,10:52:05 | INFO | torchscript: False
2022-09-27,10:52:05 | INFO | trace: False
2022-09-27,10:52:05 | INFO | train_data: /p/scratch/ccstdl/katta1/LAION-400M/laion400m-dat-release/{00000..02033}.tar
2022-09-27,10:52:05 | INFO | train_num_samples: 20003822
2022-09-27,10:52:05 | INFO | use_bn_sync: False
2022-09-27,10:52:05 | INFO | val_data: None
2022-09-27,10:52:05 | INFO | val_frequency: 1
2022-09-27,10:52:05 | INFO | val_num_samples: None
2022-09-27,10:52:05 | INFO | wandb: False
2022-09-27,10:52:05 | INFO | wandb_notes:
2022-09-27,10:52:05 | INFO | warmup: 2000
2022-09-27,10:52:05 | INFO | wd: 0.2
2022-09-27,10:52:05 | INFO | workers: 8
2022-09-27,10:52:05 | INFO | world_size: 32
2022-09-27,10:52:05 | INFO | zeroshot_frequency: 1
2022-09-27,10:52:06 | INFO | Start epoch 0
2022-09-27,10:52:18 | INFO | Train Epoch: 0 [ 8192/20054016 (0%)] Loss: 9.0813 (9.081) Data (t): 3.241 Batch (t): 12.323, 664.795/s LR: 0.000000 Logit Scale: 14.286
2022-09-27,10:52:19 | INFO | Reducer buckets have been rebuilt in this iteration.
2022-09-27,10:53:04 | INFO | Train Epoch: 0 [ 827392/20054016 (4%)] Loss: 8.8624 (8.972) Data (t): 0.073 Batch (t): 0.458, 18055.6/s LR: 0.000025 Logit Scale: 14.283
2022-09-27,10:53:50 | INFO | Train Epoch: 0 [ 1646592/20054016 (8%)] Loss: 8.3184 (8.754) Data (t): 0.074 Batch (t): 0.459, 17636.6/s LR: 0.000050 Logit Scale: 14.282
2022-09-27,10:54:36 | INFO | Train Epoch: 0 [ 2465792/20054016 (12%)] Loss: 8.0655 (8.582) Data (t): 0.074 Batch (t): 0.460, 17725.1/s LR: 0.000075 Logit Scale: 14.297
2022-09-27,10:55:22 | INFO | Train Epoch: 0 [ 3284992/20054016 (16%)] Loss: 7.6955 (8.405) Data (t): 0.073 Batch (t): 0.459, 17984.9/s LR: 0.000100 Logit Scale: 14.367
2022-09-27,10:56:08 | INFO | Train Epoch: 0 [ 4104192/20054016 (20%)] Loss: 7.6081 (8.272) Data (t): 0.074 Batch (t): 0.459, 17907.1/s LR: 0.000125 Logit Scale: 14.476
2022-09-27,10:56:54 | INFO | Train Epoch: 0 [ 4923392/20054016 (25%)] Loss: 7.2475 (8.126) Data (t): 0.074 Batch (t): 0.460, 17617.1/s LR: 0.000150 Logit Scale: 14.633
2022-09-27,10:57:40 | INFO | Train Epoch: 0 [ 5742592/20054016 (29%)] Loss: 6.9686 (7.981) Data (t): 0.073 Batch (t): 0.460, 18007.0/s LR: 0.000175 Logit Scale: 14.818
2022-09-27,10:58:26 | INFO | Train Epoch: 0 [ 6561792/20054016 (33%)] Loss: 6.7461 (7.844) Data (t): 0.074 Batch (t): 0.460, 17910.2/s LR: 0.000200 Logit Scale: 15.067
2022-09-27,10:59:11 | INFO | Train Epoch: 0 [ 7380992/20054016 (37%)] Loss: 6.8749 (7.747) Data (t): 0.073 Batch (t): 0.459, 17859.4/s LR: 0.000225 Logit Scale: 15.353
2022-09-27,10:59:58 | INFO | Train Epoch: 0 [ 8200192/20054016 (41%)] Loss: 6.6846 (7.650) Data (t): 0.074 Batch (t): 0.461, 17568.7/s LR: 0.000250 Logit Scale: 15.703
2022-09-27,11:00:44 | INFO | Train Epoch: 0 [ 9019392/20054016 (45%)] Loss: 6.3567 (7.542) Data (t): 0.074 Batch (t): 0.460, 18002.7/s LR: 0.000275 Logit Scale: 16.078
2022-09-27,11:01:30 | INFO | Train Epoch: 0 [ 9838592/20054016 (49%)] Loss: 6.2377 (7.442) Data (t): 0.074 Batch (t): 0.459, 17845.4/s LR: 0.000300 Logit Scale: 16.525
2022-09-27,11:02:15 | INFO | Train Epoch: 0 [10657792/20054016 (53%)] Loss: 6.1121 (7.347) Data (t): 0.073 Batch (t): 0.460, 18188.0/s LR: 0.000325 Logit Scale: 17.043
2022-09-27,11:04:04 | INFO | Train Epoch: 0 [11476992/20054016 (57%)] Loss: 5.9579 (7.254) Data (t): 0.074 Batch (t): 1.088, 17697.1/s LR: 0.000350 Logit Scale: 17.598
2022-09-27,11:04:50 | INFO | Train Epoch: 0 [12296192/20054016 (61%)] Loss: 5.4793 (7.144) Data (t): 0.074 Batch (t): 0.459, 17887.8/s LR: 0.000375 Logit Scale: 18.224
2022-09-27,11:05:36 | INFO | Train Epoch: 0 [13115392/20054016 (65%)] Loss: 5.6995 (7.059) Data (t): 0.074 Batch (t): 0.460, 17601.5/s LR: 0.000400 Logit Scale: 18.905
2022-09-27,11:06:22 | INFO | Train Epoch: 0 [13934592/20054016 (69%)] Loss: 5.6713 (6.982) Data (t): 0.073 Batch (t): 0.459, 18069.0/s LR: 0.000425 Logit Scale: 19.648
2022-09-27,11:07:08 | INFO | Train Epoch: 0 [14753792/20054016 (74%)] Loss: 5.4377 (6.900) Data (t): 0.074 Batch (t): 0.461, 17819.1/s LR: 0.000450 Logit Scale: 20.300
2022-09-27,11:07:54 | INFO | Train Epoch: 0 [15572992/20054016 (78%)] Loss: 5.2516 (6.818) Data (t): 0.074 Batch (t): 0.460, 18008.3/s LR: 0.000475 Logit Scale: 21.194
2022-09-27,11:08:40 | INFO | Train Epoch: 0 [16392192/20054016 (82%)] Loss: 5.0659 (6.734) Data (t): 0.074 Batch (t): 0.461, 17626.9/s LR: 0.000500 Logit Scale: 22.089
2022-09-27,11:09:27 | INFO | Train Epoch: 0 [17211392/20054016 (86%)] Loss: 5.2005 (6.665) Data (t): 0.073 Batch (t): 0.464, 17665.2/s LR: 0.000500 Logit Scale: 23.083
2022-09-27,11:10:13 | INFO | Train Epoch: 0 [18030592/20054016 (90%)] Loss: 5.1313 (6.598) Data (t): 0.073 Batch (t): 0.461, 17438.2/s LR: 0.000500 Logit Scale: 24.062
2022-09-27,11:10:59 | INFO | Train Epoch: 0 [18849792/20054016 (94%)] Loss: 4.8736 (6.526) Data (t): 0.074 Batch (t): 0.462, 17608.1/s LR: 0.000500 Logit Scale: 25.103
2022-09-27,11:11:46 | INFO | Train Epoch: 0 [19668992/20054016 (98%)] Loss: 4.9132 (6.462) Data (t): 0.075 Batch (t): 0.467, 17932.5/s LR: 0.000500 Logit Scale: 26.143
2022-09-27,11:12:07 | INFO | Train Epoch: 0 [20054016/20054016 (100%)] Loss: 4.7750 (6.397) Data (t): 0.074 Batch (t): 0.462, 18358.2/s LR: 0.000500 Logit Scale: 26.637
2022-09-27,11:12:07 | INFO | Starting zero-shot imagenet.
2022-09-27,11:12:07 | INFO | Building zero-shot classifier
2022-09-27,11:13:02 | INFO | Using classifier
2022-09-27,11:15:48 | INFO | Finished zero-shot imagenet.
2022-09-27,11:15:48 | INFO | Eval Epoch: 1 imagenet-zeroshot-val-top1: 0.0751 imagenet-zeroshot-val-top5: 0.1981
2022-09-27,11:15:52 | INFO | Start epoch 1
2022-09-27,11:15:55 | INFO | Train Epoch: 1 [ 8192/20054016 (0%)] Loss: 4.4730 (4.473) Data (t): 2.671 Batch (t): 3.057, 2679.81/s LR: 0.000500 Logit Scale: 26.647
2022-09-27,11:20:41 | INFO | Added key: store_based_barrier_key:1 to store for rank: 0
2022-09-27,11:20:41 | INFO | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 32 nodes.
2022-09-27,11:20:41 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 32.
2022-09-27,11:20:41 | INFO | Loading ViT-B-32 model config.
2022-09-27,11:20:43 | INFO | Model:
2022-09-27,11:20:43 | INFO | CLIP(
  [... architecture identical to the first model dump above ...]
)
2022-09-27,11:20:43 | INFO | Params:
[... parameters identical to the first dump above, except resume is now set:]
2022-09-27,11:20:43 | INFO | resume: logs/example/ViT-B-32-20M/checkpoints/epoch_latest.pt
2022-09-27,11:20:48 | INFO | => resuming checkpoint 'logs/example/ViT-B-32-20M/checkpoints/epoch_latest.pt' (epoch 1)
2022-09-27,11:20:48 | INFO | Start epoch 1
2022-09-27,11:21:00 | INFO | Train Epoch: 1 [ 8192/20054016 (0%)] Loss: 4.7262 (4.726) Data (t): 3.628 Batch (t): 12.326, 664.617/s LR: 0.000500 Logit Scale: 26.647
2022-09-27,11:21:01 | INFO | Reducer buckets have been rebuilt in this iteration.
2022-09-27,11:21:46 | INFO | Train Epoch: 1 [ 827392/20054016 (4%)] Loss: 4.4406 (4.583) Data (t): 0.072 Batch (t): 0.460, 18013.3/s LR: 0.000500 Logit Scale: 27.731
2022-09-27,11:22:32 | INFO | Train Epoch: 1 [ 1646592/20054016 (8%)] Loss: 4.3635 (4.510) Data (t): 0.073 Batch (t): 0.460, 17848.4/s LR: 0.000500 Logit Scale: 28.743
2022-09-27,11:23:19 | INFO | Train Epoch: 1 [ 2465792/20054016 (12%)] Loss: 4.6087 (4.535) Data (t): 0.072 Batch (t): 0.461, 17616.1/s LR: 0.000500 Logit Scale: 29.586
2022-09-27,11:24:05 | INFO | Train Epoch: 1 [ 3284992/20054016 (16%)] Loss: 4.5415 (4.536) Data (t): 0.073 Batch (t): 0.461, 17434.1/s LR: 0.000500 Logit Scale: 30.362
2022-09-27,11:24:51 | INFO | Train Epoch: 1 [ 4104192/20054016 (20%)] Loss: 4.3388 (4.503) Data (t): 0.073 Batch (t): 0.461, 17804.6/s LR: 0.000500 Logit Scale: 31.230
2022-09-27,11:25:37 | INFO | Train Epoch: 1 [ 4923392/20054016 (25%)] Loss: 4.2594 (4.468) Data (t): 0.073 Batch (t): 0.460, 17900.5/s LR: 0.000500 Logit Scale: 31.951
2022-09-27,11:26:23 | INFO | Train Epoch: 1 [ 5742592/20054016 (29%)] Loss: 4.1226 (4.425) Data (t): 0.073 Batch (t): 0.462, 17497.4/s LR: 0.000500 Logit Scale: 32.769
2022-09-27,11:27:09 | INFO | Train Epoch: 1 [ 6561792/20054016 (33%)] Loss: 3.9446 (4.372) Data (t): 0.073 Batch (t): 0.461, 17653.3/s LR: 0.000500 Logit Scale: 33.478
2022-09-27,11:27:55 | INFO | Train Epoch: 1 [ 7380992/20054016 (37%)] Loss: 3.9022 (4.325) Data (t): 0.073 Batch (t): 0.461, 17965.8/s LR: 0.000500 Logit Scale: 34.093
2022-09-27,11:28:41 | INFO | Train Epoch: 1 [ 8200192/20054016 (41%)] Loss: 3.9148 (4.288) Data (t): 0.073 Batch (t): 0.462, 18020.7/s LR: 0.000500 Logit Scale: 34.800
2022-09-27,11:29:28 | INFO | Train Epoch: 1 [ 9019392/20054016 (45%)] Loss: 3.8163 (4.248) Data (t): 0.073 Batch (t): 0.461, 17457.7/s LR: 0.000499 Logit Scale: 35.438
2022-09-27,11:30:14 | INFO | Train Epoch: 1 [ 9838592/20054016 (49%)] Loss: 3.7082 (4.207) Data (t): 0.073 Batch (t): 0.462, 17738.3/s LR: 0.000499 Logit Scale: 36.000
2022-09-27,11:31:00 | INFO | Train Epoch: 1 [10657792/20054016 (53%)] Loss: 3.4084 (4.150) Data (t): 0.074 Batch (t): 0.462, 17661.3/s LR: 0.000499 Logit Scale: 36.571
2022-09-27,11:31:46 | INFO | Train Epoch: 1 [11476992/20054016 (57%)] Loss: 3.5990 (4.113) Data (t): 0.073 Batch (t): 0.461, 17817.6/s LR: 0.000499 Logit Scale: 37.060
2022-09-27,11:32:32 | INFO | Train Epoch: 1 [12296192/20054016 (61%)] Loss: 3.8248 (4.095) Data (t): 0.073 Batch (t): 0.461, 17991.9/s LR: 0.000499 Logit Scale: 37.553
2022-09-27,11:33:18 | INFO | Train Epoch: 1 [13115392/20054016 (65%)] Loss: 3.3610 (4.052) Data (t): 0.073 Batch (t): 0.462, 17919.3/s LR: 0.000499 Logit Scale: 38.000
2022-09-27,11:34:05 | INFO | Train Epoch: 1 [13934592/20054016 (69%)] Loss: 3.5046 (4.021) Data (t): 0.073 Batch (t): 0.463, 17606.2/s LR: 0.000499 Logit Scale: 38.528
2022-09-27,11:34:51 | INFO | Train Epoch: 1 [14753792/20054016 (74%)] Loss: 3.5596 (3.997) Data (t): 0.073 Batch (t): 0.461, 17863.1/s LR: 0.000499 Logit Scale: 39.018
2022-09-27,11:35:37 | INFO | Train Epoch: 1 [15572992/20054016 (78%)] Loss: 3.5188 (3.973) Data (t): 0.073 Batch (t): 0.462, 17523.2/s LR: 0.000499 Logit Scale: 39.465
2022-09-27,11:36:23 | INFO | Train Epoch: 1 [16392192/20054016 (82%)] Loss: 3.1615 (3.935) Data (t): 0.073 Batch (t): 0.461, 17629.2/s LR: 0.000499 Logit Scale: 39.938
2022-09-27,11:37:09 | INFO | Train Epoch: 1 [17211392/20054016 (86%)] Loss: 3.2076 (3.901) Data (t): 0.074 Batch (t): 0.463, 17572.2/s LR: 0.000499 Logit Scale: 40.329
2022-09-27,11:37:56 | INFO | Train Epoch: 1 [18030592/20054016 (90%)] Loss: 3.1250 (3.868) Data (t): 0.073 Batch (t): 0.462, 17993.2/s LR: 0.000499 Logit Scale: 40.778
2022-09-27,11:38:42 | INFO | Train Epoch: 1 [18849792/20054016 (94%)] Loss: 3.4095 (3.849) Data (t): 0.074 Batch (t): 0.463, 17656.7/s LR: 0.000498 Logit Scale: 41.177
2022-09-27,11:39:29 | INFO | Train Epoch: 1 [19668992/20054016 (98%)] Loss: 3.0969 (3.819) Data (t): 0.076 Batch (t): 0.470, 17204.0/s LR: 0.000498 Logit Scale: 41.611
2022-09-27,11:39:51 | INFO | Train Epoch: 1 [20054016/20054016 (100%)] Loss: 3.1594 (3.793) Data (t): 0.075 Batch (t): 0.464, 18190.3/s LR: 0.000498 Logit Scale: 41.765
2022-09-27,11:39:51 | INFO | Starting zero-shot imagenet.
2022-09-27,11:39:51 | INFO | Building zero-shot classifier
2022-09-27,11:40:45 | INFO | Using classifier
2022-09-27,11:43:39 | INFO | Finished zero-shot imagenet.
2022-09-27,11:43:39 | INFO | Eval Epoch: 2 imagenet-zeroshot-val-top1: 0.1621 imagenet-zeroshot-val-top5: 0.3525
2022-09-27,11:43:43 | INFO | Start epoch 2
2022-09-27,11:43:45 | INFO | Train Epoch: 2 [ 8192/20054016 (0%)] Loss: 2.3822 (2.382) Data (t): 1.644 Batch (t): 2.037, 4020.91/s LR: 0.000498 Logit Scale: 41.771
2022-09-27,11:50:43 | INFO | Added key: store_based_barrier_key:1 to store for rank: 0
2022-09-27,11:50:43 | INFO | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 32 nodes.
2022-09-27,11:50:43 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 32.
2022-09-27,11:50:43 | INFO | Loading ViT-B-32 model config.
2022-09-27,11:50:45 | INFO | Model:
2022-09-27,11:50:45 | INFO | CLIP(
  [... architecture identical to the first model dump above ...]
)
2022-09-27,11:50:45 | INFO | Params:
2022-09-27,11:50:45 | INFO | batch_size: 256
2022-09-27,11:50:45 | INFO | beta1: 0.9
2022-09-27,11:50:45 | INFO | beta2: 0.98
2022-09-27,11:50:45 | INFO | checkpoint_path: logs/example/ViT-B-32-20M/checkpoints
2022-09-27,11:50:45 | INFO | copy_codebase: False
2022-09-27,11:50:45 | INFO | csv_caption_key: title
2022-09-27,11:50:45 | INFO | csv_img_key: filepath
2022-09-27,11:50:45 | INFO | csv_separator:
2022-09-27,11:50:45 | INFO | dataset_resampled: False
2022-09-27,11:50:45 | INFO | dataset_type: auto
2022-09-27,11:50:45 | INFO | ddp_static_graph: True
2022-09-27,11:50:45 | INFO | debug: False
2022-09-27,11:50:45 | INFO | device: cuda:0
2022-09-27,11:50:45 | INFO | dist_backend: nccl
2022-09-27,11:50:45 | INFO | dist_url: env://
2022-09-27,11:50:45 | INFO | distributed: True
2022-09-27,11:50:45 | INFO | epochs: 32
2022-09-27,11:50:45 | INFO | eps: 1e-06
2022-09-27,11:50:45 | INFO | force_quick_gelu: False
2022-09-27,11:50:45 | INFO | gather_with_grad: True
2022-09-27,11:50:45 | INFO | grad_checkpointing: False
2022-09-27,11:50:45 | INFO | horovod: False
2022-09-27,11:50:45 | INFO | image_mean: None
2022-09-27,11:50:45 | INFO | image_std: None
2022-09-27,11:50:45 | INFO | imagenet_v2: None
2022-09-27,11:50:45 | INFO | imagenet_val: /p/scratch/ccstdl/gordon2/imagenet_val
2022-09-27,11:50:45 | INFO | local_loss: True
2022-09-27,11:50:45 | INFO | local_rank: 0
2022-09-27,11:50:45 | INFO | lock_image: False
2022-09-27,11:50:45 | INFO | lock_image_freeze_bn_stats: False
2022-09-27,11:50:45 | INFO | lock_image_unlocked_groups: 0
2022-09-27,11:50:45 | INFO | log_level: 20
2022-09-27,11:50:45 | INFO | log_local: False
2022-09-27,11:50:45 | INFO | log_path: logs/example/ViT-B-32-20M/out.log
2022-09-27,11:50:45 | INFO | logs: logs/example
2022-09-27,11:50:45 | INFO | lr: 0.0005
2022-09-27,11:50:45 | INFO | model: ViT-B-32
2022-09-27,11:50:45 | INFO | name: ViT-B-32-20M
2022-09-27,11:50:45 | INFO | no_set_device_rank: False
2022-09-27,11:50:45 | INFO | norm_gradient_clip: None
2022-09-27,11:50:45 | INFO | precision: amp
2022-09-27,11:50:45 | INFO | pretrained:
2022-09-27,11:50:45 | INFO | pretrained_image: False
2022-09-27,11:50:45 | INFO | rank: 0
2022-09-27,11:50:45 | INFO | report_to: tensorboard
2022-09-27,11:50:45 | INFO | resume: logs/example/ViT-B-32-20M/checkpoints/epoch_latest.pt
2022-09-27,11:50:45 | INFO | save_frequency: 1
2022-09-27,11:50:45 | INFO | save_most_recent: True
2022-09-27,11:50:45 | INFO | seed: 0
2022-09-27,11:50:45 | INFO | skip_scheduler: False
2022-09-27,11:50:45 | INFO | tensorboard: True
2022-09-27,11:50:45 | INFO | tensorboard_path: logs/example/ViT-B-32-20M/tensorboard
2022-09-27,11:50:45 | INFO | torchscript: False
2022-09-27,11:50:45 | INFO | trace: False
2022-09-27,11:50:45 | INFO | train_data: /p/scratch/ccstdl/katta1/LAION-400M/laion400m-dat-release/{00000..02033}.tar
2022-09-27,11:50:45 | INFO | train_num_samples: 20003822
2022-09-27,11:50:45 | INFO | use_bn_sync: False
2022-09-27,11:50:45 | INFO | val_data: None
2022-09-27,11:50:45 | INFO | val_frequency: 1
2022-09-27,11:50:45 | INFO | val_num_samples: None
2022-09-27,11:50:45 | INFO | wandb: False
2022-09-27,11:50:45 | INFO | wandb_notes:
2022-09-27,11:50:45 | INFO | warmup: 2000
2022-09-27,11:50:45 | INFO | wd: 0.2
2022-09-27,11:50:45 | INFO | workers: 8
2022-09-27,11:50:45 | INFO | world_size: 32
2022-09-27,11:50:45 | INFO | zeroshot_frequency: 1
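
The LR values in the training lines below are consistent with open_clip's warmup-plus-cosine schedule: linear warmup for 2000 steps, then cosine decay to zero over the remaining steps. A minimal sketch of that schedule, using the logged per-epoch sample count (20054016, i.e. 2448 steps of 8192 samples) rather than train_num_samples, reproduces the logged value at the start of epoch 2:

import math

# Sketch of the warmup + cosine LR schedule implied by the parameters above;
# step counts are derived from the logged per-epoch sample count (approximate).
base_lr, warmup = 5e-4, 2000
steps_per_epoch = 20054016 // (32 * 256)   # samples per epoch / global batch = 2448
total_steps = 32 * steps_per_epoch         # 32 epochs -> 78336 steps

def lr_at(step):
    if step < warmup:
        return base_lr * (step + 1) / warmup
    progress = (step - warmup) / (total_steps - warmup)
    return 0.5 * base_lr * (1 + math.cos(math.pi * progress))

print(f"{lr_at(2 * steps_per_epoch):.6f}")  # ~0.000498, matching the first epoch-2 line
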
2022-09-27,11:50:49 | INFO | => resuming checkpoint 'logs/example/ViT-B-32-20M/checkpoints/epoch_latest.pt' (epoch 2)
2022-09-27,11:50:50 | INFO | Start epoch 2
2022-09-27,11:51:02 | INFO | Train Epoch: 2 [ 8192/20054016 (0%)] Loss: 2.7975 (2.798) Data (t): 2.658 Batch (t): 12.794, 640.301/s LR: 0.000498 Logit Scale: 41.771
2022-09-27,11:51:03 | INFO | Reducer buckets have been rebuilt in this iteration.
2022-09-27,11:51:49 | INFO | Train Epoch: 2 [ 827392/20054016 (4%)] Loss: 3.5042 (3.151) Data (t): 0.073 Batch (t): 0.461, 17648.2/s LR: 0.000498 Logit Scale: 42.328
2022-09-27,11:52:35 | INFO | Train Epoch: 2 [ 1646592/20054016 (8%)] Loss: 2.9606 (3.087) Data (t): 0.073 Batch (t): 0.461, 17914.9/s LR: 0.000498 Logit Scale: 42.645
2022-09-27,11:53:21 | INFO | Train Epoch: 2 [ 2465792/20054016 (12%)] Loss: 2.9361 (3.050) Data (t): 0.073 Batch (t): 0.460, 17689.3/s LR: 0.000498 Logit Scale: 43.034
2022-09-27,11:54:07 | INFO | Train Epoch: 2 [ 3284992/20054016 (16%)] Loss: 2.8919 (3.018) Data (t): 0.073 Batch (t): 0.462, 17698.4/s LR: 0.000498 Logit Scale: 43.372
2022-09-27,11:54:53 | INFO | Train Epoch: 2 [ 4104192/20054016 (20%)] Loss: 3.0383 (3.021) Data (t): 0.073 Batch (t): 0.460, 17500.3/s LR: 0.000498 Logit Scale: 43.761
2022-09-27,11:55:39 | INFO | Train Epoch: 2 [ 4923392/20054016 (25%)] Loss: 3.0318 (3.023) Data (t): 0.073 Batch (t): 0.461, 17537.4/s LR: 0.000497 Logit Scale: 44.074
2022-09-27,11:56:25 | INFO | Train Epoch: 2 [ 5742592/20054016 (29%)] Loss: 3.0539 (3.027) Data (t): 0.074 Batch (t): 0.461, 17437.9/s LR: 0.000497 Logit Scale: 44.365
2022-09-27,11:57:11 | INFO | Train Epoch: 2 [ 6561792/20054016 (33%)] Loss: 2.8991 (3.013) Data (t): 0.073 Batch (t): 0.460, 17586.9/s LR: 0.000497 Logit Scale: 44.653
2022-09-27,11:57:57 | INFO | Train Epoch: 2 [ 7380992/20054016 (37%)] Loss: 2.8849 (3.000) Data (t): 0.073 Batch (t): 0.461, 17781.6/s LR: 0.000497 Logit Scale: 44.939
2022-09-27,11:58:43 | INFO | Train Epoch: 2 [ 8200192/20054016 (41%)] Loss: 2.9602 (2.996) Data (t): 0.073 Batch (t): 0.462, 17410.0/s LR: 0.000497 Logit Scale: 45.274
2022-09-27,11:59:30 | INFO | Train Epoch: 2 [ 9019392/20054016 (45%)] Loss: 3.0347 (2.999) Data (t): 0.073 Batch (t): 0.461, 17987.2/s LR: 0.000497 Logit Scale: 45.578
2022-09-27,12:00:16 | INFO | Train Epoch: 2 [ 9838592/20054016 (49%)] Loss: 2.8910 (2.991) Data (t): 0.073 Batch (t): 0.462, 17896.4/s LR: 0.000496 Logit Scale: 45.806
2022-09-27,12:01:02 | INFO | Train Epoch: 2 [10657792/20054016 (53%)] Loss: 3.0885 (2.998) Data (t): 0.073 Batch (t): 0.460, 17844.2/s LR: 0.000496 Logit Scale: 46.044
2022-09-27,12:01:48 | INFO | Train Epoch: 2 [11476992/20054016 (57%)] Loss: 2.8017 (2.985) Data (t): 0.074 Batch (t): 0.462, 17944.8/s LR: 0.000496 Logit Scale: 46.305
2022-09-27,12:02:34 | INFO | Train Epoch: 2 [12296192/20054016 (61%)] Loss: 2.7566 (2.971) Data (t): 0.073 Batch (t): 0.460, 17502.2/s LR: 0.000496 Logit Scale: 46.519
2022-09-27,12:03:20 | INFO | Train Epoch: 2 [13115392/20054016 (65%)] Loss: 2.8444 (2.963) Data (t): 0.074 Batch (t): 0.462, 17874.0/s LR: 0.000496 Logit Scale: 46.706
2022-09-27,12:04:06 | INFO | Train Epoch: 2 [13934592/20054016 (69%)] Loss: 2.6265 (2.945) Data (t): 0.073 Batch (t): 0.462, 17448.2/s LR: 0.000496 Logit Scale: 46.888
2022-09-27,12:04:53 | INFO | Train Epoch: 2 [14753792/20054016 (74%)] Loss: 2.9124 (2.943) Data (t): 0.073 Batch (t): 0.462, 17540.4/s LR: 0.000495 Logit Scale: 47.111
2022-09-27,12:05:39 | INFO | Train Epoch: 2 [15572992/20054016 (78%)] Loss: 2.8231 (2.937) Data (t): 0.073 Batch (t): 0.461, 18001.5/s LR: 0.000495 Logit Scale: 47.402
2022-09-27,12:06:25 | INFO | Train Epoch: 2 [16392192/20054016 (82%)] Loss: 2.6433 (2.923) Data (t): 0.074 Batch (t): 0.462, 17880.8/s LR: 0.000495 Logit Scale: 47.560
2022-09-27,12:07:11 | INFO | Train Epoch: 2 [17211392/20054016 (86%)] Loss: 2.7611 (2.916) Data (t): 0.074 Batch (t): 0.464, 17470.8/s LR: 0.000495 Logit Scale: 47.772
2022-09-27,12:07:58 | INFO | Train Epoch: 2 [18030592/20054016 (90%)] Loss: 2.6497 (2.904) Data (t): 0.073 Batch (t): 0.461, 17497.8/s LR: 0.000495 Logit Scale: 47.934
2022-09-27,12:08:44 | INFO | Train Epoch: 2 [18849792/20054016 (94%)] Loss: 2.8205 (2.900) Data (t): 0.074 Batch (t): 0.462, 17784.4/s LR: 0.000494 Logit Scale: 48.150
2022-09-27,12:09:31 | INFO | Train Epoch: 2 [19668992/20054016 (98%)] Loss: 2.6630 (2.891) Data (t): 0.077 Batch (t): 0.468, 17238.0/s LR: 0.000494 Logit Scale: 48.344
2022-09-27,12:09:52 | INFO | Train Epoch: 2 [20054016/20054016 (100%)] Loss: 2.4420 (2.874) Data (t): 0.075 Batch (t): 0.462, 18285.9/s LR: 0.000494 Logit Scale: 48.412
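
A quick sanity check on the throughput figures above: each optimizer step consumes one global batch of world_size x batch_size samples, and the logged samples/s rate follows directly from the batch time:

world_size, per_gpu_batch = 32, 256           # 8 nodes x 4 GPUs, --batch-size=256
global_batch = world_size * per_gpu_batch     # 8192 samples per optimizer step
batch_time = 0.461                            # seconds, typical "Batch (t)" above
print(f"{global_batch / batch_time:,.0f}/s")  # ~17,770/s, in line with the logged rates
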
2022-09-27,12:09:52 | INFO | Starting zero-shot imagenet.
2022-09-27,12:09:52 | INFO | Building zero-shot classifier
2022-09-27,12:10:47 | INFO | Using classifier
2022-09-27,12:13:14 | INFO | Finished zero-shot imagenet.
2022-09-27,12:13:14 | INFO | Eval Epoch: 3 imagenet-zeroshot-val-top1: 0.2043 imagenet-zeroshot-val-top5: 0.4244
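
The zero-shot evaluation above builds a classifier from text alone: each ImageNet class name is embedded through a set of prompt templates (e.g. "a photo of a {}."), the text features are averaged and normalized per class, and images are classified by cosine similarity against the resulting matrix. A minimal sketch of the idea; model, tokenizer, classnames, and templates are placeholders here, not the exact open_clip implementation:

import torch

@torch.no_grad()
def build_zeroshot_classifier(model, tokenizer, classnames, templates, device):
    # One embedding per class: average text features over all prompt templates.
    weights = []
    for name in classnames:
        texts = tokenizer([t.format(name) for t in templates]).to(device)
        emb = model.encode_text(texts)
        emb = emb / emb.norm(dim=-1, keepdim=True)
        mean = emb.mean(dim=0)
        weights.append(mean / mean.norm())
    return torch.stack(weights, dim=1)  # (embed_dim, n_classes)

@torch.no_grad()
def classify(model, images, classifier):
    feats = model.encode_image(images)
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return (100.0 * feats @ classifier).argmax(dim=-1)  # predicted class indices
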
2022-09-27,12:13:18 | INFO | Start epoch 3
2022-09-27,12:13:20 | INFO | Train Epoch: 3 [ 8192/20054016 (0%)] Loss: 2.2881 (2.288) Data (t): 1.630 Batch (t): 2.023, 4050.13/s LR: 0.000494 Logit Scale: 48.422
{"imagenet-zeroshot-val-top1": 0.07508, "imagenet-zeroshot-val-top5": 0.19812}
{"imagenet-zeroshot-val-top1": 0.16212, "imagenet-zeroshot-val-top5": 0.35246}
{"imagenet-zeroshot-val-top1": 0.2043, "imagenet-zeroshot-val-top5": 0.42438}