Created
September 27, 2022 10:17
-
-
Save mehdidc/106b443c8e0dbb711dacc8a8600c2839 to your computer and use it in GitHub Desktop.
Content of the files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# SLURM batch script: trains an open_clip ViT-B-32 model with 8 nodes x 4 tasks
# (one task per GPU), resuming from the latest checkpoint when one exists.
#SBATCH --account=cstdl
#SBATCH --nodes=8
#SBATCH --gres=gpu:4
#SBATCH --ntasks-per-node=4
#SBATCH --cpus-per-task=12
#SBATCH --wait-all-nodes=1
#SBATCH --time=00:30:00
#SBATCH --partition=batch
#SBATCH --job-name=open_clip

# Trace every command into the job log. Enabled here rather than on the
# shebang so it also applies when the file is run as `bash script.sh`.
set -x

# load low-level libraries
ml purge
ml use "$OTHERSTAGES"
ml Stages/2022
ml GCC/11.2.0
ml OpenMPI/4.1.2
ml CUDA/11.5
ml cuDNN/8.3.1.22-CUDA-11.5
ml NCCL/2.12.7-1-CUDA-11.5
ml PyTorch/1.11-CUDA-11.5
ml torchvision/0.12.0

# Python virtual environment, path relative to the submission directory.
source envs/hdfml/bin/activate

# Uncomment for verbose NCCL diagnostics.
#export NCCL_DEBUG=INFO
#export NCCL_DEBUG_SUBSYS=ALL

# Four GPUs per node, matching --gres=gpu:4 above.
export CUDA_VISIBLE_DEVICES=0,1,2,3
export MASTER_PORT=12802

# Use the first host of the allocation as the rendezvous address,
# e.g. a node list of gnodee[2-5],gnoded1 yields gnodee2.
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# NOTE(review): the appended "i" presumably selects a cluster-specific
# interface hostname for the master node -- confirm for the target system.
export MASTER_ADDR="${master_addr}i"
echo "MASTER_ADDR=${MASTER_ADDR}"

# Make the repository's src/ importable; avoid a leading empty entry when
# PYTHONPATH is unset or empty.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}${PWD}/src"

LOGS=logs/example
NAME=ViT-B-32-20M
CKPT="$LOGS/$NAME/checkpoints/epoch_latest.pt"

# Resume only if a previous run left a "latest" checkpoint behind. An array
# keeps "--resume <path>" as two intact arguments without relying on
# word-splitting, and expands to nothing when empty.
resume_args=()
if [[ -f "$CKPT" ]]; then
  resume_args=(--resume "$CKPT")
fi

# The brace range in --train-data is quoted on purpose: it is passed to the
# training script verbatim instead of being expanded by the shell.
srun --cpu_bind=v --accel-bind=gn python -u src/training/main.py \
  --save-frequency 1 \
  --zeroshot-frequency 1 \
  --train-data="/p/scratch/ccstdl/katta1/LAION-400M/laion400m-dat-release/{00000..02033}.tar" \
  --imagenet-val="/p/scratch/ccstdl/gordon2/imagenet_val" \
  --train-num-samples=20003822 \
  --warmup 2000 \
  --batch-size=256 \
  --epochs=32 \
  --workers=8 \
  --report-to=tensorboard \
  --model ViT-B-32 \
  --name "$NAME" \
  --logs "$LOGS" \
  --seed 0 \
  --lr 5.0e-4 \
  --ddp-static-graph \
  --local-loss \
  --gather-with-grad \
  --save-most-recent \
  "${resume_args[@]}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2022-09-27,10:52:02 | INFO | Added key: store_based_barrier_key:1 to store for rank: 0 | |
2022-09-27,10:52:02 | INFO | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 32 nodes. | |
2022-09-27,10:52:02 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 32. | |
2022-09-27,10:52:02 | INFO | Loading ViT-B-32 model config. | |
2022-09-27,10:52:05 | INFO | Model: | |
2022-09-27,10:52:05 | INFO | CLIP( | |
(visual): VisualTransformer( | |
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False) | |
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(transformer): Transformer( | |
(resblocks): ModuleList( | |
(0): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(1): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(2): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(3): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(4): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(5): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(6): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(7): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(8): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(9): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(10): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(11): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
) | |
) | |
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
) | |
(transformer): Transformer( | |
(resblocks): ModuleList( | |
(0): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(1): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(2): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(3): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(4): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(5): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(6): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(7): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(8): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(9): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(10): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(11): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
) | |
) | |
(token_embedding): Embedding(49408, 512) | |
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
) | |
2022-09-27,10:52:05 | INFO | Params: | |
2022-09-27,10:52:05 | INFO | batch_size: 256 | |
2022-09-27,10:52:05 | INFO | beta1: 0.9 | |
2022-09-27,10:52:05 | INFO | beta2: 0.98 | |
2022-09-27,10:52:05 | INFO | checkpoint_path: logs/example/ViT-B-32-20M/checkpoints | |
2022-09-27,10:52:05 | INFO | copy_codebase: False | |
2022-09-27,10:52:05 | INFO | csv_caption_key: title | |
2022-09-27,10:52:05 | INFO | csv_img_key: filepath | |
2022-09-27,10:52:05 | INFO | csv_separator: | |
2022-09-27,10:52:05 | INFO | dataset_resampled: False | |
2022-09-27,10:52:05 | INFO | dataset_type: auto | |
2022-09-27,10:52:05 | INFO | ddp_static_graph: True | |
2022-09-27,10:52:05 | INFO | debug: False | |
2022-09-27,10:52:05 | INFO | device: cuda:0 | |
2022-09-27,10:52:05 | INFO | dist_backend: nccl | |
2022-09-27,10:52:05 | INFO | dist_url: env:// | |
2022-09-27,10:52:05 | INFO | distributed: True | |
2022-09-27,10:52:05 | INFO | epochs: 32 | |
2022-09-27,10:52:05 | INFO | eps: 1e-06 | |
2022-09-27,10:52:05 | INFO | force_quick_gelu: False | |
2022-09-27,10:52:05 | INFO | gather_with_grad: True | |
2022-09-27,10:52:05 | INFO | grad_checkpointing: False | |
2022-09-27,10:52:05 | INFO | horovod: False | |
2022-09-27,10:52:05 | INFO | image_mean: None | |
2022-09-27,10:52:05 | INFO | image_std: None | |
2022-09-27,10:52:05 | INFO | imagenet_v2: None | |
2022-09-27,10:52:05 | INFO | imagenet_val: /p/scratch/ccstdl/gordon2/imagenet_val | |
2022-09-27,10:52:05 | INFO | local_loss: True | |
2022-09-27,10:52:05 | INFO | local_rank: 0 | |
2022-09-27,10:52:05 | INFO | lock_image: False | |
2022-09-27,10:52:05 | INFO | lock_image_freeze_bn_stats: False | |
2022-09-27,10:52:05 | INFO | lock_image_unlocked_groups: 0 | |
2022-09-27,10:52:05 | INFO | log_level: 20 | |
2022-09-27,10:52:05 | INFO | log_local: False | |
2022-09-27,10:52:05 | INFO | log_path: logs/example/ViT-B-32-20M/out.log | |
2022-09-27,10:52:05 | INFO | logs: logs/example | |
2022-09-27,10:52:05 | INFO | lr: 0.0005 | |
2022-09-27,10:52:05 | INFO | model: ViT-B-32 | |
2022-09-27,10:52:05 | INFO | name: ViT-B-32-20M | |
2022-09-27,10:52:05 | INFO | no_set_device_rank: False | |
2022-09-27,10:52:05 | INFO | norm_gradient_clip: None | |
2022-09-27,10:52:05 | INFO | precision: amp | |
2022-09-27,10:52:05 | INFO | pretrained: | |
2022-09-27,10:52:05 | INFO | pretrained_image: False | |
2022-09-27,10:52:05 | INFO | rank: 0 | |
2022-09-27,10:52:05 | INFO | report_to: tensorboard | |
2022-09-27,10:52:05 | INFO | resume: None | |
2022-09-27,10:52:05 | INFO | save_frequency: 1 | |
2022-09-27,10:52:05 | INFO | save_most_recent: True | |
2022-09-27,10:52:05 | INFO | seed: 0 | |
2022-09-27,10:52:05 | INFO | skip_scheduler: False | |
2022-09-27,10:52:05 | INFO | tensorboard: True | |
2022-09-27,10:52:05 | INFO | tensorboard_path: logs/example/ViT-B-32-20M/tensorboard | |
2022-09-27,10:52:05 | INFO | torchscript: False | |
2022-09-27,10:52:05 | INFO | trace: False | |
2022-09-27,10:52:05 | INFO | train_data: /p/scratch/ccstdl/katta1/LAION-400M/laion400m-dat-release/{00000..02033}.tar | |
2022-09-27,10:52:05 | INFO | train_num_samples: 20003822 | |
2022-09-27,10:52:05 | INFO | use_bn_sync: False | |
2022-09-27,10:52:05 | INFO | val_data: None | |
2022-09-27,10:52:05 | INFO | val_frequency: 1 | |
2022-09-27,10:52:05 | INFO | val_num_samples: None | |
2022-09-27,10:52:05 | INFO | wandb: False | |
2022-09-27,10:52:05 | INFO | wandb_notes: | |
2022-09-27,10:52:05 | INFO | warmup: 2000 | |
2022-09-27,10:52:05 | INFO | wd: 0.2 | |
2022-09-27,10:52:05 | INFO | workers: 8 | |
2022-09-27,10:52:05 | INFO | world_size: 32 | |
2022-09-27,10:52:05 | INFO | zeroshot_frequency: 1 | |
2022-09-27,10:52:06 | INFO | Start epoch 0 | |
2022-09-27,10:52:18 | INFO | Train Epoch: 0 [ 8192/20054016 (0%)] Loss: 9.0813 (9.081) Data (t): 3.241 Batch (t): 12.323, 664.795/s LR: 0.000000 Logit Scale: 14.286 | |
2022-09-27,10:52:19 | INFO | Reducer buckets have been rebuilt in this iteration. | |
2022-09-27,10:53:04 | INFO | Train Epoch: 0 [ 827392/20054016 (4%)] Loss: 8.8624 (8.972) Data (t): 0.073 Batch (t): 0.458, 18055.6/s LR: 0.000025 Logit Scale: 14.283 | |
2022-09-27,10:53:50 | INFO | Train Epoch: 0 [ 1646592/20054016 (8%)] Loss: 8.3184 (8.754) Data (t): 0.074 Batch (t): 0.459, 17636.6/s LR: 0.000050 Logit Scale: 14.282 | |
2022-09-27,10:54:36 | INFO | Train Epoch: 0 [ 2465792/20054016 (12%)] Loss: 8.0655 (8.582) Data (t): 0.074 Batch (t): 0.460, 17725.1/s LR: 0.000075 Logit Scale: 14.297 | |
2022-09-27,10:55:22 | INFO | Train Epoch: 0 [ 3284992/20054016 (16%)] Loss: 7.6955 (8.405) Data (t): 0.073 Batch (t): 0.459, 17984.9/s LR: 0.000100 Logit Scale: 14.367 | |
2022-09-27,10:56:08 | INFO | Train Epoch: 0 [ 4104192/20054016 (20%)] Loss: 7.6081 (8.272) Data (t): 0.074 Batch (t): 0.459, 17907.1/s LR: 0.000125 Logit Scale: 14.476 | |
2022-09-27,10:56:54 | INFO | Train Epoch: 0 [ 4923392/20054016 (25%)] Loss: 7.2475 (8.126) Data (t): 0.074 Batch (t): 0.460, 17617.1/s LR: 0.000150 Logit Scale: 14.633 | |
2022-09-27,10:57:40 | INFO | Train Epoch: 0 [ 5742592/20054016 (29%)] Loss: 6.9686 (7.981) Data (t): 0.073 Batch (t): 0.460, 18007.0/s LR: 0.000175 Logit Scale: 14.818 | |
2022-09-27,10:58:26 | INFO | Train Epoch: 0 [ 6561792/20054016 (33%)] Loss: 6.7461 (7.844) Data (t): 0.074 Batch (t): 0.460, 17910.2/s LR: 0.000200 Logit Scale: 15.067 | |
2022-09-27,10:59:11 | INFO | Train Epoch: 0 [ 7380992/20054016 (37%)] Loss: 6.8749 (7.747) Data (t): 0.073 Batch (t): 0.459, 17859.4/s LR: 0.000225 Logit Scale: 15.353 | |
2022-09-27,10:59:58 | INFO | Train Epoch: 0 [ 8200192/20054016 (41%)] Loss: 6.6846 (7.650) Data (t): 0.074 Batch (t): 0.461, 17568.7/s LR: 0.000250 Logit Scale: 15.703 | |
2022-09-27,11:00:44 | INFO | Train Epoch: 0 [ 9019392/20054016 (45%)] Loss: 6.3567 (7.542) Data (t): 0.074 Batch (t): 0.460, 18002.7/s LR: 0.000275 Logit Scale: 16.078 | |
2022-09-27,11:01:30 | INFO | Train Epoch: 0 [ 9838592/20054016 (49%)] Loss: 6.2377 (7.442) Data (t): 0.074 Batch (t): 0.459, 17845.4/s LR: 0.000300 Logit Scale: 16.525 | |
2022-09-27,11:02:15 | INFO | Train Epoch: 0 [10657792/20054016 (53%)] Loss: 6.1121 (7.347) Data (t): 0.073 Batch (t): 0.460, 18188.0/s LR: 0.000325 Logit Scale: 17.043 | |
2022-09-27,11:04:04 | INFO | Train Epoch: 0 [11476992/20054016 (57%)] Loss: 5.9579 (7.254) Data (t): 0.074 Batch (t): 1.088, 17697.1/s LR: 0.000350 Logit Scale: 17.598 | |
2022-09-27,11:04:50 | INFO | Train Epoch: 0 [12296192/20054016 (61%)] Loss: 5.4793 (7.144) Data (t): 0.074 Batch (t): 0.459, 17887.8/s LR: 0.000375 Logit Scale: 18.224 | |
2022-09-27,11:05:36 | INFO | Train Epoch: 0 [13115392/20054016 (65%)] Loss: 5.6995 (7.059) Data (t): 0.074 Batch (t): 0.460, 17601.5/s LR: 0.000400 Logit Scale: 18.905 | |
2022-09-27,11:06:22 | INFO | Train Epoch: 0 [13934592/20054016 (69%)] Loss: 5.6713 (6.982) Data (t): 0.073 Batch (t): 0.459, 18069.0/s LR: 0.000425 Logit Scale: 19.648 | |
2022-09-27,11:07:08 | INFO | Train Epoch: 0 [14753792/20054016 (74%)] Loss: 5.4377 (6.900) Data (t): 0.074 Batch (t): 0.461, 17819.1/s LR: 0.000450 Logit Scale: 20.300 | |
2022-09-27,11:07:54 | INFO | Train Epoch: 0 [15572992/20054016 (78%)] Loss: 5.2516 (6.818) Data (t): 0.074 Batch (t): 0.460, 18008.3/s LR: 0.000475 Logit Scale: 21.194 | |
2022-09-27,11:08:40 | INFO | Train Epoch: 0 [16392192/20054016 (82%)] Loss: 5.0659 (6.734) Data (t): 0.074 Batch (t): 0.461, 17626.9/s LR: 0.000500 Logit Scale: 22.089 | |
2022-09-27,11:09:27 | INFO | Train Epoch: 0 [17211392/20054016 (86%)] Loss: 5.2005 (6.665) Data (t): 0.073 Batch (t): 0.464, 17665.2/s LR: 0.000500 Logit Scale: 23.083 | |
2022-09-27,11:10:13 | INFO | Train Epoch: 0 [18030592/20054016 (90%)] Loss: 5.1313 (6.598) Data (t): 0.073 Batch (t): 0.461, 17438.2/s LR: 0.000500 Logit Scale: 24.062 | |
2022-09-27,11:10:59 | INFO | Train Epoch: 0 [18849792/20054016 (94%)] Loss: 4.8736 (6.526) Data (t): 0.074 Batch (t): 0.462, 17608.1/s LR: 0.000500 Logit Scale: 25.103 | |
2022-09-27,11:11:46 | INFO | Train Epoch: 0 [19668992/20054016 (98%)] Loss: 4.9132 (6.462) Data (t): 0.075 Batch (t): 0.467, 17932.5/s LR: 0.000500 Logit Scale: 26.143 | |
2022-09-27,11:12:07 | INFO | Train Epoch: 0 [20054016/20054016 (100%)] Loss: 4.7750 (6.397) Data (t): 0.074 Batch (t): 0.462, 18358.2/s LR: 0.000500 Logit Scale: 26.637 | |
2022-09-27,11:12:07 | INFO | Starting zero-shot imagenet. | |
2022-09-27,11:12:07 | INFO | Building zero-shot classifier | |
2022-09-27,11:13:02 | INFO | Using classifier | |
2022-09-27,11:15:48 | INFO | Finished zero-shot imagenet. | |
2022-09-27,11:15:48 | INFO | Eval Epoch: 1 imagenet-zeroshot-val-top1: 0.0751 imagenet-zeroshot-val-top5: 0.1981 | |
2022-09-27,11:15:52 | INFO | Start epoch 1 | |
2022-09-27,11:15:55 | INFO | Train Epoch: 1 [ 8192/20054016 (0%)] Loss: 4.4730 (4.473) Data (t): 2.671 Batch (t): 3.057, 2679.81/s LR: 0.000500 Logit Scale: 26.647 | |
2022-09-27,11:20:41 | INFO | Added key: store_based_barrier_key:1 to store for rank: 0 | |
2022-09-27,11:20:41 | INFO | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 32 nodes. | |
2022-09-27,11:20:41 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 32. | |
2022-09-27,11:20:41 | INFO | Loading ViT-B-32 model config. | |
2022-09-27,11:20:43 | INFO | Model: | |
2022-09-27,11:20:43 | INFO | CLIP( | |
(visual): VisualTransformer( | |
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False) | |
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(transformer): Transformer( | |
(resblocks): ModuleList( | |
(0): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(1): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(2): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(3): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(4): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(5): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(6): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(7): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(8): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(9): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(10): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(11): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
) | |
) | |
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
) | |
(transformer): Transformer( | |
(resblocks): ModuleList( | |
(0): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(1): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(2): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(3): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(4): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(5): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(6): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(7): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(8): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(9): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(10): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(11): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
) | |
) | |
(token_embedding): Embedding(49408, 512) | |
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
) | |
2022-09-27,11:20:43 | INFO | Params: | |
2022-09-27,11:20:43 | INFO | batch_size: 256 | |
2022-09-27,11:20:43 | INFO | beta1: 0.9 | |
2022-09-27,11:20:43 | INFO | beta2: 0.98 | |
2022-09-27,11:20:43 | INFO | checkpoint_path: logs/example/ViT-B-32-20M/checkpoints | |
2022-09-27,11:20:43 | INFO | copy_codebase: False | |
2022-09-27,11:20:43 | INFO | csv_caption_key: title | |
2022-09-27,11:20:43 | INFO | csv_img_key: filepath | |
2022-09-27,11:20:43 | INFO | csv_separator: | |
2022-09-27,11:20:43 | INFO | dataset_resampled: False | |
2022-09-27,11:20:43 | INFO | dataset_type: auto | |
2022-09-27,11:20:43 | INFO | ddp_static_graph: True | |
2022-09-27,11:20:43 | INFO | debug: False | |
2022-09-27,11:20:43 | INFO | device: cuda:0 | |
2022-09-27,11:20:43 | INFO | dist_backend: nccl | |
2022-09-27,11:20:43 | INFO | dist_url: env:// | |
2022-09-27,11:20:43 | INFO | distributed: True | |
2022-09-27,11:20:43 | INFO | epochs: 32 | |
2022-09-27,11:20:43 | INFO | eps: 1e-06 | |
2022-09-27,11:20:43 | INFO | force_quick_gelu: False | |
2022-09-27,11:20:43 | INFO | gather_with_grad: True | |
2022-09-27,11:20:43 | INFO | grad_checkpointing: False | |
2022-09-27,11:20:43 | INFO | horovod: False | |
2022-09-27,11:20:43 | INFO | image_mean: None | |
2022-09-27,11:20:43 | INFO | image_std: None | |
2022-09-27,11:20:43 | INFO | imagenet_v2: None | |
2022-09-27,11:20:43 | INFO | imagenet_val: /p/scratch/ccstdl/gordon2/imagenet_val | |
2022-09-27,11:20:43 | INFO | local_loss: True | |
2022-09-27,11:20:43 | INFO | local_rank: 0 | |
2022-09-27,11:20:43 | INFO | lock_image: False | |
2022-09-27,11:20:43 | INFO | lock_image_freeze_bn_stats: False | |
2022-09-27,11:20:43 | INFO | lock_image_unlocked_groups: 0 | |
2022-09-27,11:20:43 | INFO | log_level: 20 | |
2022-09-27,11:20:43 | INFO | log_local: False | |
2022-09-27,11:20:43 | INFO | log_path: logs/example/ViT-B-32-20M/out.log | |
2022-09-27,11:20:43 | INFO | logs: logs/example | |
2022-09-27,11:20:43 | INFO | lr: 0.0005 | |
2022-09-27,11:20:43 | INFO | model: ViT-B-32 | |
2022-09-27,11:20:43 | INFO | name: ViT-B-32-20M | |
2022-09-27,11:20:43 | INFO | no_set_device_rank: False | |
2022-09-27,11:20:43 | INFO | norm_gradient_clip: None | |
2022-09-27,11:20:43 | INFO | precision: amp | |
2022-09-27,11:20:43 | INFO | pretrained: | |
2022-09-27,11:20:43 | INFO | pretrained_image: False | |
2022-09-27,11:20:43 | INFO | rank: 0 | |
2022-09-27,11:20:43 | INFO | report_to: tensorboard | |
2022-09-27,11:20:43 | INFO | resume: logs/example/ViT-B-32-20M/checkpoints/epoch_latest.pt | |
2022-09-27,11:20:43 | INFO | save_frequency: 1 | |
2022-09-27,11:20:43 | INFO | save_most_recent: True | |
2022-09-27,11:20:43 | INFO | seed: 0 | |
2022-09-27,11:20:43 | INFO | skip_scheduler: False | |
2022-09-27,11:20:43 | INFO | tensorboard: True | |
2022-09-27,11:20:43 | INFO | tensorboard_path: logs/example/ViT-B-32-20M/tensorboard | |
2022-09-27,11:20:43 | INFO | torchscript: False | |
2022-09-27,11:20:43 | INFO | trace: False | |
2022-09-27,11:20:43 | INFO | train_data: /p/scratch/ccstdl/katta1/LAION-400M/laion400m-dat-release/{00000..02033}.tar | |
2022-09-27,11:20:43 | INFO | train_num_samples: 20003822 | |
2022-09-27,11:20:43 | INFO | use_bn_sync: False | |
2022-09-27,11:20:43 | INFO | val_data: None | |
2022-09-27,11:20:43 | INFO | val_frequency: 1 | |
2022-09-27,11:20:43 | INFO | val_num_samples: None | |
2022-09-27,11:20:43 | INFO | wandb: False | |
2022-09-27,11:20:43 | INFO | wandb_notes: | |
2022-09-27,11:20:43 | INFO | warmup: 2000 | |
2022-09-27,11:20:43 | INFO | wd: 0.2 | |
2022-09-27,11:20:43 | INFO | workers: 8 | |
2022-09-27,11:20:43 | INFO | world_size: 32 | |
2022-09-27,11:20:43 | INFO | zeroshot_frequency: 1 | |
2022-09-27,11:20:48 | INFO | => resuming checkpoint 'logs/example/ViT-B-32-20M/checkpoints/epoch_latest.pt' (epoch 1) | |
2022-09-27,11:20:48 | INFO | Start epoch 1 | |
2022-09-27,11:21:00 | INFO | Train Epoch: 1 [ 8192/20054016 (0%)] Loss: 4.7262 (4.726) Data (t): 3.628 Batch (t): 12.326, 664.617/s LR: 0.000500 Logit Scale: 26.647 | |
2022-09-27,11:21:01 | INFO | Reducer buckets have been rebuilt in this iteration. | |
2022-09-27,11:21:46 | INFO | Train Epoch: 1 [ 827392/20054016 (4%)] Loss: 4.4406 (4.583) Data (t): 0.072 Batch (t): 0.460, 18013.3/s LR: 0.000500 Logit Scale: 27.731 | |
2022-09-27,11:22:32 | INFO | Train Epoch: 1 [ 1646592/20054016 (8%)] Loss: 4.3635 (4.510) Data (t): 0.073 Batch (t): 0.460, 17848.4/s LR: 0.000500 Logit Scale: 28.743 | |
2022-09-27,11:23:19 | INFO | Train Epoch: 1 [ 2465792/20054016 (12%)] Loss: 4.6087 (4.535) Data (t): 0.072 Batch (t): 0.461, 17616.1/s LR: 0.000500 Logit Scale: 29.586 | |
2022-09-27,11:24:05 | INFO | Train Epoch: 1 [ 3284992/20054016 (16%)] Loss: 4.5415 (4.536) Data (t): 0.073 Batch (t): 0.461, 17434.1/s LR: 0.000500 Logit Scale: 30.362 | |
2022-09-27,11:24:51 | INFO | Train Epoch: 1 [ 4104192/20054016 (20%)] Loss: 4.3388 (4.503) Data (t): 0.073 Batch (t): 0.461, 17804.6/s LR: 0.000500 Logit Scale: 31.230 | |
2022-09-27,11:25:37 | INFO | Train Epoch: 1 [ 4923392/20054016 (25%)] Loss: 4.2594 (4.468) Data (t): 0.073 Batch (t): 0.460, 17900.5/s LR: 0.000500 Logit Scale: 31.951 | |
2022-09-27,11:26:23 | INFO | Train Epoch: 1 [ 5742592/20054016 (29%)] Loss: 4.1226 (4.425) Data (t): 0.073 Batch (t): 0.462, 17497.4/s LR: 0.000500 Logit Scale: 32.769 | |
2022-09-27,11:27:09 | INFO | Train Epoch: 1 [ 6561792/20054016 (33%)] Loss: 3.9446 (4.372) Data (t): 0.073 Batch (t): 0.461, 17653.3/s LR: 0.000500 Logit Scale: 33.478 | |
2022-09-27,11:27:55 | INFO | Train Epoch: 1 [ 7380992/20054016 (37%)] Loss: 3.9022 (4.325) Data (t): 0.073 Batch (t): 0.461, 17965.8/s LR: 0.000500 Logit Scale: 34.093 | |
2022-09-27,11:28:41 | INFO | Train Epoch: 1 [ 8200192/20054016 (41%)] Loss: 3.9148 (4.288) Data (t): 0.073 Batch (t): 0.462, 18020.7/s LR: 0.000500 Logit Scale: 34.800 | |
2022-09-27,11:29:28 | INFO | Train Epoch: 1 [ 9019392/20054016 (45%)] Loss: 3.8163 (4.248) Data (t): 0.073 Batch (t): 0.461, 17457.7/s LR: 0.000499 Logit Scale: 35.438 | |
2022-09-27,11:30:14 | INFO | Train Epoch: 1 [ 9838592/20054016 (49%)] Loss: 3.7082 (4.207) Data (t): 0.073 Batch (t): 0.462, 17738.3/s LR: 0.000499 Logit Scale: 36.000 | |
2022-09-27,11:31:00 | INFO | Train Epoch: 1 [10657792/20054016 (53%)] Loss: 3.4084 (4.150) Data (t): 0.074 Batch (t): 0.462, 17661.3/s LR: 0.000499 Logit Scale: 36.571 | |
2022-09-27,11:31:46 | INFO | Train Epoch: 1 [11476992/20054016 (57%)] Loss: 3.5990 (4.113) Data (t): 0.073 Batch (t): 0.461, 17817.6/s LR: 0.000499 Logit Scale: 37.060 | |
2022-09-27,11:32:32 | INFO | Train Epoch: 1 [12296192/20054016 (61%)] Loss: 3.8248 (4.095) Data (t): 0.073 Batch (t): 0.461, 17991.9/s LR: 0.000499 Logit Scale: 37.553 | |
2022-09-27,11:33:18 | INFO | Train Epoch: 1 [13115392/20054016 (65%)] Loss: 3.3610 (4.052) Data (t): 0.073 Batch (t): 0.462, 17919.3/s LR: 0.000499 Logit Scale: 38.000 | |
2022-09-27,11:34:05 | INFO | Train Epoch: 1 [13934592/20054016 (69%)] Loss: 3.5046 (4.021) Data (t): 0.073 Batch (t): 0.463, 17606.2/s LR: 0.000499 Logit Scale: 38.528 | |
2022-09-27,11:34:51 | INFO | Train Epoch: 1 [14753792/20054016 (74%)] Loss: 3.5596 (3.997) Data (t): 0.073 Batch (t): 0.461, 17863.1/s LR: 0.000499 Logit Scale: 39.018 | |
2022-09-27,11:35:37 | INFO | Train Epoch: 1 [15572992/20054016 (78%)] Loss: 3.5188 (3.973) Data (t): 0.073 Batch (t): 0.462, 17523.2/s LR: 0.000499 Logit Scale: 39.465 | |
2022-09-27,11:36:23 | INFO | Train Epoch: 1 [16392192/20054016 (82%)] Loss: 3.1615 (3.935) Data (t): 0.073 Batch (t): 0.461, 17629.2/s LR: 0.000499 Logit Scale: 39.938 | |
2022-09-27,11:37:09 | INFO | Train Epoch: 1 [17211392/20054016 (86%)] Loss: 3.2076 (3.901) Data (t): 0.074 Batch (t): 0.463, 17572.2/s LR: 0.000499 Logit Scale: 40.329 | |
2022-09-27,11:37:56 | INFO | Train Epoch: 1 [18030592/20054016 (90%)] Loss: 3.1250 (3.868) Data (t): 0.073 Batch (t): 0.462, 17993.2/s LR: 0.000499 Logit Scale: 40.778 | |
2022-09-27,11:38:42 | INFO | Train Epoch: 1 [18849792/20054016 (94%)] Loss: 3.4095 (3.849) Data (t): 0.074 Batch (t): 0.463, 17656.7/s LR: 0.000498 Logit Scale: 41.177 | |
2022-09-27,11:39:29 | INFO | Train Epoch: 1 [19668992/20054016 (98%)] Loss: 3.0969 (3.819) Data (t): 0.076 Batch (t): 0.470, 17204.0/s LR: 0.000498 Logit Scale: 41.611 | |
2022-09-27,11:39:51 | INFO | Train Epoch: 1 [20054016/20054016 (100%)] Loss: 3.1594 (3.793) Data (t): 0.075 Batch (t): 0.464, 18190.3/s LR: 0.000498 Logit Scale: 41.765 | |
2022-09-27,11:39:51 | INFO | Starting zero-shot imagenet. | |
2022-09-27,11:39:51 | INFO | Building zero-shot classifier | |
2022-09-27,11:40:45 | INFO | Using classifier | |
2022-09-27,11:43:39 | INFO | Finished zero-shot imagenet. | |
2022-09-27,11:43:39 | INFO | Eval Epoch: 2 imagenet-zeroshot-val-top1: 0.1621 imagenet-zeroshot-val-top5: 0.3525 | |
2022-09-27,11:43:43 | INFO | Start epoch 2 | |
2022-09-27,11:43:45 | INFO | Train Epoch: 2 [ 8192/20054016 (0%)] Loss: 2.3822 (2.382) Data (t): 1.644 Batch (t): 2.037, 4020.91/s LR: 0.000498 Logit Scale: 41.771 | |
2022-09-27,11:50:43 | INFO | Added key: store_based_barrier_key:1 to store for rank: 0 | |
2022-09-27,11:50:43 | INFO | Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 32 nodes. | |
2022-09-27,11:50:43 | INFO | Running in distributed mode with multiple processes. Device: cuda:0.Process (global: 0, local 0), total 32. | |
2022-09-27,11:50:43 | INFO | Loading ViT-B-32 model config. | |
2022-09-27,11:50:45 | INFO | Model: | |
2022-09-27,11:50:45 | INFO | CLIP( | |
(visual): VisualTransformer( | |
(conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False) | |
(ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(transformer): Transformer( | |
(resblocks): ModuleList( | |
(0): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(1): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(2): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(3): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(4): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(5): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(6): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(7): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(8): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(9): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(10): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
(11): ResidualAttentionBlock( | |
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=768, out_features=3072, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=3072, out_features=768, bias=True) | |
) | |
) | |
) | |
) | |
(ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True) | |
) | |
(transformer): Transformer( | |
(resblocks): ModuleList( | |
(0): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(1): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(2): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(3): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(4): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(5): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(6): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(7): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(8): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(9): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(10): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
(11): ResidualAttentionBlock( | |
(ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(attn): MultiheadAttention( | |
(out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True) | |
) | |
(ln_attn): Identity() | |
(ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
(mlp): Sequential( | |
(c_fc): Linear(in_features=512, out_features=2048, bias=True) | |
(ln): Identity() | |
(gelu): GELU() | |
(c_proj): Linear(in_features=2048, out_features=512, bias=True) | |
) | |
) | |
) | |
) | |
(token_embedding): Embedding(49408, 512) | |
(ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True) | |
) | |
2022-09-27,11:50:45 | INFO | Params: | |
2022-09-27,11:50:45 | INFO | batch_size: 256 | |
2022-09-27,11:50:45 | INFO | beta1: 0.9 | |
2022-09-27,11:50:45 | INFO | beta2: 0.98 | |
2022-09-27,11:50:45 | INFO | checkpoint_path: logs/example/ViT-B-32-20M/checkpoints | |
2022-09-27,11:50:45 | INFO | copy_codebase: False | |
2022-09-27,11:50:45 | INFO | csv_caption_key: title | |
2022-09-27,11:50:45 | INFO | csv_img_key: filepath | |
2022-09-27,11:50:45 | INFO | csv_separator: | |
2022-09-27,11:50:45 | INFO | dataset_resampled: False | |
2022-09-27,11:50:45 | INFO | dataset_type: auto | |
2022-09-27,11:50:45 | INFO | ddp_static_graph: True | |
2022-09-27,11:50:45 | INFO | debug: False | |
2022-09-27,11:50:45 | INFO | device: cuda:0 | |
2022-09-27,11:50:45 | INFO | dist_backend: nccl | |
2022-09-27,11:50:45 | INFO | dist_url: env:// | |
2022-09-27,11:50:45 | INFO | distributed: True | |
2022-09-27,11:50:45 | INFO | epochs: 32 | |
2022-09-27,11:50:45 | INFO | eps: 1e-06 | |
2022-09-27,11:50:45 | INFO | force_quick_gelu: False | |
2022-09-27,11:50:45 | INFO | gather_with_grad: True | |
2022-09-27,11:50:45 | INFO | grad_checkpointing: False | |
2022-09-27,11:50:45 | INFO | horovod: False | |
2022-09-27,11:50:45 | INFO | image_mean: None | |
2022-09-27,11:50:45 | INFO | image_std: None | |
2022-09-27,11:50:45 | INFO | imagenet_v2: None | |
2022-09-27,11:50:45 | INFO | imagenet_val: /p/scratch/ccstdl/gordon2/imagenet_val | |
2022-09-27,11:50:45 | INFO | local_loss: True | |
2022-09-27,11:50:45 | INFO | local_rank: 0 | |
2022-09-27,11:50:45 | INFO | lock_image: False | |
2022-09-27,11:50:45 | INFO | lock_image_freeze_bn_stats: False | |
2022-09-27,11:50:45 | INFO | lock_image_unlocked_groups: 0 | |
2022-09-27,11:50:45 | INFO | log_level: 20 | |
2022-09-27,11:50:45 | INFO | log_local: False | |
2022-09-27,11:50:45 | INFO | log_path: logs/example/ViT-B-32-20M/out.log | |
2022-09-27,11:50:45 | INFO | logs: logs/example | |
2022-09-27,11:50:45 | INFO | lr: 0.0005 | |
2022-09-27,11:50:45 | INFO | model: ViT-B-32 | |
2022-09-27,11:50:45 | INFO | name: ViT-B-32-20M | |
2022-09-27,11:50:45 | INFO | no_set_device_rank: False | |
2022-09-27,11:50:45 | INFO | norm_gradient_clip: None | |
2022-09-27,11:50:45 | INFO | precision: amp | |
2022-09-27,11:50:45 | INFO | pretrained: | |
2022-09-27,11:50:45 | INFO | pretrained_image: False | |
2022-09-27,11:50:45 | INFO | rank: 0 | |
2022-09-27,11:50:45 | INFO | report_to: tensorboard | |
2022-09-27,11:50:45 | INFO | resume: logs/example/ViT-B-32-20M/checkpoints/epoch_latest.pt | |
2022-09-27,11:50:45 | INFO | save_frequency: 1 | |
2022-09-27,11:50:45 | INFO | save_most_recent: True | |
2022-09-27,11:50:45 | INFO | seed: 0 | |
2022-09-27,11:50:45 | INFO | skip_scheduler: False | |
2022-09-27,11:50:45 | INFO | tensorboard: True | |
2022-09-27,11:50:45 | INFO | tensorboard_path: logs/example/ViT-B-32-20M/tensorboard | |
2022-09-27,11:50:45 | INFO | torchscript: False | |
2022-09-27,11:50:45 | INFO | trace: False | |
2022-09-27,11:50:45 | INFO | train_data: /p/scratch/ccstdl/katta1/LAION-400M/laion400m-dat-release/{00000..02033}.tar | |
2022-09-27,11:50:45 | INFO | train_num_samples: 20003822 | |
2022-09-27,11:50:45 | INFO | use_bn_sync: False | |
2022-09-27,11:50:45 | INFO | val_data: None | |
2022-09-27,11:50:45 | INFO | val_frequency: 1 | |
2022-09-27,11:50:45 | INFO | val_num_samples: None | |
2022-09-27,11:50:45 | INFO | wandb: False | |
2022-09-27,11:50:45 | INFO | wandb_notes: | |
2022-09-27,11:50:45 | INFO | warmup: 2000 | |
2022-09-27,11:50:45 | INFO | wd: 0.2 | |
2022-09-27,11:50:45 | INFO | workers: 8 | |
2022-09-27,11:50:45 | INFO | world_size: 32 | |
2022-09-27,11:50:45 | INFO | zeroshot_frequency: 1 | |
2022-09-27,11:50:49 | INFO | => resuming checkpoint 'logs/example/ViT-B-32-20M/checkpoints/epoch_latest.pt' (epoch 2) | |
2022-09-27,11:50:50 | INFO | Start epoch 2 | |
2022-09-27,11:51:02 | INFO | Train Epoch: 2 [ 8192/20054016 (0%)] Loss: 2.7975 (2.798) Data (t): 2.658 Batch (t): 12.794, 640.301/s LR: 0.000498 Logit Scale: 41.771 | |
2022-09-27,11:51:03 | INFO | Reducer buckets have been rebuilt in this iteration. | |
2022-09-27,11:51:49 | INFO | Train Epoch: 2 [ 827392/20054016 (4%)] Loss: 3.5042 (3.151) Data (t): 0.073 Batch (t): 0.461, 17648.2/s LR: 0.000498 Logit Scale: 42.328 | |
2022-09-27,11:52:35 | INFO | Train Epoch: 2 [ 1646592/20054016 (8%)] Loss: 2.9606 (3.087) Data (t): 0.073 Batch (t): 0.461, 17914.9/s LR: 0.000498 Logit Scale: 42.645 | |
2022-09-27,11:53:21 | INFO | Train Epoch: 2 [ 2465792/20054016 (12%)] Loss: 2.9361 (3.050) Data (t): 0.073 Batch (t): 0.460, 17689.3/s LR: 0.000498 Logit Scale: 43.034 | |
2022-09-27,11:54:07 | INFO | Train Epoch: 2 [ 3284992/20054016 (16%)] Loss: 2.8919 (3.018) Data (t): 0.073 Batch (t): 0.462, 17698.4/s LR: 0.000498 Logit Scale: 43.372 | |
2022-09-27,11:54:53 | INFO | Train Epoch: 2 [ 4104192/20054016 (20%)] Loss: 3.0383 (3.021) Data (t): 0.073 Batch (t): 0.460, 17500.3/s LR: 0.000498 Logit Scale: 43.761 | |
2022-09-27,11:55:39 | INFO | Train Epoch: 2 [ 4923392/20054016 (25%)] Loss: 3.0318 (3.023) Data (t): 0.073 Batch (t): 0.461, 17537.4/s LR: 0.000497 Logit Scale: 44.074 | |
2022-09-27,11:56:25 | INFO | Train Epoch: 2 [ 5742592/20054016 (29%)] Loss: 3.0539 (3.027) Data (t): 0.074 Batch (t): 0.461, 17437.9/s LR: 0.000497 Logit Scale: 44.365 | |
2022-09-27,11:57:11 | INFO | Train Epoch: 2 [ 6561792/20054016 (33%)] Loss: 2.8991 (3.013) Data (t): 0.073 Batch (t): 0.460, 17586.9/s LR: 0.000497 Logit Scale: 44.653 | |
2022-09-27,11:57:57 | INFO | Train Epoch: 2 [ 7380992/20054016 (37%)] Loss: 2.8849 (3.000) Data (t): 0.073 Batch (t): 0.461, 17781.6/s LR: 0.000497 Logit Scale: 44.939 | |
2022-09-27,11:58:43 | INFO | Train Epoch: 2 [ 8200192/20054016 (41%)] Loss: 2.9602 (2.996) Data (t): 0.073 Batch (t): 0.462, 17410.0/s LR: 0.000497 Logit Scale: 45.274 | |
2022-09-27,11:59:30 | INFO | Train Epoch: 2 [ 9019392/20054016 (45%)] Loss: 3.0347 (2.999) Data (t): 0.073 Batch (t): 0.461, 17987.2/s LR: 0.000497 Logit Scale: 45.578 | |
2022-09-27,12:00:16 | INFO | Train Epoch: 2 [ 9838592/20054016 (49%)] Loss: 2.8910 (2.991) Data (t): 0.073 Batch (t): 0.462, 17896.4/s LR: 0.000496 Logit Scale: 45.806 | |
2022-09-27,12:01:02 | INFO | Train Epoch: 2 [10657792/20054016 (53%)] Loss: 3.0885 (2.998) Data (t): 0.073 Batch (t): 0.460, 17844.2/s LR: 0.000496 Logit Scale: 46.044 | |
2022-09-27,12:01:48 | INFO | Train Epoch: 2 [11476992/20054016 (57%)] Loss: 2.8017 (2.985) Data (t): 0.074 Batch (t): 0.462, 17944.8/s LR: 0.000496 Logit Scale: 46.305 | |
2022-09-27,12:02:34 | INFO | Train Epoch: 2 [12296192/20054016 (61%)] Loss: 2.7566 (2.971) Data (t): 0.073 Batch (t): 0.460, 17502.2/s LR: 0.000496 Logit Scale: 46.519 | |
2022-09-27,12:03:20 | INFO | Train Epoch: 2 [13115392/20054016 (65%)] Loss: 2.8444 (2.963) Data (t): 0.074 Batch (t): 0.462, 17874.0/s LR: 0.000496 Logit Scale: 46.706 | |
2022-09-27,12:04:06 | INFO | Train Epoch: 2 [13934592/20054016 (69%)] Loss: 2.6265 (2.945) Data (t): 0.073 Batch (t): 0.462, 17448.2/s LR: 0.000496 Logit Scale: 46.888 | |
2022-09-27,12:04:53 | INFO | Train Epoch: 2 [14753792/20054016 (74%)] Loss: 2.9124 (2.943) Data (t): 0.073 Batch (t): 0.462, 17540.4/s LR: 0.000495 Logit Scale: 47.111 | |
2022-09-27,12:05:39 | INFO | Train Epoch: 2 [15572992/20054016 (78%)] Loss: 2.8231 (2.937) Data (t): 0.073 Batch (t): 0.461, 18001.5/s LR: 0.000495 Logit Scale: 47.402 | |
2022-09-27,12:06:25 | INFO | Train Epoch: 2 [16392192/20054016 (82%)] Loss: 2.6433 (2.923) Data (t): 0.074 Batch (t): 0.462, 17880.8/s LR: 0.000495 Logit Scale: 47.560 | |
2022-09-27,12:07:11 | INFO | Train Epoch: 2 [17211392/20054016 (86%)] Loss: 2.7611 (2.916) Data (t): 0.074 Batch (t): 0.464, 17470.8/s LR: 0.000495 Logit Scale: 47.772 | |
2022-09-27,12:07:58 | INFO | Train Epoch: 2 [18030592/20054016 (90%)] Loss: 2.6497 (2.904) Data (t): 0.073 Batch (t): 0.461, 17497.8/s LR: 0.000495 Logit Scale: 47.934 | |
2022-09-27,12:08:44 | INFO | Train Epoch: 2 [18849792/20054016 (94%)] Loss: 2.8205 (2.900) Data (t): 0.074 Batch (t): 0.462, 17784.4/s LR: 0.000494 Logit Scale: 48.150 | |
2022-09-27,12:09:31 | INFO | Train Epoch: 2 [19668992/20054016 (98%)] Loss: 2.6630 (2.891) Data (t): 0.077 Batch (t): 0.468, 17238.0/s LR: 0.000494 Logit Scale: 48.344 | |
2022-09-27,12:09:52 | INFO | Train Epoch: 2 [20054016/20054016 (100%)] Loss: 2.4420 (2.874) Data (t): 0.075 Batch (t): 0.462, 18285.9/s LR: 0.000494 Logit Scale: 48.412 | |
2022-09-27,12:09:52 | INFO | Starting zero-shot imagenet. | |
2022-09-27,12:09:52 | INFO | Building zero-shot classifier | |
2022-09-27,12:10:47 | INFO | Using classifier | |
2022-09-27,12:13:14 | INFO | Finished zero-shot imagenet. | |
2022-09-27,12:13:14 | INFO | Eval Epoch: 3 imagenet-zeroshot-val-top1: 0.2043 imagenet-zeroshot-val-top5: 0.4244 | |
2022-09-27,12:13:18 | INFO | Start epoch 3 | |
2022-09-27,12:13:20 | INFO | Train Epoch: 3 [ 8192/20054016 (0%)] Loss: 2.2881 (2.288) Data (t): 1.630 Batch (t): 2.023, 4050.13/s LR: 0.000494 Logit Scale: 48.422 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"imagenet-zeroshot-val-top1": 0.07508, "imagenet-zeroshot-val-top5": 0.19812} | |
{"imagenet-zeroshot-val-top1": 0.16212, "imagenet-zeroshot-val-top5": 0.35246} | |
{"imagenet-zeroshot-val-top1": 0.2043, "imagenet-zeroshot-val-top5": 0.42438} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment