Animesh Jain (anijain2305) · Meta Platforms
WARNING: torch sampled_addmm does not support batch indices. Benchmarked by iterating over batches. Can be improved significantly
Swin Transformer
[----------------------------------------------------------- sddmm ------------------------------------------------------------]
                                       |  torch_dense  |  torch_sddmm  |  csr_sputnik  |   csr_ge   |   coo_ge  |  csr_to_coo
1 threads: ---------------------------------------------------------------------------------------------------------------------
  B=  96, M=3136, K= 32, prob=0.0000   |     9532.0    |    47358.3    |    20816.9    |  106767.1  |  44071.4  |    259.6
  B= 192, M= 784, K= 32, prob=0.7500   |     1168.4    |     9095.9    |      728.3    |    2942.2  |   1480.8  |    117.1
  B= 384, M= 196, K= 32, prob=0.9375   |      171.0    |    17444.5    |      103.0    |      91.8  |    148.3  |    117.3
  B= 768, M=  49, K= 32, prob=0.9844   |       78.6    |    34361.4    |       18.7    |      12.8  |    145.
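The warning above refers to torch.sparse.sampled_addmm accepting only 2-D sparse CSR inputs, so the torch_sddmm column was measured with a Python-level loop over the batch dimension, which is consistent with it falling further behind as B grows. A minimal sketch of that per-batch workaround (names and shapes are illustrative, not the benchmark's actual harness):

import torch

def batched_sampled_addmm(masks, a, b):
    # masks: B two-dimensional sparse CSR tensors giving the sampling pattern
    # a: (B, M, K) dense, b: (B, K, M) dense
    # sampled_addmm has no batched variant here, so loop over B in Python;
    # the per-iteration overhead is what the warning calls out.
    return [torch.sparse.sampled_addmm(masks[i], a[i], b[i])
            for i in range(a.size(0))]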
import importlib
import gc
import os
import sys
import logging
import torch
from os.path import abspath
from os.path import exists
import itertools
(/scratch/anijain/work/env) anijain@a100-st-p4d24xlarge-58 /scratch/anijain/work/torchdynamo (wconstab/dynamic) $ python benchmarks/torchbench.py --dynamic_shapes --training --nvfuser --accuracy-aot-ts-mincut --devices cuda --repeat 1 -k hf_Bert
cuda train hf_Bert DEBUG torchdynamo.optimizations.training: Unable to use AOT Autograd because graph has mutation
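Here "graph has mutation" means the FX graph that TorchDynamo captured contains in-place ops, which this AOT Autograd path cannot handle, so it falls back for each such subgraph. A toy illustration of the kind of graph that trips this check (hf_Bert's real subgraphs are of course much larger):

import torch
import torch.fx as fx

def f(x):
    x.add_(1)        # in-place mutation of an input tensor
    return x.relu()

# The traced graph contains a call_method node for add_, i.e. a mutation.
print(fx.symbolic_trace(f).graph)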
#!/bin/bash
# Recreate binbao/update_pytorch_pin on top of main and bump the PyTorch
# nightly pin in the Makefile and README to today's date.
set -x -e
cd /data/home/binbao/cluster/torchdynamo-update-pin/
git checkout main && git pull
git branch -D binbao/update_pytorch_pin 2>/dev/null || true  # fine if the branch doesn't exist yet
git checkout -b binbao/update_pytorch_pin
nightly=$(date +'%Y%m%d')
# Rewrite "PYTORCH_VERSION ?= dev<YYYYMMDD>" in the Makefile ...
sed -i 's/PYTORCH_VERSION ?= dev.*/PYTORCH_VERSION ?= dev'"$nightly"'/' Makefile
# ... and every ".dev<YYYYMMDD>+" version string in the README.
sed -i 's/\.dev.*+/\.dev'"$nightly"'+/g' README.md
git commit -a -m "Update PyTorch pin"
git push -f  # the branch is rebuilt from main each run, so force-push is required
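Pinning by date stamp (dev<YYYYMMDD>) keeps the Makefile and the version strings in README.md pointing at the same PyTorch nightly, and rebuilding the branch from main on every run is what makes the final git push -f necessary.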
import torch
from torch import tensor, device
import torch.fx as fx
from torchdynamo.testing import rand_strided
from math import inf
from torch.fx.experimental.proxy_tensor import make_fx
# torch version: 1.13.0a0+git071f875
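These imports are the standard preamble of the minified repros that follow: rand_strided builds input tensors with explicit sizes and strides, and make_fx traces a function into an FX graph of ATen ops. A minimal, hypothetical use of make_fx, continuing the imports above:

def f(x):
    return torch.sin(x) + x

gm = make_fx(f)(torch.randn(4))  # trace f into a GraphModule of aten ops
print(gm.graph)                  # shows aten.sin / aten.add nodes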
import triton
import triton.language as tl
from torchinductor.ir import ReductionHint
from torchinductor.triton_ops.autotune import pointwise
from torchinductor.utils import instance_descriptor
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*i64', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp32', 11: '*fp32', 12: '*fp32', 13: '*fp32', 14: '*fp32', 15: '*fp32', 16: '*fp32', 17: '*fp32', 18: '*fp32', 19: '*fp32', 20: '*fp32', 21: '*fp32', 22: '*fp32', 23: '*fp32', 24: '*fp32', 25: '*fp32', 26: '*fp32', 27: '*fp32', 28: '*fp32', 29: '*fp32', 30: '*fp32', 31: '*fp32', 32: '*fp32', 33: '*fp32', 34: '*fp32', 35: '*fp32', 36: '*fp32', 37: '*fp32', 38: '*fp32', 39: '*fp32', 40: '*fp32', 41: '*fp32', 42: '*fp32', 43: '*fp32', 44: '*fp32', 45: '*fp32', 46: '*fp32', 47: '*fp32', 48: '*fp32', 49: '*fp32', 50: '*fp32', 51: '*fp32', 52: '*fp32', 53: '*fp32', 54: '*fp32', 55: '*fp32', 56: '*fp32', 57: '*fp32', 58: '*fp32',
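The decorated kernel body is cut off above by the gist preview. For orientation, an inductor-generated pointwise kernel of this style has roughly the following shape; this is a hand-written sketch with far fewer pointer arguments, and the meta values are illustrative rather than the truncated kernel's real contents:

@pointwise(size_hints=[1024], filename=__file__,
           meta={'signature': {0: '*fp32', 1: '*fp32', 2: 'i32'},
                 'device': 0, 'constants': {},
                 'configs': [instance_descriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]})
@triton.jit
def kernel(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    tmp0 = tl.load(in_ptr0 + xindex, xmask)
    tmp1 = tmp0 + 1.0              # the fused pointwise computation
    tl.store(out_ptr0 + xindex, tmp1, xmask)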
from ctypes import c_void_p, c_long
import torch
import random
from torch import empty_strided, as_strided, device
from torchinductor.codecache import AsyncCompile
aten = torch.ops.aten
async_compile = AsyncCompile()
import triton
import triton.language as tl
from torchinductor.ir import ReductionHint
from torchinductor.triton_ops.autotune import pointwise
from torchinductor.utils import instance_descriptor
@pointwise(size_hints=[4194304], filename=__file__, meta={'signature': {0: '*fp32', 1: '*i64', 2: '*fp32', 3: '*fp32', 4: '*fp32', 5: '*fp32', 6: '*fp32', 7: '*fp32', 8: '*fp32', 9: '*fp32', 10: '*fp32', 11: '*fp32', 12: '*fp32', 13: '*fp32', 14: '*fp32', 15: '*fp32', 16: '*fp32', 17: '*fp32', 18: '*fp32', 19: '*fp32', 20: '*fp32', 21: '*fp32', 22: '*fp32', 23: '*fp32', 24: '*fp32', 25: '*fp32', 26: '*fp32', 27: '*fp32', 28: '*fp32', 29: '*fp32', 30: '*fp32', 31: '*fp32', 32: '*fp32', 33: '*fp32', 34: '*fp32', 35: '*fp32', 36: '*fp32', 37: '*fp32', 38: '*fp32', 39: '*fp32', 40: '*fp32', 41: '*fp32', 42: '*fp32', 43: '*fp32', 44: '*fp32', 45: '*fp32', 46: '*fp32', 47: '*fp32', 48: '*fp32', 49: '*fp32', 50: '*fp32', 51: '*fp32', 52: '*fp32', 53: '*fp32', 54: '*fp32', 55: '*fp32', 56: '*fp32', 57: '*fp32', 58: '*fp32',
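All of the truncated inductor outputs above share the same wrapper layout: each Triton kernel's source is handed to async_compile.triton as a string, async_compile.wait blocks until compilation finishes, and a call function allocates buffers and launches the kernels. Schematically (a sketch of the layout, not the real truncated contents):

kernel0 = async_compile.triton('''
import triton
import triton.language as tl

@triton.jit
def kernel0(in_ptr0, out_ptr0, xnumel, XBLOCK: tl.constexpr):
    ...  # pointwise body as in the kernels above
''')

async_compile.wait(globals())  # block until every kernel has compiled
del async_compile

def call(args):
    # allocate output buffers with empty_strided, launch each compiled
    # kernel with its grid and stream, and return the result tensors
    ...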