TORCH_LOGS="recompiles" PYTORCH_TEST_WITH_DYNAMO=1 python test/test_optim.py -k test_foreach_matches_forloop_AdamW_cuda
[WARNING]:Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
[DEBUG]:Recompiling function wrapper in /data/users/mlazos/pytorch/torch/optim/optimizer.py:463
triggered by the following guard failure(s):
- ___check_obj_id(L['args'][0], 139797595855440)
[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adamw.py:190
triggered by the following guard failure(s):
- ___check_obj_id(L['self'], 139797595855440)
[DEBUG]:Recompiling function step in /data/users/mlazos/pytorch/torch/optim/adamw.py:190
triggered by the following guard failure(s):
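The ___check_obj_id guard on L['self'] means the compiled step() is specialized on the AdamW instance itself, so every new optimizer object forces a recompile. A minimal standalone sketch of that behavior (my own repro, not taken from the test suite; shapes and names are made up):

import torch

params = [torch.nn.Parameter(torch.randn(4, 4, device="cuda"))]
for p in params:
    p.grad = torch.randn_like(p)

opt1 = torch.optim.AdamW(params)
opt2 = torch.optim.AdamW(params)

@torch.compile()
def compiled_step(opt):
    opt.step()

# With TORCH_LOGS="recompiles", the second call should fail the
# ___check_obj_id(L['self'], ...) guard and recompile adamw.step.
compiled_step(opt1)
compiled_step(opt2)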
--> TORCH_LOGS="dynamo" python nvembed.py
setup passages
A new version of the following files was downloaded from https://huggingface.co/nvidia/NV-Embed-v1:
- configuration_nvembed.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/nvidia/NV-Embed-v1:
- modeling_nvembed.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading shards: 100%|███████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 9.86it/s]
Loading checkpoint shards: 100%|████████████████████████████████████████████████████| 4/4 [00:03<00:00, 1.16it/s]
--> TORCH_LOGS="dynamo" python nvembed.py
setup passages
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████| 4/4 [00:03<00:00, 1.07it/s]
downloaded model
[INFO]:Step 1: torchdynamo start tracing fn /data/users/mlazos/empathy_day/nvembed.py:31
[INFO]:Restarting analysis due to _dynamo/symbolic_convert.py:148 in fail_and_restart_analysis
[INFO]:Step 1: torchdynamo start tracing fn /data/users/mlazos/empathy_day/nvembed.py:31
[INFO]:produce_guards
[INFO]:Step 1: torchdynamo start tracing encode /home/mlazos/.cache/huggingface/modules/transformers_modules/nvidia/NV-Embed-v1/97aefcdd69565404f4a24de8ca4eb8114cb25ff0/modeling_nvembed.py:403
[INFO]:Restarting analysis due to _dynamo/symbolic_convert.py:148 in fail_and_restart_analysis
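The trace above comes from a small driver script; a rough reconstruction of what nvembed.py likely does (the placeholder passages, the wrapper name fn, and the exact arguments to encode are my assumptions, with encode itself coming from the downloaded modeling_nvembed.py):

import torch
from transformers import AutoModel

print("setup passages")
passages = ["example passage one", "example passage two"]  # placeholder inputs

model = AutoModel.from_pretrained("nvidia/NV-Embed-v1", trust_remote_code=True).to("cuda")
print("downloaded model")

@torch.compile()
def fn(texts):
    # encode() is defined by the model's remote code (modeling_nvembed.py:403 above)
    return model.encode(texts)

embeddings = fn(passages)
print(embeddings.shape)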
import torch
from torch.optim.lr_scheduler import LambdaLR, ChainedScheduler, ConstantLR, SequentialLR
from torch.testing._internal.common_utils import CudaMemoryLeakCheck, TestCase, run_tests
def chained_fn():
    with CudaMemoryLeakCheck(None, name="hi"):
        device = "cuda:0"
        dtype = torch.float32
        optim_cls = torch.optim.ASGD
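The snippet is cut off here; a self-contained sketch of the kind of repro chained_fn appears to set up (parameter shape, hyperparameters, and the step loop are my assumptions based on the imports above):

import torch
from torch.optim.lr_scheduler import ChainedScheduler, ConstantLR, LambdaLR

device, dtype = "cuda:0", torch.float32
param = torch.rand(2, 3, device=device, dtype=dtype, requires_grad=True)
opt = torch.optim.ASGD([param], lr=0.1)
sched = ChainedScheduler([
    ConstantLR(opt, factor=0.5, total_iters=2),
    LambdaLR(opt, lr_lambda=lambda epoch: 0.95 ** epoch),
])

@torch.compile()
def compiled_step():
    opt.step()

for _ in range(5):
    param.grad = torch.rand_like(param)
    compiled_step()
    sched.step()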
--> TORCH_LOGS="output_code" python optim_repro.py
[WARNING]:Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
[DEBUG]:Output code:
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
AlbertForMaskedLM
AlbertForQuestionAnswering
BartForCausalLM
BartForConditionalGeneration
BertForMaskedLM
BertForQuestionAnswering
BlenderbotSmallForCausalLM
BlenderbotSmallForConditionalGeneration
DebertaForMaskedLM
DebertaForQuestionAnswering
import torch
import torch._dynamo as torchdynamo
import torch._inductor
import time
import torch._inductor.config as config
from torch._dynamo.utils import cprofile_wrapper
from apex.optimizers import FusedAdam, FusedSGD
config.triton.cudagraphs = True
config.cpp_wrapper = False
def init_state_per_param(self, param, param_group):
    state = self.state[param]
    if len(state) == 0:
        # note(crcrpar): [special device hosting for step]
        # Deliberately host `step` on CPU if both capturable and fused are off.
        # This is because kernel launches are costly on CUDA and XLA.
        state['step'] = (
            torch.zeros((), dtype=_get_scalar_dtype(is_fused=param_group['fused']), device=param.device)
            if param_group['capturable'] or param_group['fused']
            else torch.tensor(0.0, dtype=_get_scalar_dtype())
        )
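A quick check of the behavior described in the note above (my own snippet, not part of the optimizer source): step is hosted on CPU on the default path and moves to the parameter's device once fused or capturable is set.

import torch

p = torch.nn.Parameter(torch.randn(4, device="cuda"))
p.grad = torch.randn_like(p)

opt = torch.optim.AdamW([p])              # capturable=False, fused=False
opt.step()
print(opt.state[p]["step"].device)        # cpu

opt_fused = torch.optim.AdamW([p], fused=True)
opt_fused.step()
print(opt_fused.state[p]["step"].device)  # cuda:0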
import torch
import torch._dynamo as torchdynamo
import torch._inductor
import time
import torch._inductor.config as config
from torch._dynamo.utils import cprofile_wrapper
from apex.optimizers import FusedAdam, FusedSGD
config.triton.cudagraphs = True
config.cpp_wrapper = False
from torch._dynamo.decorators import mark_static_address
@torch.compile(mode="reduce-overhead")
def foo(inp):
    return inp + 1
inp = torch.rand([20, 20], device="cuda")
mark_static_address(inp, guard=False)
foo(inp)
inp = torch.ones([20, 20], device="cuda")
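The file is cut off here; presumably the fresh tensor is passed back into foo. Since guard=False installed no guard on the first tensor's data_ptr, running under TORCH_LOGS="recompiles" is the way to check whether swapping in the new, unmarked input triggers a recompile.

foo(inp)  # presumed continuation; check TORCH_LOGS="recompiles" for guard failures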