--> TORCH_LOGS="output_code" python optim_repro.py
[WARNING]:Profiler function <class 'torch.autograd.profiler.record_function'> will be ignored
[DEBUG]:Output code:
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
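
The preview stops at the imports. For context, a minimal sketch of the kind of script that would produce the log above (an assumption; optim_repro.py itself is not shown):

import torch

@torch.compile()
def f(x):
    return x * 2 + 1

# Running any compiled function with TORCH_LOGS="output_code" set makes
# Inductor log the code it generated, as in the header above.
f(torch.randn(8, device="cuda"))

The model names that follow appear to come from a separate snippet listing HuggingFace benchmark models.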
AlbertForMaskedLM
AlbertForQuestionAnswering
BartForCausalLM
BartForConditionalGeneration
BertForMaskedLM
BertForQuestionAnswering
BlenderbotSmallForCausalLM
BlenderbotSmallForConditionalGeneration
DebertaForMaskedLM
DebertaForQuestionAnswering
import torch
import torch._dynamo as torchdynamo
import torch._inductor
import time
import torch._inductor.config as config
from torch._dynamo.utils import cprofile_wrapper
from apex.optimizers import FusedAdam, FusedSGD
config.triton.cudagraphs = True
config.cpp_wrapper = False
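
A minimal sketch of how a script with this header might continue (assumed; the preview stops at the config lines, and the model, forward, and loop below are illustrative): time a compiled forward/backward with an apex fused optimizer while cudagraphs is enabled in inductor.

model = torch.nn.Linear(1024, 1024, device="cuda")
opt = FusedAdam(model.parameters(), lr=1e-3)

@torch.compile()
def forward(x):
    return model(x).sum()

x = torch.randn(64, 1024, device="cuda")
for _ in range(3):  # warmup so inductor compiles and cudagraphs record
    forward(x).backward()
    opt.step()
    model.zero_grad(set_to_none=False)  # keep grad buffers at stable addresses

torch.cuda.synchronize()
t0 = time.time()
for _ in range(10):
    forward(x).backward()
    opt.step()
    model.zero_grad(set_to_none=False)
torch.cuda.synchronize()
print(f"10 iters: {time.time() - t0:.4f}s")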
def init_state_per_param(self, param, param_group):
    state = self.state[param]
    if len(state) == 0:
        # note(crcrpar): [special device hosting for step]
        # Deliberately host `step` on CPU if both capturable and fused are off.
        # This is because kernel launches are costly on CUDA and XLA.
        state['step'] = (
            torch.zeros((), dtype=_get_scalar_dtype(is_fused=param_group['fused']), device=param.device)
            if param_group['capturable'] or param_group['fused']
            else torch.tensor(0.0, dtype=_get_scalar_dtype())
        )
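
The effect of this branch is visible from the public API; a small check (a sketch, assuming a CUDA device is available):

import torch

p = torch.randn(4, device="cuda", requires_grad=True)
p.grad = torch.randn_like(p)

# fused=True: `step` is hosted on the parameter's device as a 0-dim tensor
opt = torch.optim.Adam([p], fused=True)
opt.step()
print(opt.state[p]['step'].device)  # cuda:0

# default (capturable=False, fused=False): `step` stays on CPU
opt2 = torch.optim.Adam([p])
opt2.step()
print(opt2.state[p]['step'].device)  # cpu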
import torch
from torch._dynamo.decorators import mark_static_address

@torch.compile(mode="reduce-overhead")
def foo(inp):
    return inp + 1

inp = torch.rand([20, 20], device="cuda")
mark_static_address(inp, guard=False)
foo(inp)

inp = torch.ones([20, 20], device="cuda")
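
For context on the snippet (the preview is truncated here): mark_static_address marks a tensor's storage address as static, so under cudagraphs ("reduce-overhead") the tensor can be treated as a static input rather than being copied into a staging buffer on every call. Passing guard=False skips installing a guard on that address; the final line, which rebinds inp to a fresh tensor, looks like setup for calling foo again down the unguarded path.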
mlazos / repro.py
Created February 22, 2024 03:31
log graph break bug
import torch
torch.set_default_device("cuda")
import logging
logger = logging.getLogger(__name__)

@torch.compile()
def fn(x):
    logger.warning("inside fn")  # assumed body: per the description, logging in the compiled region triggers the graph break (preview truncated)
    return x + 1
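
One way to surface the break when running this (an assumed invocation; the gist does not include one):

TORCH_LOGS="graph_breaks" python repro.py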
mlazos / output.txt
Created February 22, 2024 03:31
bytecode
[DEBUG]:MODIFIED BYTECODE precondition2 /opt/dlami/nvme/mlazos/test/a.py line 10
 10           0 LOAD_GLOBAL              8 (__compiled_fn_0)
              2 LOAD_FAST                1 (_masked_preconditioner_list)
              4 LOAD_CONST               8 (0)
              6 BINARY_SUBSCR
              8 LOAD_FAST                1 (_masked_preconditioner_list)
             10 LOAD_CONST               9 (1)
             12 BINARY_SUBSCR
             14 CALL_FUNCTION            2
             16 STORE_FAST               6 (graph_out_0)
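
For reference, output like this comes from Dynamo's bytecode logging; an invocation along these lines would reproduce it (assumed, since the gist contains only the output):

TORCH_LOGS="bytecode" python a.py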
mlazos / cmd.txt
Last active February 16, 2024 00:29
444c628e0675738648e53cd9a66ad6e126f45c90
python huggingface.py --training --float32 --performance --only MobileBertForQuestionAnswering --backend=inductor
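
(Reading the command: this drives the PyTorch dynamo HuggingFace benchmark harness in training mode at float32, measuring performance for only MobileBertForQuestionAnswering with the inductor backend. The hash above is presumably the PyTorch commit under test.)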