
@xmfan
xmfan / gist:a95d2144e7d27cc5df2ad64e2bf9ae82
Created November 8, 2023 19:07
python benchmarks/dynamo/torchbench.py --performance --cold-start-latency --training --amp --backend inductor --device cuda --print-memory --only=stable_diffusion_unet --ddp --multiprocess --batch_size=8 --disable-cudagraphs
Backend dynamo failed in warmup()
Traceback (most recent call last):
  File "/home/xmfan/core/pytorch/benchmarks/dynamo/common.py", line 2604, in warmup
    fn(model, example_inputs)
  File "/home/xmfan/.conda/envs/oss/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 410, in _fn
    return fn(*args, **kwargs)
  File "/home/xmfan/core/pytorch/benchmarks/dynamo/torchbench.py", line 512, in forward_and_backward_pass
    cloned_inputs = clone_inputs(inputs)
  File "/home/xmfan/core/pytorch/benchmarks/dynamo/torchbench.py", line 513, in resume_in_forward_and_backward_pass
    self.optimizer_zero_grad(mod)
===== Compiled autograd graph =====
<eval_with_key>.53 class CompiledAutograd(torch.nn.Module):
    def forward(self, inputs, sizes, hooks):
        # No stacktrace found for following nodes
        getitem: "f32[]" = inputs[0]
@xmfan
xmfan / gist:61575069c68a2d42597c539d71e19326
Created January 12, 2024 18:58
rank-based TORCH_LOGS sample
diff --git a/torch/_logging/_internal.py b/torch/_logging/_internal.py
index 7e4552f0f8e..ab75112bc3b 100644
--- a/torch/_logging/_internal.py
+++ b/torch/_logging/_internal.py
@@ -667,7 +667,16 @@ def _is_valid_module(qname):
 def _update_log_state_from_env():
     global log_state
     log_setting = os.environ.get(LOG_ENV_VAR, None)
-    if log_setting is not None:
+
import os
import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch._dynamo.utils import maybe_enable_compiled_autograd

# torchrun sets RANK and WORLD_SIZE for each worker process
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
dist.init_process_group("nccl", rank=rank, world_size=world_size)
torch.cuda.set_device(rank)
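The diff above is truncated before the rank-handling logic. As a rough illustration of the same idea done from user code rather than inside torch/_logging/_internal.py (the function name and defaults below are my own, not from the gist), verbose logging can be gated on the rank that torchrun exports:

import logging
import os

import torch._logging

def enable_dynamo_logs_on_rank(target_rank: int = 0) -> None:
    # torchrun sets RANK per worker; only the chosen rank gets verbose logs
    rank = int(os.environ.get("RANK", "0"))
    if rank == target_rank:
        # roughly what TORCH_LOGS="dynamo,graph_breaks" would enable
        torch._logging.set_logs(dynamo=logging.DEBUG, graph_breaks=True)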
@xmfan
xmfan / simple_cpu_codegen.py
Last active February 3, 2024 05:29
output code from TORCH_LOGS="output_code" python simple.py, with inputs baked in
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
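For context, the lines above are the standard prologue of Inductor's generated output_code.py. Besides TORCH_LOGS="output_code", the generated code can also be written to disk (a general PyTorch debug facility, not something this gist shows; the exact directory layout varies by version):

TORCH_COMPILE_DEBUG=1 python simple.py
# writes the generated code under torch_compile_debug/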
@xmfan
xmfan / simple.py
Last active February 3, 2024 05:28
import torch
from torch._dynamo.utils import maybe_enable_compiled_autograd
def fn():
    model = torch.nn.Sequential(
        torch.nn.Linear(2, 1, bias=False),
        torch.nn.Linear(1, 2, bias=False),
    )
    model[0].weight = torch.nn.Parameter(torch.tensor([[-0.0053, 0.3793]]))
    model[1].weight = torch.nn.Parameter(torch.tensor([[-0.8230], [-0.7359]]))
import torch
def compiler_fn(gm):
    return torch.compile(gm, mode="reduce-overhead", fullgraph=True, dynamic=True)

def fn():
    x = torch.randn(2, 2, device="cuda", requires_grad=True)
    y = torch.randn(2, 2, device="cuda")
    out = torch.mm(x, y)
    loss = out.sum() / out.numel()
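The snippet above is cut off before compiler_fn is used. A minimal sketch of how such a compiler_fn is typically hooked up to compiled autograd (my own continuation, not part of the gist; it assumes torch._dynamo.compiled_autograd.enable is available in this build):

import torch
import torch._dynamo.compiled_autograd

def compiler_fn(gm):
    return torch.compile(gm, mode="reduce-overhead", fullgraph=True, dynamic=True)

def train_step():
    x = torch.randn(2, 2, device="cuda", requires_grad=True)
    y = torch.randn(2, 2, device="cuda")
    loss = torch.mm(x, y).sum() / 4
    # the backward graph is captured by compiled autograd and compiled with compiler_fn
    with torch._dynamo.compiled_autograd.enable(compiler_fn):
        loss.backward()
    return x.grad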
import torch
def fn2():
    x = torch.randn(2, 2, device="cuda")
    y = 5
    return x / y

torch.compile(fn2, mode="reduce-overhead")()
@xmfan
xmfan / main.py
Created March 7, 2024 00:11
torch.compile(lumiere_pytorch)
import time as time_module
import torch
from lumiere_pytorch import MPLumiere
import logging
from denoising_diffusion_pytorch import KarrasUnet
karras_unet = KarrasUnet(
    image_size = 256,
    dim = 8,