Skip to content

Instantly share code, notes, and snippets.

@leslie-fang-intel
Created June 14, 2024 07:52
Show Gist options
  • Save leslie-fang-intel/00563f35f7edd91369073672e79b1961 to your computer and use it in GitHub Desktop.
Save leslie-fang-intel/00563f35f7edd91369073672e79b1961 to your computer and use it in GitHub Desktop.
issue_128513
This file has been truncated, but you can view the full file.
loading model: 0it [00:00, ?it/s]Input ids are automatically padded from 819 to 832 to be a multiple of `config.block_size`: 64
loading model: 0it [00:02, ?it/s]
cpu eval hf_BigBird
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1] TRACED GRAPH
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1] ===== FROZEN GRAPH =====
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1] def forward(self):
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1] return ()
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1]
V0614 00:48:29.163000 140616046391680 torch/_inductor/freezing.py:118] [0/0_1]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] TRACED GRAPH
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] ===== FROZEN GRAPH =====
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] def forward(self, arg0_1: "i64[1, 4096]"):
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] full_default: "f32[1, 819]" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] slice_2: "i64[1, 819]" = torch.ops.aten.slice.Tensor(arg0_1, 1, 0, 819); arg0_1 = None
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] expand: "i64[1, 819]" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1] return (full_default, expand)
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]
V0614 00:48:29.254000 140616046391680 torch/_inductor/freezing.py:118] [2/0_1]
I0614 00:48:29.255000 140616046391680 torch/_dynamo/logging.py:56] [2/0_1] Step 3: torchinductor compiling FORWARDS graph 1
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] ===== AFTER POST GRAD =====
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] def forward(self, arg0_1: "i64[1, 4096][4096, 1]cpu"):
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2039 in forward, code: attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] full_default: "f32[1, 819][819, 1]cpu" = torch.ops.aten.full.default([1, 819], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2042 in forward, code: buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] slice_2: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 1, 0, 819); arg0_1 = None
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2043 in forward, code: buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] expand: "i64[1, 819][4096, 1]cpu" = torch.ops.aten.expand.default(slice_2, [1, 819]); slice_2 = None
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs] return (full_default, expand)
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]
V0614 00:48:29.363000 140616046391680 torch/_inductor/compile_fx.py:748] [2/0_1] [__post_grad_graphs]
V0614 00:48:29.364000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
V0614 00:48:29.365000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering %full_default : [num_users=1] = call_function[target=torch.ops.aten.full.default](args = ([1, 819], 1), kwargs = {dtype: torch.float32, layout: torch.strided, device: cpu, pin_memory: False})
V0614 00:48:29.365000 140616046391680 torch/_inductor/graph.py:976] [2/0_1] via <function full at 0x7fe21799c550>
V0614 00:48:29.366000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg0_1, 1, 0, 819), kwargs = {})
V0614 00:48:29.366000 140616046391680 torch/_inductor/graph.py:976] [2/0_1] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:29.367000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering %expand : [num_users=1] = call_function[target=torch.ops.aten.expand.default](args = (%slice_2, [1, 819]), kwargs = {})
V0614 00:48:29.367000 140616046391680 torch/_inductor/graph.py:976] [2/0_1] via <function expand at 0x7fe2179c2cb0>
V0614 00:48:29.367000 140616046391680 torch/_inductor/graph.py:1173] [2/0_1] lowering return (full_default, expand)
V0614 00:48:29.367000 140616046391680 torch/_inductor/graph.py:1097] [2/0_1] Force channels last inputs for 0 conv for the current graph with id 1
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] scheduling ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.float32, size=[1, 819], stride=[819, 1]), data=Pointwise(
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] 'cpu',
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] torch.float32,
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] def inner_fn(index):
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] _, i1 = index
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] return tmp0
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] ,
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] ranges=[1, 819],
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] origin_node=full_default,
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] origins={full_default}
V0614 00:48:29.393000 140616046391680 torch/_inductor/scheduler.py:1601] [2/0_1] ))
V0614 00:48:29.394000 140616046391680 torch/_inductor/scheduler.py:1671] [2/0_1] scheduling output buf0
V0614 00:48:29.394000 140616046391680 torch/_inductor/scheduler.py:1671] [2/0_1] scheduling output arg0_1
V0614 00:48:29.395000 140616046391680 torch/_inductor/scheduler.py:2688] [2/0_1] Generating code for node buf0 with estimated runtime 0.000000
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] get_bounds:
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] graph():
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] %ops : [num_users=2] = placeholder[target=ops]
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] %constant : [num_users=1] = call_method[target=constant](args = (%ops, 1.0, torch.float32), kwargs = {})
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] %store : [num_users=1] = call_method[target=store](args = (%ops, buf0, %get_index, %constant, None), kwargs = {})
V0614 00:48:29.396000 140616046391680 torch/_inductor/bounds.py:63] [2/0_1] return store
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] Output code:
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] # AOT ID: ['1_inference']
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import torch
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import math
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import random
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import os
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] import tempfile
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from math import inf, nan
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch import device, empty_strided
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] aten = torch.ops.aten
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] async_compile = AsyncCompile()
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] cpp_fused_ones_0 = async_compile.cpp_pybinding(['float*'], '''
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] #include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] extern "C" void kernel(float* out_ptr0)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] {
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] {
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(816L); x0+=static_cast<long>(16L))
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] {
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] auto tmp0 = static_cast<float>(1.0);
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] auto tmp1 = at::vec::Vectorized<float>(tmp0);
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] tmp1.store(out_ptr0 + static_cast<long>(x0));
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] }
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] #pragma omp simd simdlen(8)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] for(long x0=static_cast<long>(816L); x0<static_cast<long>(819L); x0+=static_cast<long>(1L))
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] {
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] auto tmp0 = static_cast<float>(1.0);
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] out_ptr0[static_cast<long>(x0)] = tmp0;
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] }
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] }
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] }
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] ''')
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] async_compile.wait(globals())
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] del async_compile
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] def call(args):
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] arg0_1, = args
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] args.clear()
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] assert_size_stride(arg0_1, (1, 4096), (4096, 1))
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] buf0 = empty_strided_cpu((1, 819), (819, 1), torch.float32)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] cpp_fused_ones_0(buf0)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] return (buf0, reinterpret_tensor(arg0_1, (1, 819), (4096, 1), 0), )
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._dynamo.testing import rand_strided
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.utils import print_performance
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] arg0_1 = rand_strided((1, 4096), (4096, 1), device='cpu', dtype=torch.int64)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] fn = lambda: call([arg0_1])
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] if __name__ == "__main__":
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code] compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:31.426000 140616046391680 torch/_inductor/graph.py:1681] [2/0_1] [__output_code]
V0614 00:48:32.441000 140616046391680 torch/_inductor/graph.py:1714] [2/0_1] Output code written to: /tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py
I0614 00:48:32.441000 140616046391680 torch/_inductor/graph.py:1715] [2/0_1] [__output_code] Output code written to: /tmp/torchinductor_leslie/ko/ckovbmwilyxv4sl5d74oe6jsmj25rub5gzv5kco7u66lavi64ivy.py
V0614 00:48:32.443000 140616046391680 torch/_inductor/compile_fx.py:531] [2/0_1] FX codegen and compilation took 3.188s
I0614 00:48:32.443000 140616046391680 torch/_dynamo/logging.py:56] [2/0_1] Step 3: torchinductor done compiling FORWARDS graph 1
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] TRACED GRAPH
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] ===== FROZEN GRAPH =====
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] def forward(self, arg0_1: "i64[1, 819]", arg1_1: "f32[1, 819]", arg2_1: "i64[1, 819]"):
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] constant_pad_nd: "i64[1, 832]" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] constant_pad_nd_1: "f32[1, 832]" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] constant_pad_nd_2: "i64[1, 832]" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0] return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]
V0614 00:48:32.507000 140616046391680 torch/_inductor/freezing.py:118] [4/0]
I0614 00:48:32.508000 140616046391680 torch/_dynamo/logging.py:56] [4/0] Step 3: torchinductor compiling FORWARDS graph 2
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] ===== AFTER POST GRAD =====
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] def forward(self, arg0_1: "i64[1, 819][819, 1]cpu", arg1_1: "f32[1, 819][819, 1]cpu", arg2_1: "i64[1, 819][4096, 1]cpu"):
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] constant_pad_nd: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg0_1, [0, 13], 0.0); arg0_1 = None
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] constant_pad_nd_1: "f32[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg1_1, [0, 13], 0.0); arg1_1 = None
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] # File: /localdisk/leslie/torch_inductor_community/pytorch/torch/nn/functional.py:4552 in pad, code: return torch._C._nn.pad(input, pad, mode, value)
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] constant_pad_nd_2: "i64[1, 832][832, 1]cpu" = torch.ops.aten.constant_pad_nd.default(arg2_1, [0, 13], 0.0); arg2_1 = None
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs] return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]
V0614 00:48:32.510000 140616046391680 torch/_inductor/compile_fx.py:748] [4/0] [__post_grad_graphs]
V0614 00:48:32.511000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
V0614 00:48:32.511000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
V0614 00:48:32.512000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
V0614 00:48:32.512000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %constant_pad_nd : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%arg0_1, [0, 13], 0.0), kwargs = {})
V0614 00:48:32.512000 140616046391680 torch/_inductor/graph.py:976] [4/0] via <function constant_pad_nd at 0x7fe21799e7a0>
V0614 00:48:32.513000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %constant_pad_nd_1 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%arg1_1, [0, 13], 0.0), kwargs = {})
V0614 00:48:32.513000 140616046391680 torch/_inductor/graph.py:976] [4/0] via <function constant_pad_nd at 0x7fe21799e7a0>
V0614 00:48:32.514000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering %constant_pad_nd_2 : [num_users=1] = call_function[target=torch.ops.aten.constant_pad_nd.default](args = (%arg2_1, [0, 13], 0.0), kwargs = {})
V0614 00:48:32.514000 140616046391680 torch/_inductor/graph.py:976] [4/0] via <function constant_pad_nd at 0x7fe21799e7a0>
V0614 00:48:32.514000 140616046391680 torch/_inductor/graph.py:1173] [4/0] lowering return (constant_pad_nd, constant_pad_nd_1, constant_pad_nd_2)
V0614 00:48:32.514000 140616046391680 torch/_inductor/graph.py:1097] [4/0] Force channels last inputs for 0 conv for the current graph with id 2
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] scheduling ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.int64, size=[1, 832], stride=[832, 1]), data=Pointwise(
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] 'cpu',
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] torch.int64,
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] def inner_fn(index):
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] _, i1 = index
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp0 = ops.index_expr(i1, torch.int64)
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp1 = ops.index_expr(819, torch.int64)
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp2 = tmp0 < tmp1
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp3 = ops.load(arg0_1, i1)
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp4 = ops.masked(tmp2, tmp3, 0)
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] return tmp4
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ,
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ranges=[1, 832],
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] origin_node=constant_pad_nd,
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] origins={constant_pad_nd}
V0614 00:48:32.526000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ))
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] scheduling ComputedBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[1, 832], stride=[832, 1]), data=Pointwise(
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] 'cpu',
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] torch.float32,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] def inner_fn(index):
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] _, i1 = index
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp0 = ops.index_expr(i1, torch.int64)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp1 = ops.index_expr(819, torch.int64)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp2 = tmp0 < tmp1
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp3 = ops.load(arg1_1, i1)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp4 = ops.masked(tmp2, tmp3, 0.0)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] return tmp4
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ranges=[1, 832],
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] origin_node=constant_pad_nd_1,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] origins={constant_pad_nd_1}
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ))
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] scheduling ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.int64, size=[1, 832], stride=[832, 1]), data=Pointwise(
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] 'cpu',
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] torch.int64,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] def inner_fn(index):
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] _, i1 = index
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp0 = ops.index_expr(i1, torch.int64)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp1 = ops.index_expr(819, torch.int64)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp2 = tmp0 < tmp1
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp3 = ops.load(arg2_1, i1)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] tmp4 = ops.masked(tmp2, tmp3, 0)
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] return tmp4
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ranges=[1, 832],
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] origin_node=constant_pad_nd_2,
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] origins={constant_pad_nd_2}
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1601] [4/0] ))
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1671] [4/0] scheduling output buf0
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1671] [4/0] scheduling output buf1
V0614 00:48:32.527000 140616046391680 torch/_inductor/scheduler.py:1671] [4/0] scheduling output buf2
V0614 00:48:32.528000 140616046391680 torch/_inductor/scheduler.py:2688] [4/0] Generating code for node buf0 with estimated runtime 0.000000
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] get_bounds:
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] graph():
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %ops : [num_users=4] = placeholder[target=ops]
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %index_expr : [num_users=1] = call_method[target=index_expr](args = (%ops, %get_index, torch.int64), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %constant : [num_users=1] = call_method[target=constant](args = (%ops, 819, torch.int64), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %lt : [num_users=1] = call_method[target=lt](args = (%ops, %index_expr, %constant), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %masked_subblock1 : [num_users=1] = call_module[target=masked_subblock1](args = (%lt, 0), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %store : [num_users=1] = call_method[target=store](args = (%ops, buf0, %get_index_1, %masked_subblock1, None), kwargs = {})
V0614 00:48:32.529000 140616046391680 torch/_inductor/bounds.py:63] [4/0] return store
V0614 00:48:32.533000 140616046391680 torch/_inductor/scheduler.py:2688] [4/0] Generating code for node buf1 with estimated runtime 0.000000
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] get_bounds:
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] graph():
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %ops : [num_users=4] = placeholder[target=ops]
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %index_expr : [num_users=1] = call_method[target=index_expr](args = (%ops, %get_index, torch.int64), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %constant : [num_users=1] = call_method[target=constant](args = (%ops, 819, torch.int64), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %lt : [num_users=1] = call_method[target=lt](args = (%ops, %index_expr, %constant), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %masked_subblock1 : [num_users=1] = call_module[target=masked_subblock1](args = (%lt, 0.0), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %store : [num_users=1] = call_method[target=store](args = (%ops, buf1, %get_index_1, %masked_subblock1, None), kwargs = {})
V0614 00:48:32.534000 140616046391680 torch/_inductor/bounds.py:63] [4/0] return store
V0614 00:48:32.537000 140616046391680 torch/_inductor/scheduler.py:2688] [4/0] Generating code for node buf2 with estimated runtime 0.000000
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] get_bounds:
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] graph():
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %ops : [num_users=4] = placeholder[target=ops]
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %index_expr : [num_users=1] = call_method[target=index_expr](args = (%ops, %get_index, torch.int64), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %constant : [num_users=1] = call_method[target=constant](args = (%ops, 819, torch.int64), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %lt : [num_users=1] = call_method[target=lt](args = (%ops, %index_expr, %constant), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %masked_subblock1 : [num_users=1] = call_module[target=masked_subblock1](args = (%lt, 0), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] %store : [num_users=1] = call_method[target=store](args = (%ops, buf2, %get_index_1, %masked_subblock1, None), kwargs = {})
V0614 00:48:32.538000 140616046391680 torch/_inductor/bounds.py:63] [4/0] return store
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] Output code:
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] # AOT ID: ['2_inference']
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import torch
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import math
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import random
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import os
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] import tempfile
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from math import inf, nan
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch import device, empty_strided
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] aten = torch.ops.aten
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] async_compile = AsyncCompile()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] cpp_fused_constant_pad_nd_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'int64_t*', 'float*', 'int64_t*'], '''
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] #include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] extern "C" void kernel(const int64_t* in_ptr0,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] const float* in_ptr1,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] const int64_t* in_ptr2,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] int64_t* out_ptr0,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] float* out_ptr1,
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] int64_t* out_ptr2)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp0 = x0;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp1 = c10::convert<int32_t>(tmp0);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp3 = static_cast<int32_t>(819);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp6 = [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr0 + static_cast<long>(x0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return tmp7;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp10 =
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] if (tmp5.all_zero())
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] else
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp8 = tmp6();
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] tmp10.store(out_ptr0 + static_cast<long>(x0), 16);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp0 = x0;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp1 = c10::convert<int32_t>(tmp0);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp3 = static_cast<int32_t>(819);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp6 = [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp7 = tmp5.template cast<float,1>().template loadu<float,1>(in_ptr1 + static_cast<long>(x0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return tmp7;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp10 =
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] if (tmp5.all_zero())
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return at::vec::Vectorized<float>(static_cast<float>(0.0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] else
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp8 = tmp6();
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp9 = at::vec::Vectorized<float>(static_cast<float>(0.0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<float,1>());
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] tmp10.store(out_ptr1 + static_cast<long>(x0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(16L))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp0 = x0;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp1 = c10::convert<int32_t>(tmp0);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp2 = at::vec::Vectorized<int32_t>::arange(tmp1, 1);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp3 = static_cast<int32_t>(819);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp4 = at::vec::Vectorized<int32_t>(tmp3);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp5 = at::vec::VecMask<int32_t,1>(tmp2 < tmp4);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp6 = [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp7 = tmp5.template cast<float,1>().template loadu<int64_t,2>(in_ptr2 + static_cast<long>(x0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return tmp7;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp10 =
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] [&]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] if (tmp5.all_zero())
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] else
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] {
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp8 = tmp6();
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] auto tmp9 = at::vec::VectorizedN<int64_t,2>(static_cast<int64_t>(0));
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return decltype(tmp8)::blendv(tmp9, tmp8, tmp5.template cast<int64_t,2>());
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ;
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] tmp10.store(out_ptr2 + static_cast<long>(x0), 16);
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] }
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] ''')
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] async_compile.wait(globals())
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] del async_compile
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] def call(args):
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] arg0_1, arg1_1, arg2_1 = args
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] args.clear()
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] assert_size_stride(arg0_1, (1, 819), (819, 1))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] assert_size_stride(arg1_1, (1, 819), (819, 1))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] assert_size_stride(arg2_1, (1, 819), (4096, 1))
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] buf0 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] buf1 = empty_strided_cpu((1, 832), (832, 1), torch.float32)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] buf2 = empty_strided_cpu((1, 832), (832, 1), torch.int64)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] cpp_fused_constant_pad_nd_0(arg0_1, arg1_1, arg2_1, buf0, buf1, buf2)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] del arg0_1
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] del arg1_1
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] del arg2_1
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return (buf0, buf1, buf2, )
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._dynamo.testing import rand_strided
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.utils import print_performance
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] arg0_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.int64)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] arg1_1 = rand_strided((1, 819), (819, 1), device='cpu', dtype=torch.float32)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] arg2_1 = rand_strided((1, 819), (4096, 1), device='cpu', dtype=torch.int64)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] fn = lambda: call([arg0_1, arg1_1, arg2_1])
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] if __name__ == "__main__":
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code] compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:32.543000 140616046391680 torch/_inductor/graph.py:1681] [4/0] [__output_code]
V0614 00:48:33.603000 140616046391680 torch/_inductor/graph.py:1714] [4/0] Output code written to: /tmp/torchinductor_leslie/fj/cfjbx53t7urpjz2ng6piwvoj4vsjiqa2xrgjcrh4vovq4w45eywv.py
I0614 00:48:33.603000 140616046391680 torch/_inductor/graph.py:1715] [4/0] [__output_code] Output code written to: /tmp/torchinductor_leslie/fj/cfjbx53t7urpjz2ng6piwvoj4vsjiqa2xrgjcrh4vovq4w45eywv.py
V0614 00:48:33.604000 140616046391680 torch/_inductor/compile_fx.py:531] [4/0] FX codegen and compilation took 1.097s
I0614 00:48:33.604000 140616046391680 torch/_dynamo/logging.py:56] [4/0] Step 3: torchinductor done compiling FORWARDS graph 2
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] TRACED GRAPH
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] ===== FROZEN GRAPH =====
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] def forward(self, arg6_1: "i64[1, 832]", arg7_1: "f32[1, 832]", arg8_1: "i64[1, 832]"):
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # No stacktrace found for following nodes
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] _frozen_param0: "f32[50358, 768]" = self._frozen_param0
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] _frozen_param1: "f32[2, 768]" = self._frozen_param1
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] _frozen_param3: "f32[768]" = self._frozen_param3
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] _frozen_param4: "f32[768]" = self._frozen_param4
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] view: "f32[1, 13, 64]" = torch.ops.aten.reshape.default(arg7_1, [1, 13, 64])
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] slice_2: "f32[1, 9, 64]" = torch.ops.aten.slice.Tensor(view, 1, 1, -3)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] slice_4: "f32[1, 9, 64]" = torch.ops.aten.slice.Tensor(view, 1, 2, -2)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] slice_6: "f32[1, 9, 64]" = torch.ops.aten.slice.Tensor(view, 1, 3, -1)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] cat: "f32[1, 9, 192]" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2); slice_2 = slice_6 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] unsqueeze: "f32[1, 9, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_4, 3); slice_4 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] permute: "f32[1, 9, 64, 1]" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] unsqueeze_1: "f32[1, 9, 192, 1]" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] permute_1: "f32[1, 9, 1, 192]" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] mul: "f32[1, 9, 64, 192]" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] unsqueeze_2: "f32[1, 1, 9, 64, 192]" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] view_1: "f32[1, 1, 832, 1]" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 832, 1])
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] view_2: "f32[1, 1, 1, 832]" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] embedding: "f32[1, 832, 768]" = torch.ops.aten.embedding.default(_frozen_param0, arg6_1, 0); _frozen_param0 = arg6_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] embedding_1: "f32[1, 832, 768]" = torch.ops.aten.embedding.default(_frozen_param1, arg8_1); _frozen_param1 = arg8_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] add: "f32[1, 832, 768]" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] _frozen_param6: "f32[1, 832, 768]" = self._frozen_param6
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] add_1: "f32[1, 832, 768]" = torch.ops.aten.add.Tensor(add, _frozen_param6); add = _frozen_param6 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] var_mean = torch.ops.aten.var_mean.correction(add_1, [2], correction = 0, keepdim = True)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] getitem: "f32[1, 832, 1]" = var_mean[0]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] getitem_1: "f32[1, 832, 1]" = var_mean[1]; var_mean = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] add_2: "f32[1, 832, 1]" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] rsqrt: "f32[1, 832, 1]" = torch.ops.aten.rsqrt.default(add_2); add_2 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] sub: "f32[1, 832, 768]" = torch.ops.aten.sub.Tensor(add_1, getitem_1); add_1 = getitem_1 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] mul_1: "f32[1, 832, 768]" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] mul_2: "f32[1, 832, 768]" = torch.ops.aten.mul.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] add_3: "f32[1, 832, 768]" = torch.ops.aten.add.Tensor(mul_2, _frozen_param4); mul_2 = _frozen_param4 = None
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1] return (add_3, unsqueeze_2, view_1, view_2, view)
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
V0614 00:48:33.836000 140616046391680 torch/_inductor/freezing.py:118] [5/0_1]
I0614 00:48:33.839000 140616046391680 torch/_dynamo/logging.py:56] [5/0_1] Step 3: torchinductor compiling FORWARDS graph 3
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] ===== AFTER POST GRAD =====
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] def forward(self, arg6_1: "i64[1, 832][832, 1]cpu", arg7_1: "f32[1, 832][832, 1]cpu", arg8_1: "i64[1, 832][832, 1]cpu"):
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # No stacktrace found for following nodes
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] _frozen_param0: "f32[50358, 768][768, 1]cpu" = self._frozen_param0
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] _frozen_param1: "f32[2, 768][768, 1]cpu" = self._frozen_param1
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] _frozen_param3: "f32[768][1]cpu" = self._frozen_param3
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] _frozen_param4: "f32[768][1]cpu" = self._frozen_param4
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:305 in forward, code: position_embeddings = self.position_embeddings(position_ids)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] _frozen_param6: "f32[1, 832, 768][638976, 768, 1]cpu" = self._frozen_param6
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:296 in forward, code: inputs_embeds = self.word_embeddings(input_ids)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] embedding: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param0, arg6_1, 0); _frozen_param0 = arg6_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:301 in forward, code: token_type_embeddings = self.token_type_embeddings(token_type_ids)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] embedding_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.embedding.default(_frozen_param1, arg8_1); _frozen_param1 = arg8_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:303 in forward, code: embeddings = inputs_embeds + token_type_embeddings
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] add: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(embedding, embedding_1); embedding = embedding_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:306 in forward, code: embeddings += position_embeddings
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] add_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(add, _frozen_param6); add = _frozen_param6 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:309 in forward, code: embeddings = self.LayerNorm(embeddings)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] var_mean = torch.ops.aten.var_mean.correction(add_1, [2], correction = 0, keepdim = True)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] getitem: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[0]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] getitem_1: "f32[1, 832, 1][832, 1, 1]cpu" = var_mean[1]; var_mean = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] sub: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.sub.Tensor(add_1, getitem_1); add_1 = getitem_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] add_2: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.add.Tensor(getitem, 1e-12); getitem = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] rsqrt: "f32[1, 832, 1][832, 1, 1]cpu" = torch.ops.aten.rsqrt.default(add_2); add_2 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] mul_1: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(sub, rsqrt); sub = rsqrt = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] mul_2: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.mul.Tensor(mul_1, _frozen_param3); mul_1 = _frozen_param3 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] add_3: "f32[1, 832, 768][638976, 768, 1]cpu" = torch.ops.aten.add.Tensor(mul_2, _frozen_param4); mul_2 = _frozen_param4 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2200 in create_masks_for_block_sparse_attn, code: blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] view: "f32[1, 13, 64][832, 64, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 13, 64])
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] slice_4: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 2, -2)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] unsqueeze: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_4, 3)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] permute: "f32[1, 9, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze, [0, 1, 2, 3]); unsqueeze = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2194 in create_band_mask_from_inputs, code: [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] slice_2: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 1, -3)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] slice_6: "f32[1, 9, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view, 1, 3, -1)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2193 in create_band_mask_from_inputs, code: exp_blocked_to_pad = torch.cat(
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] cat: "f32[1, 9, 192][1728, 192, 1]cpu" = torch.ops.aten.cat.default([slice_2, slice_4, slice_6], 2); slice_2 = slice_4 = slice_6 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2196 in create_band_mask_from_inputs, code: band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] unsqueeze_1: "f32[1, 9, 192, 1][1728, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(cat, 3); cat = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] permute_1: "f32[1, 9, 1, 192][1728, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_1, [0, 1, 3, 2]); unsqueeze_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] mul: "f32[1, 9, 64, 192][110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2197 in create_band_mask_from_inputs, code: band_mask.unsqueeze_(1)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] unsqueeze_2: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.unsqueeze.default(mul, 1); mul = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2203 in create_masks_for_block_sparse_attn, code: from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] view_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 832, 1])
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:2204 in create_masks_for_block_sparse_attn, code: to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] view_2: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.reshape.default(arg7_1, [1, 1, 1, 832]); arg7_1 = None
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs] return (add_3, unsqueeze_2, view_1, view_2, view)
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.856000 140616046391680 torch/_inductor/compile_fx.py:748] [5/0_1] [__post_grad_graphs]
V0614 00:48:33.858000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param0 : [num_users=1] = get_attr[target=_frozen_param0]
V0614 00:48:33.859000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param1 : [num_users=1] = get_attr[target=_frozen_param1]
V0614 00:48:33.860000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param3 : [num_users=1] = get_attr[target=_frozen_param3]
V0614 00:48:33.860000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param4 : [num_users=1] = get_attr[target=_frozen_param4]
V0614 00:48:33.860000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %arg6_1 : [num_users=1] = placeholder[target=arg6_1]
V0614 00:48:33.861000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %arg7_1 : [num_users=3] = placeholder[target=arg7_1]
V0614 00:48:33.861000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %arg8_1 : [num_users=1] = placeholder[target=arg8_1]
V0614 00:48:33.861000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %_frozen_param6 : [num_users=1] = get_attr[target=_frozen_param6]
V0614 00:48:33.862000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %embedding : [num_users=1] = call_function[target=torch.ops.aten.embedding.default](args = (%_frozen_param0, %arg6_1, 0), kwargs = {})
V0614 00:48:33.862000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function embedding at 0x7fe21799c790>
V0614 00:48:33.863000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %embedding_1 : [num_users=1] = call_function[target=torch.ops.aten.embedding.default](args = (%_frozen_param1, %arg8_1), kwargs = {})
V0614 00:48:33.863000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function embedding at 0x7fe21799c790>
V0614 00:48:33.864000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%embedding, %embedding_1), kwargs = {})
V0614 00:48:33.864000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:33.866000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %add_1 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%add, %_frozen_param6), kwargs = {})
V0614 00:48:33.866000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:33.868000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %var_mean : [num_users=2] = call_function[target=torch.ops.aten.var_mean.correction](args = (%add_1, [2]), kwargs = {correction: 0, keepdim: True})
V0614 00:48:33.868000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function var_mean at 0x7fe216564820>
V0614 00:48:33.870000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%var_mean, 0), kwargs = {})
V0614 00:48:33.870000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%var_mean, 1), kwargs = {})
V0614 00:48:33.870000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_1, %getitem_1), kwargs = {})
V0614 00:48:33.870000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:33.873000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %add_2 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%getitem, 1e-12), kwargs = {})
V0614 00:48:33.873000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:33.874000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %rsqrt : [num_users=1] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_2,), kwargs = {})
V0614 00:48:33.874000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function rsqrt at 0x7fe216565900>
V0614 00:48:33.875000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, %rsqrt), kwargs = {})
V0614 00:48:33.875000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function mul at 0x7fe2165653f0>
V0614 00:48:33.877000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_1, %_frozen_param3), kwargs = {})
V0614 00:48:33.877000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function mul at 0x7fe2165653f0>
V0614 00:48:33.879000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %add_3 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_2, %_frozen_param4), kwargs = {})
V0614 00:48:33.879000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:33.881000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %view : [num_users=4] = call_function[target=torch.ops.aten.reshape.default](args = (%arg7_1, [1, 13, 64]), kwargs = {})
V0614 00:48:33.881000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function view at 0x7fe2179c3130>
V0614 00:48:33.882000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %slice_4 : [num_users=2] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view, 1, 2, -2), kwargs = {})
V0614 00:48:33.882000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:33.883000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %unsqueeze : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_4, 3), kwargs = {})
V0614 00:48:33.883000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:33.884000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze, [0, 1, 2, 3]), kwargs = {})
V0614 00:48:33.884000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function permute at 0x7fe2179c3370>
V0614 00:48:33.884000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view, 1, 1, -3), kwargs = {})
V0614 00:48:33.885000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:33.886000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %slice_6 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view, 1, 3, -1), kwargs = {})
V0614 00:48:33.886000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:33.887000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_2, %slice_4, %slice_6], 2), kwargs = {})
V0614 00:48:33.887000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function cat at 0x7fe2179c40d0>
V0614 00:48:33.888000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %unsqueeze_1 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%cat, 3), kwargs = {})
V0614 00:48:33.888000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:33.889000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_1, [0, 1, 3, 2]), kwargs = {})
V0614 00:48:33.889000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function permute at 0x7fe2179c3370>
V0614 00:48:33.889000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%permute, %permute_1), kwargs = {})
V0614 00:48:33.889000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function mul at 0x7fe2165653f0>
V0614 00:48:33.892000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %unsqueeze_2 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%mul, 1), kwargs = {})
V0614 00:48:33.892000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:33.894000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %view_1 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg7_1, [1, 1, 832, 1]), kwargs = {})
V0614 00:48:33.894000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function view at 0x7fe2179c3130>
V0614 00:48:33.895000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering %view_2 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg7_1, [1, 1, 1, 832]), kwargs = {})
V0614 00:48:33.895000 140616046391680 torch/_inductor/graph.py:976] [5/0_1] via <function view at 0x7fe2179c3130>
V0614 00:48:33.895000 140616046391680 torch/_inductor/graph.py:1173] [5/0_1] lowering return (add_3, unsqueeze_2, view_1, view_2, view)
V0614 00:48:33.898000 140616046391680 torch/_inductor/graph.py:1097] [5/0_1] Force channels last inputs for 0 conv for the current graph with id 3
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2 = index
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(arg6_1, i1)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp1 = ops.load(_frozen_param0, i2 + 768 * tmp0)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp2 = ops.load(arg8_1, i1)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp3 = ops.load(_frozen_param1, i2 + 768 * tmp2)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp4 = tmp1 + tmp3
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp5 = ops.load(_frozen_param6, i2 + 768 * i1)
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp6 = tmp4 + tmp5
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp6
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 832, 768],
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=add_1,
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={embedding, add, embedding_1, add_1}
V0614 00:48:33.953000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 1], stride=[832, 1, 832]), data=WelfordReduction(
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index, rindex):
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, _ = index
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] r0 = rindex
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(buf0, r0 + 768 * i1)
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 832, 1],
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] reduction_ranges=[768],
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] reduction_type=welford_reduce,
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=getitem_1,
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={var_mean}
V0614 00:48:33.954000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 1], stride=[832, 1, 832]), data=WelfordReduction(
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index, rindex):
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, _ = index
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] r0 = rindex
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(buf0, r0 + 768 * i1)
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 832, 1],
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] reduction_ranges=[768],
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] reduction_type=welford_reduce,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=None,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={var_mean}
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf3', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 1], stride=[832, 1, 832]), data=WelfordReduction(
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index, rindex):
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, _ = index
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] r0 = rindex
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(buf0, r0 + 768 * i1)
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 832, 1],
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] reduction_ranges=[768],
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] reduction_type=welford_reduce,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=None,
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={var_mean}
V0614 00:48:33.955000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf4', layout=FixedLayout('cpu', torch.float32, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2 = index
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(buf0, i2 + 768 * i1)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp1 = ops.load(buf1, i1)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp2 = tmp0 - tmp1
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp3 = ops.load(buf2, i1)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp4 = ops.constant(0, torch.float32)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp5 = ops.constant(768, torch.float32)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp6 = ops.constant(0, torch.float32)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp7 = tmp5 - tmp4
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp8 = ops.maximum(tmp6, tmp7)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp9 = tmp3 / tmp8
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp10 = ops.constant(1e-12, torch.float32)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp11 = tmp9 + tmp10
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp12 = ops.rsqrt(tmp11)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp13 = tmp2 * tmp12
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp14 = ops.load(_frozen_param3, i2)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp15 = tmp13 * tmp14
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp16 = ops.load(_frozen_param4, i2)
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp17 = tmp15 + tmp16
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp17
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 832, 768],
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=add_3,
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={mul_1, mul_2, rsqrt, add_2, var_mean, sub, add_3}
V0614 00:48:33.956000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf5', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2 = index
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(arg7_1, 64 + i2 + 64 * i1)
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 9, 64],
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=None,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={cat}
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf6', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2 = index
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(arg7_1, 128 + i2 + 64 * i1)
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 9, 64],
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=None,
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={cat}
V0614 00:48:33.957000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf7', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2 = index
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(arg7_1, 192 + i2 + 64 * i1)
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 9, 64],
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=None,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={cat}
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ConcatKernel(name='buf8', layout=FixedLayout('cpu', torch.float32, size=[1, 9, 192], stride=[1728, 192, 1]), inputs=[ComputedBuffer(name='buf5', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2 = index
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(arg7_1, 64 + i2 + 64 * i1)
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 9, 64],
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=None,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={cat}
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] )), ComputedBuffer(name='buf6', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2 = index
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(arg7_1, 128 + i2 + 64 * i1)
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 9, 64],
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=None,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={cat}
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] )), ComputedBuffer(name='buf7', layout=NonOwningLayout('cpu', torch.float32, size=[1, 9, 64], stride=[1728, 192, 1]), data=Pointwise(
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2 = index
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(arg7_1, 192 + i2 + 64 * i1)
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp0
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 9, 64],
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=None,
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={cat}
V0614 00:48:33.958000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))])
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] scheduling ComputedBuffer(name='buf9', layout=FixedLayout('cpu', torch.float32, size=[1, 9, 64, 192], stride=[110592, 12288, 192, 1]), data=Pointwise(
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] 'cpu',
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] torch.float32,
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] def inner_fn(index):
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] _, i1, i2, i3 = index
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp0 = ops.load(arg7_1, 128 + i2 + 64 * i1)
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp1 = ops.load(buf8, i3 + 192 * i1)
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] tmp2 = tmp0 * tmp1
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] return tmp2
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ,
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ranges=[1, 9, 64, 192],
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origin_node=mul,
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] origins={mul}
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1601] [5/0_1] ))
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output buf4
V0614 00:48:33.959000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output buf9
V0614 00:48:33.960000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output arg7_1
V0614 00:48:33.960000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output arg7_1
V0614 00:48:33.960000 140616046391680 torch/_inductor/scheduler.py:1671] [5/0_1] scheduling output arg7_1
V0614 00:48:33.960000 140616046391680 torch/_inductor/scheduler.py:1764] [5/0_1] removed dead node: buf3
V0614 00:48:33.962000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf0_buf1_buf2_buf4 with estimated runtime 0.000000
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %ops : [num_users=8] = placeholder[target=ops]
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, arg6_1, %get_index), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %set_indirect0 : [num_users=0] = call_module[target=set_indirect0](args = (%load,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_1 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param0, %get_index_1), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_2 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_2 : [num_users=1] = call_method[target=load](args = (%ops, arg8_1, %get_index_2), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %set_indirect1 : [num_users=0] = call_module[target=set_indirect1](args = (%load_2,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_3 : [num_users=1] = call_module[target=get_index](args = (index2,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_3 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param1, %get_index_3), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %add : [num_users=1] = call_method[target=add](args = (%ops, %load_1, %load_3), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_4 : [num_users=1] = call_module[target=get_index](args = (index3,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_4 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param6, %get_index_4), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %add_1 : [num_users=1] = call_method[target=add](args = (%ops, %add, %load_4), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_5 : [num_users=1] = call_module[target=get_index](args = (index3,), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %store : [num_users=1] = call_method[target=store](args = (%ops, buf0, %get_index_5, %add_1, None), kwargs = {})
V0614 00:48:33.964000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] return store
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %ops : [num_users=3] = placeholder[target=ops]
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, buf0, %get_index), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %reduction : [num_users=3] = call_method[target=reduction](args = (%ops, torch.float32, torch.float32, welford_reduce, %load), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%reduction, 0), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %getitem_1 : [num_users=0] = call_function[target=operator.getitem](args = (%reduction, 1), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %getitem_2 : [num_users=0] = call_function[target=operator.getitem](args = (%reduction, 2), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %store_reduction : [num_users=1] = call_method[target=store_reduction](args = (%ops, buf1, %get_index_1, %getitem), kwargs = {})
V0614 00:48:33.970000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] return store_reduction
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %ops : [num_users=3] = placeholder[target=ops]
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, buf0, %get_index), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %reduction : [num_users=3] = call_method[target=reduction](args = (%ops, torch.float32, torch.float32, welford_reduce, %load), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %getitem : [num_users=0] = call_function[target=operator.getitem](args = (%reduction, 0), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %getitem_1 : [num_users=1] = call_function[target=operator.getitem](args = (%reduction, 1), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %getitem_2 : [num_users=0] = call_function[target=operator.getitem](args = (%reduction, 2), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %store_reduction : [num_users=1] = call_method[target=store_reduction](args = (%ops, buf2, %get_index_1, %getitem_1), kwargs = {})
V0614 00:48:33.971000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] return store_reduction
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %ops : [num_users=15] = placeholder[target=ops]
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, buf0, %get_index), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_1 : [num_users=1] = call_method[target=load](args = (%ops, buf1, %get_index_1), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %sub : [num_users=1] = call_method[target=sub](args = (%ops, %load, %load_1), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_2 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_2 : [num_users=1] = call_method[target=load](args = (%ops, buf2, %get_index_2), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %constant : [num_users=1] = call_method[target=constant](args = (%ops, 768.0, torch.float32), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %truediv : [num_users=1] = call_method[target=truediv](args = (%ops, %load_2, %constant), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %constant_1 : [num_users=1] = call_method[target=constant](args = (%ops, 1e-12, torch.float32), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %add : [num_users=1] = call_method[target=add](args = (%ops, %truediv, %constant_1), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %rsqrt : [num_users=1] = call_method[target=rsqrt](args = (%ops, %add), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %mul : [num_users=1] = call_method[target=mul](args = (%ops, %sub, %rsqrt), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_3 : [num_users=1] = call_module[target=get_index](args = (index2,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_3 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param3, %get_index_3), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %mul_1 : [num_users=1] = call_method[target=mul](args = (%ops, %mul, %load_3), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_4 : [num_users=1] = call_module[target=get_index](args = (index2,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_4 : [num_users=1] = call_method[target=load](args = (%ops, _frozen_param4, %get_index_4), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %add_1 : [num_users=1] = call_method[target=add](args = (%ops, %mul_1, %load_4), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_5 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %store : [num_users=1] = call_method[target=store](args = (%ops, buf4, %get_index_5, %add_1, None), kwargs = {})
V0614 00:48:34.107000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] return store
V0614 00:48:34.118000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf5 with estimated runtime 0.000000
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %ops : [num_users=2] = placeholder[target=ops]
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, arg7_1, %get_index), kwargs = {})
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %store : [num_users=1] = call_method[target=store](args = (%ops, buf5, %get_index_1, %load, None), kwargs = {})
V0614 00:48:34.118000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] return store
V0614 00:48:34.147000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf6 with estimated runtime 0.000000
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %ops : [num_users=2] = placeholder[target=ops]
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, arg7_1, %get_index), kwargs = {})
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %store : [num_users=1] = call_method[target=store](args = (%ops, buf6, %get_index_1, %load, None), kwargs = {})
V0614 00:48:34.147000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] return store
V0614 00:48:34.164000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf7 with estimated runtime 0.000000
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %ops : [num_users=2] = placeholder[target=ops]
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, arg7_1, %get_index), kwargs = {})
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %store : [num_users=1] = call_method[target=store](args = (%ops, buf7, %get_index_1, %load, None), kwargs = {})
V0614 00:48:34.164000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] return store
V0614 00:48:34.180000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf8 with estimated runtime 0.000000
V0614 00:48:34.180000 140616046391680 torch/_inductor/scheduler.py:2688] [5/0_1] Generating code for node buf9 with estimated runtime 0.000000
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] get_bounds:
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] graph():
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %ops : [num_users=4] = placeholder[target=ops]
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, arg7_1, %get_index), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index1,), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %load_1 : [num_users=1] = call_method[target=load](args = (%ops, buf8, %get_index_1), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %mul : [num_users=1] = call_method[target=mul](args = (%ops, %load, %load_1), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %get_index_2 : [num_users=1] = call_module[target=get_index](args = (index2,), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] %store : [num_users=1] = call_method[target=store](args = (%ops, buf9, %get_index_2, %mul, None), kwargs = {})
V0614 00:48:34.180000 140616046391680 torch/_inductor/bounds.py:63] [5/0_1] return store
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] Output code:
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] # AOT ID: ['3_inference']
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import torch
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import math
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import random
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import os
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] import tempfile
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from math import inf, nan
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch import device, empty_strided
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] aten = torch.ops.aten
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] async_compile = AsyncCompile()
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param0 = None # device(type='cpu') torch.float32 (50358, 768) (768, 1) 7fe22c324c70
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param1 = None # device(type='cpu') torch.float32 (2, 768) (768, 1) 7fe22c324bd0
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param3 = None # device(type='cpu') torch.float32 (768,) (1,) 7fe22c324b30
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param4 = None # device(type='cpu') torch.float32 (768,) (1,) 7fe22c324b80
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param6 = None # device(type='cpu') torch.float32 (1, 832, 768) (638976, 768, 1) 7fe1827beca0
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] cpp_fused_add_cat_embedding_mul_native_layer_norm_0 = async_compile.cpp_pybinding(['const int64_t*', 'const float*', 'const int64_t*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'const float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*', 'float*'], '''
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] extern "C" void kernel(const int64_t* in_ptr0,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] const float* in_ptr1,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] const int64_t* in_ptr2,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] const float* in_ptr3,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] const float* in_ptr4,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] const float* in_ptr5,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] const float* in_ptr6,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] const float* in_ptr7,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] const float* in_ptr8,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] float* out_ptr0,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] float* out_ptr1,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] float* out_ptr2,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] float* out_ptr3,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] float* out_ptr4,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] float* out_ptr5,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] float* out_ptr6,
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] float* out_ptr7)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma omp parallel num_threads(56)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] int tid = omp_get_thread_num();
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma omp for
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(832L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] Welford<float> tmp_acc0 = Welford<float>();
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] Welford<at::vec::Vectorized<float>> tmp_acc0_vec = Welford<at::vec::Vectorized<float>>();
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] static WeightRecp<at::vec::Vectorized<float>> weight_recps(static_cast<long>(48L));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp0 = in_ptr0[static_cast<long>(x0)];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp10 = in_ptr2[static_cast<long>(x0)];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp21 = at::vec::Vectorized<float>::loadu(in_ptr4 + static_cast<long>(x1 + (768L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp1 = 50358L;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp2 = c10::convert<int64_t>(tmp1);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp3 = decltype(tmp0)(tmp0 + tmp2);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp4 = tmp0 < 0;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp5 = tmp4 ? tmp3 : tmp0;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp6 = tmp5;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp7 = c10::convert<int64_t>(tmp6);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] TORCH_CHECK((0 <= tmp7) & (tmp7 < 50358L), "index out of bounds: 0 <= tmp7 < 50358L");
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp9 = at::vec::Vectorized<float>::loadu(in_ptr1 + static_cast<long>(x1 + (768L*tmp5)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp11 = 2L;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp12 = c10::convert<int64_t>(tmp11);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp13 = decltype(tmp10)(tmp10 + tmp12);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp14 = tmp10 < 0;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp15 = tmp14 ? tmp13 : tmp10;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp16 = tmp15;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp17 = c10::convert<int64_t>(tmp16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] TORCH_CHECK((0 <= tmp17) & (tmp17 < 2L), "index out of bounds: 0 <= tmp17 < 2L");
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp19 = at::vec::Vectorized<float>::loadu(in_ptr3 + static_cast<long>(x1 + (768L*tmp15)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp20 = tmp9 + tmp19;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp22 = tmp20 + tmp21;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] tmp22.store(out_ptr0 + static_cast<long>(x1 + (768L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] tmp_acc0_vec = welford_combine(tmp_acc0_vec, tmp22, &weight_recps);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] tmp_acc0 = welford_combine(tmp_acc0, welford_vec_reduce_all(tmp_acc0_vec));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] out_ptr1[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.mean);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] out_ptr2[static_cast<long>(x0)] = static_cast<float>(tmp_acc0.m2);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x1=static_cast<long>(0L); x1<static_cast<long>(768L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp0 = at::vec::Vectorized<float>::loadu(out_ptr0 + static_cast<long>(x1 + (768L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp1 = out_ptr1[static_cast<long>(x0)];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp4 = out_ptr2[static_cast<long>(x0)];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp12 = at::vec::Vectorized<float>::loadu(in_ptr5 + static_cast<long>(x1), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp14 = at::vec::Vectorized<float>::loadu(in_ptr6 + static_cast<long>(x1), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp2 = at::vec::Vectorized<float>(tmp1);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp3 = tmp0 - tmp2;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp5 = static_cast<float>(768.0);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp6 = tmp4 / tmp5;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp7 = static_cast<float>(1e-12);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp8 = decltype(tmp6)(tmp6 + tmp7);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp9 = 1 / std::sqrt(tmp8);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp10 = at::vec::Vectorized<float>(tmp9);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp11 = tmp3 * tmp10;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp13 = tmp11 * tmp12;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp15 = tmp13 + tmp14;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] tmp15.store(out_ptr3 + static_cast<long>(x1 + (768L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma omp single
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(64L + x1 + (64L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] tmp0.store(out_ptr4 + static_cast<long>(x1 + (192L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma omp single
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(128L + x1 + (64L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] tmp0.store(out_ptr5 + static_cast<long>(x1 + (192L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma omp single
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr7 + static_cast<long>(192L + x1 + (64L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] tmp0.store(out_ptr6 + static_cast<long>(x1 + (192L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma omp single
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(9L); x0+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] #pragma GCC ivdep
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x1=static_cast<long>(0L); x1<static_cast<long>(64L); x1+=static_cast<long>(1L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] for(long x2=static_cast<long>(0L); x2<static_cast<long>(192L); x2+=static_cast<long>(16L))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] {
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp0 = in_ptr7[static_cast<long>(128L + x1 + (64L*x0))];
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp1 = at::vec::Vectorized<float>::loadu(in_ptr8 + static_cast<long>(x2 + (192L*x0)), 16);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp2 = at::vec::Vectorized<float>(tmp0);
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] auto tmp3 = tmp2 * tmp1;
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] tmp3.store(out_ptr7 + static_cast<long>(x2 + (192L*x1) + (12288L*x0)));
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] }
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] ''')
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] async_compile.wait(globals())
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] del async_compile
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] def call(args):
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] arg6_1, arg7_1, arg8_1 = args
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] args.clear()
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] assert_size_stride(arg6_1, (1, 832), (832, 1))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] assert_size_stride(arg7_1, (1, 832), (832, 1))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] assert_size_stride(arg8_1, (1, 832), (832, 1))
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf1 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf2 = empty_strided_cpu((1, 832, 1), (832, 1, 832), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf4 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf8 = empty_strided_cpu((1, 9, 192), (1728, 192, 1), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf5 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 0) # alias
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf6 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 64) # alias
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf7 = reinterpret_tensor(buf8, (1, 9, 64), (1728, 192, 1), 128) # alias
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] buf9 = empty_strided_cpu((1, 9, 64, 192), (110592, 12288, 192, 1), torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] cpp_fused_add_cat_embedding_mul_native_layer_norm_0(arg6_1, _frozen_param0, arg8_1, _frozen_param1, _frozen_param6, _frozen_param3, _frozen_param4, arg7_1, buf8, buf0, buf1, buf2, buf4, buf5, buf6, buf7, buf9)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] del arg6_1
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] del arg8_1
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] return (buf4, reinterpret_tensor(buf9, (1, 1, 9, 64, 192), (110592, 110592, 12288, 192, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 832, 1), (832, 832, 1, 1), 0), reinterpret_tensor(arg7_1, (1, 1, 1, 832), (832, 832, 832, 1), 0), reinterpret_tensor(arg7_1, (1, 13, 64), (832, 64, 1), 0), )
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._dynamo.testing import rand_strided
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.utils import print_performance
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] global _frozen_param0
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param0 = rand_strided((50358, 768), (768, 1), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] global _frozen_param1
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param1 = rand_strided((2, 768), (768, 1), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] global _frozen_param3
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param3 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] global _frozen_param4
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param4 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] global _frozen_param6
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] _frozen_param6 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] arg6_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] arg7_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.float32)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] arg8_1 = rand_strided((1, 832), (832, 1), device='cpu', dtype=torch.int64)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] fn = lambda: call([arg6_1, arg7_1, arg8_1])
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] if __name__ == "__main__":
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code] compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:34.224000 140616046391680 torch/_inductor/graph.py:1681] [5/0_1] [__output_code]
V0614 00:48:35.359000 140616046391680 torch/_inductor/graph.py:1714] [5/0_1] Output code written to: /tmp/torchinductor_leslie/qn/cqnbrpdzwchbnkizgxa644pd2tr2v3yro2b26pyq2apqsqovh6ej.py
I0614 00:48:35.359000 140616046391680 torch/_inductor/graph.py:1715] [5/0_1] [__output_code] Output code written to: /tmp/torchinductor_leslie/qn/cqnbrpdzwchbnkizgxa644pd2tr2v3yro2b26pyq2apqsqovh6ej.py
V0614 00:48:35.360000 140616046391680 torch/_inductor/compile_fx.py:531] [5/0_1] FX codegen and compilation took 1.522s
I0614 00:48:35.360000 140616046391680 torch/_dynamo/logging.py:56] [5/0_1] Step 3: torchinductor done compiling FORWARDS graph 3
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1] TRACED GRAPH
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1] ===== FROZEN GRAPH =====
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1] def forward(self, arg0_1: "f32[1, 1, 9, 64, 192]", arg1_1: "f32[1, 1, 832, 1]", arg2_1: "f32[1, 1, 1, 832]"):
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1] return (arg0_1, arg1_1, arg2_1)
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1]
V0614 00:48:35.553000 140616046391680 torch/_inductor/freezing.py:118] [8/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] TRACED GRAPH
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] ===== FROZEN GRAPH =====
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] def forward(self, arg6_1: "f32[1, 832, 768]"):
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _frozen_param6: "bf16[768]" = self._frozen_param6
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] convert_element_type_2: "bf16[1, 832, 768]" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # No stacktrace found for following nodes
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _frozen_param12 = self._frozen_param12
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _linear_pointwise_default_5: "bf16[1, 832, 768]" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] view_2: "bf16[1, 832, 12, 64]" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] permute_1: "bf16[1, 12, 832, 64]" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _frozen_param8: "bf16[768]" = self._frozen_param8
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # No stacktrace found for following nodes
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _frozen_param13 = self._frozen_param13
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _linear_pointwise_default_4: "bf16[1, 832, 768]" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] view_5: "bf16[1, 832, 12, 64]" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] permute_3: "bf16[1, 12, 832, 64]" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _frozen_param10: "bf16[768]" = self._frozen_param10
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # No stacktrace found for following nodes
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _frozen_param14 = self._frozen_param14
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] _linear_pointwise_default_3: "bf16[1, 832, 768]" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] view_8: "bf16[1, 832, 12, 64]" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] permute_5: "bf16[1, 12, 832, 64]" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1] return (permute_1, permute_3, permute_5)
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
V0614 00:48:35.699000 140616046391680 torch/_inductor/freezing.py:118] [9/0_1]
I0614 00:48:35.701000 140616046391680 torch/_dynamo/logging.py:56] [9/0_1] Step 3: torchinductor compiling FORWARDS graph 5
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] ===== AFTER POST GRAD =====
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] def forward(self, arg6_1: "f32[1, 832, 768][638976, 768, 1]cpu"):
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _frozen_param6: "bf16[768][1]cpu" = self._frozen_param6
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # No stacktrace found for following nodes
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _frozen_param12: "bf16[768, 768][1, 0]cpu" = self._frozen_param12
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _frozen_param8: "bf16[768][1]cpu" = self._frozen_param8
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # No stacktrace found for following nodes
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _frozen_param13: "bf16[768, 768][1, 0]cpu" = self._frozen_param13
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _frozen_param10: "bf16[768][1]cpu" = self._frozen_param10
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # No stacktrace found for following nodes
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _frozen_param14: "bf16[768, 768][1, 0]cpu" = self._frozen_param14
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:468 in forward, code: query_layer = self.transpose_for_scores(self.query(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] convert_element_type_2: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.prims.convert_element_type.default(arg6_1, torch.bfloat16); arg6_1 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _linear_pointwise_default_5: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param12, _frozen_param6, 'none', [], ''); _frozen_param12 = _frozen_param6 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] view_2: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_5, [1, 832, 12, 64]); _linear_pointwise_default_5 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] permute_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_2, [0, 2, 1, 3]); view_2 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:469 in forward, code: key_layer = self.transpose_for_scores(self.key(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _linear_pointwise_default_4: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param13, _frozen_param8, 'none', [], ''); _frozen_param13 = _frozen_param8 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] view_5: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_4, [1, 832, 12, 64]); _linear_pointwise_default_4 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] permute_3: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_5, [0, 2, 1, 3]); view_5 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:470 in forward, code: value_layer = self.transpose_for_scores(self.value(hidden_states))
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] _linear_pointwise_default_3: "bf16[1, 832, 768][638976, 768, 1]cpu" = torch.ops.mkldnn._linear_pointwise.default(convert_element_type_2, _frozen_param14, _frozen_param10, 'none', [], ''); convert_element_type_2 = _frozen_param14 = _frozen_param10 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:443 in transpose_for_scores, code: x = x.view(*new_x_shape)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] view_8: "bf16[1, 832, 12, 64][638976, 768, 64, 1]cpu" = torch.ops.aten.reshape.default(_linear_pointwise_default_3, [1, 832, 12, 64]); _linear_pointwise_default_3 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:444 in transpose_for_scores, code: return x.permute(0, 2, 1, 3)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] permute_5: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.permute.default(view_8, [0, 2, 1, 3]); view_8 = None
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs] return (permute_1, permute_3, permute_5)
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.712000 140616046391680 torch/_inductor/compile_fx.py:748] [9/0_1] [__post_grad_graphs]
V0614 00:48:35.713000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %arg6_1 : [num_users=1] = placeholder[target=arg6_1]
V0614 00:48:35.714000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param6 : [num_users=1] = get_attr[target=_frozen_param6]
V0614 00:48:35.714000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param12 : [num_users=1] = get_attr[target=_frozen_param12]
V0614 00:48:35.715000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param8 : [num_users=1] = get_attr[target=_frozen_param8]
V0614 00:48:35.715000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param13 : [num_users=1] = get_attr[target=_frozen_param13]
V0614 00:48:35.715000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param10 : [num_users=1] = get_attr[target=_frozen_param10]
V0614 00:48:35.716000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_frozen_param14 : [num_users=1] = get_attr[target=_frozen_param14]
V0614 00:48:35.716000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %convert_element_type_2 : [num_users=3] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%arg6_1, torch.bfloat16), kwargs = {})
V0614 00:48:35.716000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:35.718000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_linear_pointwise_default_5 : [num_users=1] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%convert_element_type_2, %_frozen_param12, %_frozen_param6, none, [], ), kwargs = {})
V0614 00:48:35.718000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function register_onednn_fusion_ops.<locals>.linear_unary at 0x7fe18485d3f0>
V0614 00:48:35.718000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %view_2 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%_linear_pointwise_default_5, [1, 832, 12, 64]), kwargs = {})
V0614 00:48:35.718000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function view at 0x7fe2179c3130>
V0614 00:48:35.719000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_2, [0, 2, 1, 3]), kwargs = {})
V0614 00:48:35.719000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function permute at 0x7fe2179c3370>
V0614 00:48:35.720000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_linear_pointwise_default_4 : [num_users=1] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%convert_element_type_2, %_frozen_param13, %_frozen_param8, none, [], ), kwargs = {})
V0614 00:48:35.721000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function register_onednn_fusion_ops.<locals>.linear_unary at 0x7fe18485d3f0>
V0614 00:48:35.721000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %view_5 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%_linear_pointwise_default_4, [1, 832, 12, 64]), kwargs = {})
V0614 00:48:35.721000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function view at 0x7fe2179c3130>
V0614 00:48:35.722000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %permute_3 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_5, [0, 2, 1, 3]), kwargs = {})
V0614 00:48:35.722000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function permute at 0x7fe2179c3370>
V0614 00:48:35.723000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %_linear_pointwise_default_3 : [num_users=1] = call_function[target=torch.ops.mkldnn._linear_pointwise.default](args = (%convert_element_type_2, %_frozen_param14, %_frozen_param10, none, [], ), kwargs = {})
V0614 00:48:35.723000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function register_onednn_fusion_ops.<locals>.linear_unary at 0x7fe18485d3f0>
V0614 00:48:35.723000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %view_8 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%_linear_pointwise_default_3, [1, 832, 12, 64]), kwargs = {})
V0614 00:48:35.723000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function view at 0x7fe2179c3130>
V0614 00:48:35.724000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering %permute_5 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_8, [0, 2, 1, 3]), kwargs = {})
V0614 00:48:35.724000 140616046391680 torch/_inductor/graph.py:976] [9/0_1] via <function permute at 0x7fe2179c3370>
V0614 00:48:35.725000 140616046391680 torch/_inductor/graph.py:1173] [9/0_1] lowering return (permute_1, permute_3, permute_5)
V0614 00:48:35.725000 140616046391680 torch/_inductor/graph.py:1097] [9/0_1] Force channels last inputs for 0 conv for the current graph with id 5
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] scheduling ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] 'cpu',
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] torch.bfloat16,
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] def inner_fn(index):
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] _, i1, i2 = index
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] tmp0 = ops.load(arg6_1, i2 + 768 * i1)
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] tmp1 = ops.to_dtype(tmp0, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] return tmp1
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ,
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ranges=[1, 832, 768],
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origin_node=convert_element_type_2,
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origins={convert_element_type_2}
V0614 00:48:35.731000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ))
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] scheduling LinearUnary(
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] python_kernel_name='torch.ops.mkldnn._linear_pointwise',
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] name=buf1,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]),
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] inputs=[ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] 'cpu',
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] torch.bfloat16,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] def inner_fn(index):
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] _, i1, i2 = index
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] tmp0 = ops.load(arg6_1, i2 + 768 * i1)
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] tmp1 = ops.to_dtype(tmp0, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] return tmp1
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ranges=[1, 832, 768],
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origin_node=convert_element_type_2,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origins={convert_element_type_2}
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )), ConstantBuffer(name='_frozen_param12', layout=FixedLayout('cpu', torch.bfloat16, size=[768, 768], stride=[1, 0])), ConstantBuffer(name='_frozen_param6', layout=FixedLayout('cpu', torch.bfloat16, size=[768], stride=[1]))],
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] constant_args=['none', [-1], ''],
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] kwargs={},
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] output_view=None,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] python_kernel_name=torch.ops.mkldnn._linear_pointwise,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] cpp_kernel_name=mkldnn::_linear_pointwise,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] op_overload=None,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] arg_properties=[{}, {}, {}],
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] kwarg_properties=None,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] unbacked_bindings={},
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origin_node=_linear_pointwise_default_5,
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origins={_linear_pointwise_default_5}
V0614 00:48:35.732000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] scheduling LinearUnary(
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] python_kernel_name='torch.ops.mkldnn._linear_pointwise',
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] name=buf2,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]),
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] inputs=[ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] 'cpu',
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] torch.bfloat16,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] def inner_fn(index):
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] _, i1, i2 = index
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] tmp0 = ops.load(arg6_1, i2 + 768 * i1)
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] tmp1 = ops.to_dtype(tmp0, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] return tmp1
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ranges=[1, 832, 768],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origin_node=convert_element_type_2,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origins={convert_element_type_2}
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )), ConstantBuffer(name='_frozen_param13', layout=FixedLayout('cpu', torch.bfloat16, size=[768, 768], stride=[1, 0])), ConstantBuffer(name='_frozen_param8', layout=FixedLayout('cpu', torch.bfloat16, size=[768], stride=[1]))],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] constant_args=['none', [-1], ''],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] kwargs={},
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] output_view=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] python_kernel_name=torch.ops.mkldnn._linear_pointwise,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] cpp_kernel_name=mkldnn::_linear_pointwise,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] op_overload=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] arg_properties=[{}, {}, {}],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] kwarg_properties=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] unbacked_bindings={},
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origin_node=_linear_pointwise_default_4,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origins={_linear_pointwise_default_4}
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] scheduling LinearUnary(
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] python_kernel_name='torch.ops.mkldnn._linear_pointwise',
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] name=buf3,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]),
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] inputs=[ComputedBuffer(name='buf0', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 832, 768], stride=[638976, 768, 1]), data=Pointwise(
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] 'cpu',
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] torch.bfloat16,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] def inner_fn(index):
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] _, i1, i2 = index
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] tmp0 = ops.load(arg6_1, i2 + 768 * i1)
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] tmp1 = ops.to_dtype(tmp0, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] return tmp1
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ranges=[1, 832, 768],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origin_node=convert_element_type_2,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origins={convert_element_type_2}
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )), ConstantBuffer(name='_frozen_param14', layout=FixedLayout('cpu', torch.bfloat16, size=[768, 768], stride=[1, 0])), ConstantBuffer(name='_frozen_param10', layout=FixedLayout('cpu', torch.bfloat16, size=[768], stride=[1]))],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] constant_args=['none', [-1], ''],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] kwargs={},
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] output_view=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] python_kernel_name=torch.ops.mkldnn._linear_pointwise,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] cpp_kernel_name=mkldnn::_linear_pointwise,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] op_overload=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] arg_properties=[{}, {}, {}],
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] kwarg_properties=None,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] unbacked_bindings={},
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origin_node=_linear_pointwise_default_3,
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] origins={_linear_pointwise_default_3}
V0614 00:48:35.733000 140616046391680 torch/_inductor/scheduler.py:1601] [9/0_1] )
V0614 00:48:35.734000 140616046391680 torch/_inductor/scheduler.py:1671] [9/0_1] scheduling output buf1
V0614 00:48:35.734000 140616046391680 torch/_inductor/scheduler.py:1671] [9/0_1] scheduling output buf2
V0614 00:48:35.734000 140616046391680 torch/_inductor/scheduler.py:1671] [9/0_1] scheduling output buf3
V0614 00:48:35.734000 140616046391680 torch/_inductor/scheduler.py:2688] [9/0_1] Generating code for node buf0 with estimated runtime 0.000000
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] get_bounds:
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] graph():
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] %ops : [num_users=3] = placeholder[target=ops]
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] %get_index : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] %load : [num_users=1] = call_method[target=load](args = (%ops, arg6_1, %get_index), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] %get_index_1 : [num_users=1] = call_module[target=get_index](args = (index0,), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] %to_dtype_1 : [num_users=1] = call_method[target=to_dtype](args = (%ops, %load, torch.bfloat16), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] %store : [num_users=1] = call_method[target=store](args = (%ops, buf0, %get_index_1, %to_dtype_1, None), kwargs = {})
V0614 00:48:35.735000 140616046391680 torch/_inductor/bounds.py:63] [9/0_1] return store
V0614 00:48:35.738000 140616046391680 torch/_inductor/scheduler.py:2688] [9/0_1] Generating code for node buf1 with estimated runtime 0.000000
V0614 00:48:35.739000 140616046391680 torch/_inductor/scheduler.py:2688] [9/0_1] Generating code for node buf2 with estimated runtime 0.000000
V0614 00:48:35.740000 140616046391680 torch/_inductor/scheduler.py:2688] [9/0_1] Generating code for node buf3 with estimated runtime 0.000000
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] Output code:
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] # AOT ID: ['5_inference']
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import torch
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import math
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import random
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import os
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] import tempfile
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from math import inf, nan
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch import device, empty_strided
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] aten = torch.ops.aten
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] async_compile = AsyncCompile()
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param6 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7fe181de3b50
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param12 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7fe181dfce00
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param8 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7fe181dddcb0
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param13 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7fe181dd1b20
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param10 = None # device(type='cpu') torch.bfloat16 (768,) (1,) 7fe181de9800
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param14 = None # device(type='cpu') torch.bfloat16 (768, 768) (1, 0) 7fe181dcbec0
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] cpp_fused__to_copy_0 = async_compile.cpp_pybinding(['const float*', 'bfloat16*'], '''
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] #include "/tmp/torchinductor_leslie/sk/cskh5dx62fglpphcrl6723dnmowdabouerrzy3dmqcngbxwfa7bv.h"
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] extern "C" void kernel(const float* in_ptr0,
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] bfloat16* out_ptr0)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] {
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] #pragma omp parallel num_threads(56)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] {
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] int tid = omp_get_thread_num();
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] {
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] #pragma omp for
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] for(long x0=static_cast<long>(0L); x0<static_cast<long>(638976L); x0+=static_cast<long>(16L))
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] {
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] auto tmp0 = at::vec::Vectorized<float>::loadu(in_ptr0 + static_cast<long>(x0), 16);
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] auto tmp1 = at::vec::convert<bfloat16>(tmp0);
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] tmp1.store(out_ptr0 + static_cast<long>(x0), 16);
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] }
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] }
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] }
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] }
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] ''')
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] async_compile.wait(globals())
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] del async_compile
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] def call(args):
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] arg6_1, = args
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] args.clear()
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] assert_size_stride(arg6_1, (1, 832, 768), (638976, 768, 1))
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] buf0 = empty_strided_cpu((1, 832, 768), (638976, 768, 1), torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] cpp_fused__to_copy_0(arg6_1, buf0)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] del arg6_1
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] buf1 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param12, _frozen_param6, 'none', [-1], '')
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] buf2 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param13, _frozen_param8, 'none', [-1], '')
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] buf3 = torch.ops.mkldnn._linear_pointwise(buf0, _frozen_param14, _frozen_param10, 'none', [-1], '')
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] return (reinterpret_tensor(buf1, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf2, (1, 12, 832, 64), (638976, 64, 768, 1), 0), reinterpret_tensor(buf3, (1, 12, 832, 64), (638976, 64, 768, 1), 0), )
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._dynamo.testing import rand_strided
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.utils import print_performance
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] global _frozen_param6
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param6 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] global _frozen_param12
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param12 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] global _frozen_param8
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param8 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] global _frozen_param13
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param13 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] global _frozen_param10
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param10 = rand_strided((768, ), (1, ), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] global _frozen_param14
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] _frozen_param14 = rand_strided((768, 768), (1, 0), device='cpu', dtype=torch.bfloat16)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] arg6_1 = rand_strided((1, 832, 768), (638976, 768, 1), device='cpu', dtype=torch.float32)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] fn = lambda: call([arg6_1])
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] if __name__ == "__main__":
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code] compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:35.742000 140616046391680 torch/_inductor/graph.py:1681] [9/0_1] [__output_code]
V0614 00:48:36.780000 140616046391680 torch/_inductor/graph.py:1714] [9/0_1] Output code written to: /tmp/torchinductor_leslie/5n/c5nsnj5quh3sljkcq2l2mq7xpgfj4dvyyyyy2kcwvnmub5t6ujeu.py
I0614 00:48:36.781000 140616046391680 torch/_inductor/graph.py:1715] [9/0_1] [__output_code] Output code written to: /tmp/torchinductor_leslie/5n/c5nsnj5quh3sljkcq2l2mq7xpgfj4dvyyyyy2kcwvnmub5t6ujeu.py
V0614 00:48:36.782000 140616046391680 torch/_inductor/compile_fx.py:531] [9/0_1] FX codegen and compilation took 1.082s
I0614 00:48:36.782000 140616046391680 torch/_dynamo/logging.py:56] [9/0_1] Step 3: torchinductor done compiling FORWARDS graph 5
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1] TRACED GRAPH
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1] ===== FROZEN GRAPH =====
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1] def forward(self):
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1160 in _bigbird_block_rand_mask_with_head, code: plan_block_length = np.array(plan_from_length) // from_block_size
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1] _frozen_param0: "i64[2]" = self._frozen_param0
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1] return (_frozen_param0,)
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]
V0614 00:48:36.979000 140616046391680 torch/_inductor/freezing.py:118] [12/0_1]
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] TRACED GRAPH
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] ===== FROZEN GRAPH =====
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] def forward(self, arg0_1: "i32[13, 3]", arg1_1: "i32[13, 3]", arg2_1: "i32[13, 3]", arg3_1: "i32[13, 3]", arg4_1: "i32[13, 3]", arg5_1: "i32[13, 3]", arg6_1: "i32[13, 3]", arg7_1: "i32[13, 3]", arg8_1: "i32[13, 3]", arg9_1: "i32[13, 3]", arg10_1: "i32[13, 3]", arg11_1: "i32[13, 3]"):
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_1: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_3: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_5: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_7: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_9: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_11: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_13: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_15: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_17: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_19: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_21: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] slice_23: "i32[11, 3]" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0] return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]
V0614 00:48:37.087000 140616046391680 torch/_inductor/freezing.py:118] [14/0]
I0614 00:48:37.088000 140616046391680 torch/_dynamo/logging.py:56] [14/0] Step 3: torchinductor compiling FORWARDS graph 7
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] ===== AFTER POST GRAD =====
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] def forward(self, arg0_1: "i32[13, 3][3, 1]cpu", arg1_1: "i32[13, 3][3, 1]cpu", arg2_1: "i32[13, 3][3, 1]cpu", arg3_1: "i32[13, 3][3, 1]cpu", arg4_1: "i32[13, 3][3, 1]cpu", arg5_1: "i32[13, 3][3, 1]cpu", arg6_1: "i32[13, 3][3, 1]cpu", arg7_1: "i32[13, 3][3, 1]cpu", arg8_1: "i32[13, 3][3, 1]cpu", arg9_1: "i32[13, 3][3, 1]cpu", arg10_1: "i32[13, 3][3, 1]cpu", arg11_1: "i32[13, 3][3, 1]cpu"):
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1172 in torch_dynamo_resume_in__bigbird_block_rand_mask_with_head_at_1165, code: rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_1: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg0_1, 0, 1, 12); arg0_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_3: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg1_1, 0, 1, 12); arg1_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_5: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg2_1, 0, 1, 12); arg2_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_7: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg3_1, 0, 1, 12); arg3_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_9: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg4_1, 0, 1, 12); arg4_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_11: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg5_1, 0, 1, 12); arg5_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_13: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg6_1, 0, 1, 12); arg6_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_15: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg7_1, 0, 1, 12); arg7_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_17: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg8_1, 0, 1, 12); arg8_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_19: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg9_1, 0, 1, 12); arg9_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_21: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg10_1, 0, 1, 12); arg10_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] slice_23: "i32[11, 3][3, 1]cpu" = torch.ops.aten.slice.Tensor(arg11_1, 0, 1, 12); arg11_1 = None
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs] return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]
V0614 00:48:37.095000 140616046391680 torch/_inductor/compile_fx.py:748] [14/0] [__post_grad_graphs]
V0614 00:48:37.096000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
V0614 00:48:37.097000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
V0614 00:48:37.098000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
V0614 00:48:37.098000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg3_1 : [num_users=1] = placeholder[target=arg3_1]
V0614 00:48:37.098000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg4_1 : [num_users=1] = placeholder[target=arg4_1]
V0614 00:48:37.098000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg5_1 : [num_users=1] = placeholder[target=arg5_1]
V0614 00:48:37.099000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg6_1 : [num_users=1] = placeholder[target=arg6_1]
V0614 00:48:37.099000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg7_1 : [num_users=1] = placeholder[target=arg7_1]
V0614 00:48:37.099000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg8_1 : [num_users=1] = placeholder[target=arg8_1]
V0614 00:48:37.099000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg9_1 : [num_users=1] = placeholder[target=arg9_1]
V0614 00:48:37.100000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg10_1 : [num_users=1] = placeholder[target=arg10_1]
V0614 00:48:37.100000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %arg11_1 : [num_users=1] = placeholder[target=arg11_1]
V0614 00:48:37.100000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_1 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg0_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.100000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.101000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_3 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg1_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.102000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.102000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_5 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg2_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.102000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.103000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_7 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg3_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.103000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.104000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_9 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg4_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.104000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.104000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_11 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg5_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.105000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.105000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_13 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg6_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.105000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.106000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_15 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg7_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.106000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.107000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_17 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg8_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.107000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.108000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_19 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg9_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.108000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.108000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_21 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg10_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.108000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.109000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering %slice_23 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg11_1, 0, 1, 12), kwargs = {})
V0614 00:48:37.109000 140616046391680 torch/_inductor/graph.py:976] [14/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:37.110000 140616046391680 torch/_inductor/graph.py:1173] [14/0] lowering return (slice_1, slice_3, slice_5, slice_7, slice_9, slice_11, slice_13, slice_15, slice_17, slice_19, slice_21, slice_23)
V0614 00:48:37.110000 140616046391680 torch/_inductor/graph.py:1097] [14/0] Force channels last inputs for 0 conv for the current graph with id 7
V0614 00:48:37.111000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg0_1
V0614 00:48:37.111000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg1_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg2_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg3_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg4_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg5_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg6_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg7_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg8_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg9_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg10_1
V0614 00:48:37.112000 140616046391680 torch/_inductor/scheduler.py:1671] [14/0] scheduling output arg11_1
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] Output code:
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] # AOT ID: ['7_inference']
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from ctypes import c_void_p, c_long
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import torch
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import math
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import random
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import os
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] import tempfile
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from math import inf, nan
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.hooks import run_intermediate_hooks
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.utils import maybe_profile
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.codegen.memory_planning import _align as align
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch import device, empty_strided
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.async_compile import AsyncCompile
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.select_algorithm import extern_kernels
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.codegen.multi_kernel import MultiKernelCall
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] aten = torch.ops.aten
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] inductor_ops = torch.ops.inductor
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] _quantized = torch.ops._quantized
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride = torch._C._dynamo.guards.assert_size_stride
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] alloc_from_pool = torch.ops.inductor._alloc_from_pool
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] reinterpret_tensor = torch.ops.inductor._reinterpret_tensor
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] async_compile = AsyncCompile()
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] async_compile.wait(globals())
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] del async_compile
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] def call(args):
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1 = args
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] args.clear()
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg0_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg1_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg2_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg3_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg4_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg5_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg6_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg7_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg8_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg9_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg10_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] assert_size_stride(arg11_1, (13, 3), (3, 1))
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] return (reinterpret_tensor(arg0_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg1_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg2_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg3_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg4_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg5_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg6_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg7_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg8_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg9_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg10_1, (11, 3), (3, 1), 3), reinterpret_tensor(arg11_1, (11, 3), (3, 1), 3), )
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] def benchmark_compiled_module(times=10, repeat=10):
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._dynamo.testing import rand_strided
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.utils import print_performance
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg0_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg1_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg2_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg3_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg4_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg5_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg6_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg7_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg8_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg9_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg10_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] arg11_1 = rand_strided((13, 3), (3, 1), device='cpu', dtype=torch.int32)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] fn = lambda: call([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1])
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] return print_performance(fn, times=times, repeat=repeat)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] if __name__ == "__main__":
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] from torch._inductor.wrapper_benchmark import compiled_module_main
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code] compiled_module_main('hf_BigBird', benchmark_compiled_module)
V0614 00:48:37.113000 140616046391680 torch/_inductor/graph.py:1681] [14/0] [__output_code]
V0614 00:48:37.115000 140616046391680 torch/_inductor/graph.py:1714] [14/0] Output code written to: /tmp/torchinductor_leslie/ki/ckiwvz5uks7efnnd5ew4d76276bcutmxkkfh6ozi5dgumqv3m2qc.py
I0614 00:48:37.115000 140616046391680 torch/_inductor/graph.py:1715] [14/0] [__output_code] Output code written to: /tmp/torchinductor_leslie/ki/ckiwvz5uks7efnnd5ew4d76276bcutmxkkfh6ozi5dgumqv3m2qc.py
V0614 00:48:37.115000 140616046391680 torch/_inductor/compile_fx.py:531] [14/0] FX codegen and compilation took 0.027s
I0614 00:48:37.115000 140616046391680 torch/_dynamo/logging.py:56] [14/0] Step 3: torchinductor done compiling FORWARDS graph 7
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] TRACED GRAPH
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] ===== FROZEN GRAPH =====
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] def forward(self, arg0_1: "i32[11, 3]", arg1_1: "i32[11, 3]", arg2_1: "i32[11, 3]", arg3_1: "i32[11, 3]", arg4_1: "i32[11, 3]", arg5_1: "i32[11, 3]", arg6_1: "i32[11, 3]", arg7_1: "i32[11, 3]", arg8_1: "i32[11, 3]", arg9_1: "i32[11, 3]", arg10_1: "i32[11, 3]", arg11_1: "i32[11, 3]", arg12_1: "bf16[1, 12, 832, 64]", arg13_1: "f32[1, 13, 64]", arg14_1: "bf16[1, 12, 832, 64]", arg15_1: "bf16[1, 12, 832, 64]", arg16_1: "f32[1, 1, 1, 832]", arg17_1: "f32[1, 1, 9, 64, 192]", arg18_1: "f32[1, 1, 832, 1]"):
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat: "i32[132, 3]" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view: "i32[12, 11, 3]" = torch.ops.aten.reshape.default(cat, [12, 11, 3]); cat = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type: "i64[12, 11, 3]" = torch.ops.prims.convert_element_type.default(view, torch.int64); view = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze: "i64[1, 12, 11, 3]" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select: "f32[13, 64]" = torch.ops.aten.select.int(arg13_1, 0, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_1: "i64[12, 11, 3]" = torch.ops.aten.select.int(unsqueeze, 0, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_1: "i64[396]" = torch.ops.aten.reshape.default(select_1, [396]); select_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] index: "f32[396, 64]" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_2: "f32[1, 396, 64]" = torch.ops.aten.reshape.default(index, [1, 396, 64]); index = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_3: "f32[1, 12, 11, 192]" = torch.ops.aten.reshape.default(view_2, [1, 12, 11, 192]); view_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_2: "f32[1, 11, 64]" = torch.ops.aten.slice.Tensor(arg13_1, 1, 1, -1); arg13_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_1: "f32[1, 11, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_2: "f32[1, 11, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute: "f32[1, 1, 11, 64, 1]" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_3: "f32[1, 12, 11, 192, 1]" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_1: "f32[1, 12, 11, 1, 192]" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul: "f32[1, 12, 11, 64, 192]" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_4: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.reshape.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_5: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.reshape.default(arg14_1, [1, 12, 13, 64, -1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_6: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.reshape.default(arg15_1, [1, 12, 13, 64, -1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] iota: "i64[396]" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] div: "i64[396]" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_1: "i64[396]" = torch.ops.aten.mul.Tensor(div, 13); div = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_7: "i64[396]" = torch.ops.aten.reshape.default(unsqueeze, [-1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add: "i64[396]" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] clone_2: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_8: "bf16[156, 64, 64]" = torch.ops.aten.reshape.default(clone_2, [156, 64, 64]); clone_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] index_1: "bf16[396, 64, 64]" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_9: "bf16[1, 12, 33, 64, 64]" = torch.ops.aten.reshape.default(index_1, [1, 12, 33, 64, 64]); index_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_10: "bf16[1, 12, 11, 192, 64]" = torch.ops.aten.reshape.default(view_9, [1, 12, 11, 192, -1]); view_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] clone_3: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_12: "bf16[156, 64, 64]" = torch.ops.aten.reshape.default(clone_3, [156, 64, 64]); clone_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] index_2: "bf16[396, 64, 64]" = torch.ops.aten.index.Tensor(view_12, [add]); view_12 = add = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_13: "bf16[1, 12, 33, 64, 64]" = torch.ops.aten.reshape.default(index_2, [1, 12, 33, 64, 64]); index_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_14: "bf16[1, 12, 11, 192, 64]" = torch.ops.aten.reshape.default(view_13, [1, 12, 11, 192, -1]); view_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_2: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_4, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_15: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(select_2, [12, 64, 64]); select_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_16: "bf16[12, 832, 64]" = torch.ops.aten.reshape.default(arg14_1, [12, 832, 64]); arg14_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_2: "bf16[12, 64, 832]" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm: "bf16[12, 64, 832]" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_17: "bf16[1, 12, 64, 832]" = torch.ops.aten.reshape.default(bmm, [1, 12, 64, 832]); bmm = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_3: "bf16[1, 12, 64, 832]" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub: "f32[1, 1, 1, 832]" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_4: "f32[1, 1, 1, 832]" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_2: "f32[1, 12, 64, 832]" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] amax: "f32[1, 12, 64, 1]" = torch.ops.aten.amax.default(add_2, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_1: "f32[1, 12, 64, 832]" = torch.ops.aten.sub.Tensor(add_2, amax); add_2 = amax = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] exp: "f32[1, 12, 64, 832]" = torch.ops.aten.exp.default(sub_1); sub_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sum_1: "f32[1, 12, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] div_2: "f32[1, 12, 64, 832]" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_5: "bf16[1, 12, 64, 832]" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_18: "bf16[12, 64, 832]" = torch.ops.aten.reshape.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_19: "bf16[12, 832, 64]" = torch.ops.aten.reshape.default(arg15_1, [12, 832, 64]); arg15_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_1: "bf16[12, 64, 64]" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_20: "bf16[1, 12, 64, 64]" = torch.ops.aten.reshape.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_4: "bf16[1, 12, 1, 64, 64]" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_3: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_4: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, 1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_5: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_6: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_7: "bf16[1, 12, 192, 64]" = torch.ops.aten.select.int(view_10, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_1: "bf16[1, 12, 448, 64]" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2); select_4 = select_5 = select_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_8: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_9: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, 1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_10: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_11: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_12: "bf16[1, 12, 192, 64]" = torch.ops.aten.select.int(view_14, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_2: "bf16[1, 12, 448, 64]" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2); select_9 = select_10 = select_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_13: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_4, 2, 1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_21: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(select_13, [12, 64, 64]); select_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_22: "bf16[12, 448, 64]" = torch.ops.aten.reshape.default(cat_1, [-1, 448, 64]); cat_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_3: "bf16[12, 64, 448]" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_2: "bf16[12, 64, 448]" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_23: "bf16[1, 12, 64, 448]" = torch.ops.aten.reshape.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_30: "f32[1, 1, 1, 192]" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 192)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_34: "f32[1, 1, 1, 64]" = torch.ops.aten.slice.Tensor(arg16_1, 3, -64, 9223372036854775807)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] full_default: "f32[1, 1, 1, 192]" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_3: "f32[1, 1, 1, 448]" = torch.ops.aten.cat.default([slice_30, slice_34, full_default], 3); slice_30 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] full_default_1: "f32[1, 12, 64, 256]" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_14: "f32[1, 12, 64, 192]" = torch.ops.aten.select.int(mul, 2, 0)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_4: "f32[1, 12, 64, 448]" = torch.ops.aten.cat.default([full_default_1, select_14], 3); select_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_5: "bf16[1, 12, 64, 448]" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] minimum: "f32[1, 12, 64, 448]" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_2: "f32[1, 12, 64, 448]" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_6: "f32[1, 12, 64, 448]" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_3: "f32[1, 12, 64, 448]" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] amax_1: "f32[1, 12, 64, 1]" = torch.ops.aten.amax.default(add_3, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_3: "f32[1, 12, 64, 448]" = torch.ops.aten.sub.Tensor(add_3, amax_1); add_3 = amax_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] exp_1: "f32[1, 12, 64, 448]" = torch.ops.aten.exp.default(sub_3); sub_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sum_2: "f32[1, 12, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] div_3: "f32[1, 12, 64, 448]" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_12: "bf16[1, 12, 64, 448]" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_24: "bf16[12, 64, 448]" = torch.ops.aten.reshape.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_25: "bf16[12, 448, 64]" = torch.ops.aten.reshape.default(cat_2, [-1, 448, 64]); cat_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_3: "bf16[12, 64, 64]" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_26: "bf16[1, 12, 64, 64]" = torch.ops.aten.reshape.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_5: "bf16[1, 12, 1, 64, 64]" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_39: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_5, 2, 1, -3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_42: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_5, 2, 2, -2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_45: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_5, 2, 3, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_5: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_48: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_6, 2, 1, -3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_51: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_6, 2, 2, -2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_54: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_6, 2, 3, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_6: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_57: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(view_4, 2, 2, -2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] clone_4: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_27: "bf16[108, 64, 64]" = torch.ops.aten.reshape.default(clone_4, [108, 64, 64]); clone_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_28: "bf16[108, 192, 64]" = torch.ops.aten.reshape.default(cat_5, [-1, 192, 64]); cat_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_4: "bf16[108, 64, 192]" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_4: "bf16[108, 64, 192]" = torch.ops.aten.bmm.default(view_27, permute_4); permute_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_29: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.reshape.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_7: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_60: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.slice.Tensor(view_10, 2, 1, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] clone_6: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_31: "bf16[108, 192, 64]" = torch.ops.aten.reshape.default(clone_6, [108, 192, 64]); clone_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_5: "bf16[108, 64, 192]" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_5: "bf16[108, 64, 192]" = torch.ops.aten.bmm.default(view_27, permute_5); view_27 = permute_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_32: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.reshape.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_8: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_57, 5); slice_57 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_6: "bf16[1, 12, 9, 64, 1, 64]" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_7: "bf16[1, 12, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(select_3, 4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_7: "bf16[1, 12, 1, 1, 64, 64]" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_8: "bf16[12, 9, 64, 64, 1, 1]" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_33: "bf16[12, 576, 64]" = torch.ops.aten.reshape.default(permute_8, [12, 576, 64]); permute_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_9: "bf16[12, 64, 1, 64, 1, 1]" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_34: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(permute_9, [12, 64, 64]); permute_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_6: "bf16[12, 576, 64]" = torch.ops.aten.bmm.default(view_33, view_34); view_34 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_35: "bf16[12, 9, 64, 1, 1, 64]" = torch.ops.aten.reshape.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_10: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_36: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_9: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_10: "bf16[1, 12, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(select_6, 4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_12: "bf16[1, 12, 1, 1, 64, 64]" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_14: "bf16[12, 64, 1, 64, 1, 1]" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_38: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(permute_14, [12, 64, 64]); permute_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_7: "bf16[12, 576, 64]" = torch.ops.aten.bmm.default(view_33, view_38); view_33 = view_38 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_39: "bf16[12, 9, 64, 1, 1, 64]" = torch.ops.aten.reshape.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_15: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_40: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_10: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_4: "f32[1, 1, 9, 64, 192]" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_11: "f32[1, 1, 9, 64, 192]" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_4: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_23: "bf16[1, 12, 9, 64, 192]" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_68: "f32[1, 1, 1, 64]" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 64)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_12: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.unsqueeze.default(slice_68, 3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_5: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_12: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_5: "f32[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_24: "bf16[1, 12, 9, 64, 64]" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_13: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.unsqueeze.default(slice_34, 3); slice_34 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_6: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_13: "f32[1, 1, 1, 1, 64]" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_6: "f32[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_25: "bf16[1, 12, 9, 64, 64]" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_75: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.slice.Tensor(mul, 2, 1, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_7: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_14: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_7: "f32[1, 12, 9, 64, 192]" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_26: "bf16[1, 12, 9, 64, 192]" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_7: "bf16[1, 12, 9, 64, 512]" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_27: "f32[1, 12, 9, 64, 512]" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] amax_2: "f32[1, 12, 9, 64, 1]" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_8: "f32[1, 12, 9, 64, 512]" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] exp_2: "f32[1, 12, 9, 64, 512]" = torch.ops.aten.exp.default(sub_8); sub_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sum_3: "f32[1, 12, 9, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] div_4: "f32[1, 12, 9, 64, 512]" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_28: "bf16[1, 12, 9, 64, 512]" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_80: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 64, 256)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_41: "bf16[108, 64, 192]" = torch.ops.aten.reshape.default(slice_80, [108, 64, 192]); slice_80 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_42: "bf16[108, 192, 64]" = torch.ops.aten.reshape.default(cat_6, [-1, 192, 64]); cat_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_8: "bf16[108, 64, 64]" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_43: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_85: "bf16[1, 12, 9, 64, 192]" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 256, -64)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_88: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.slice.Tensor(view_14, 2, 1, -1)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_44: "bf16[108, 64, 192]" = torch.ops.aten.reshape.default(slice_85, [108, 64, 192]); slice_85 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] clone_7: "bf16[1, 12, 9, 192, 64]" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_45: "bf16[108, 192, 64]" = torch.ops.aten.reshape.default(clone_7, [108, 192, 64]); clone_7 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_9: "bf16[108, 64, 64]" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_46: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_8: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_47: "bf16[108, 64, 64]" = torch.ops.aten.reshape.default(add_8, [108, 64, 64]); add_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_48: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(view_47, [1, 12, 9, 64, 64]); view_47 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_93: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 0, 64)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_16: "bf16[1, 12, 9, 64, 1, 64]" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_15: "bf16[1, 12, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(select_8, 4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_17: "bf16[1, 12, 1, 1, 64, 64]" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_18: "bf16[12, 9, 64, 64, 1, 1]" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_49: "bf16[12, 576, 64]" = torch.ops.aten.reshape.default(permute_18, [12, 576, 64]); permute_18 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_19: "bf16[12, 64, 1, 64, 1, 1]" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_50: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(permute_19, [12, 64, 64]); permute_19 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_10: "bf16[12, 576, 64]" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_51: "bf16[12, 9, 64, 1, 1, 64]" = torch.ops.aten.reshape.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_20: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_52: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_9: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_53: "bf16[108, 64, 64]" = torch.ops.aten.reshape.default(add_9, [108, 64, 64]); add_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_54: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(view_53, [1, 12, 9, 64, 64]); view_53 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_100: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, -64, 9223372036854775807); convert_element_type_28 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_21: "bf16[1, 12, 9, 64, 1, 64]" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_18: "bf16[1, 12, 64, 64, 1]" = torch.ops.aten.unsqueeze.default(select_11, 4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1]" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_22: "bf16[1, 12, 1, 1, 64, 64]" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_23: "bf16[12, 9, 64, 64, 1, 1]" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_55: "bf16[12, 576, 64]" = torch.ops.aten.reshape.default(permute_23, [12, 576, 64]); permute_23 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_24: "bf16[12, 64, 1, 64, 1, 1]" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_56: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(permute_24, [12, 64, 64]); permute_24 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_11: "bf16[12, 576, 64]" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_57: "bf16[12, 9, 64, 1, 1, 64]" = torch.ops.aten.reshape.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_25: "bf16[1, 12, 9, 64, 64, 1]" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_58: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_10: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_59: "bf16[108, 64, 64]" = torch.ops.aten.reshape.default(add_10, [108, 64, 64]); add_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_60: "bf16[1, 12, 9, 64, 64]" = torch.ops.aten.reshape.default(view_59, [1, 12, 9, 64, 64]); view_59 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_20: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, -3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_21: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_5, 2, -2); view_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_23: "bf16[1, 12, 192, 64]" = torch.ops.aten.select.int(view_10, 2, -1); view_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_8: "bf16[1, 12, 448, 64]" = torch.ops.aten.cat.default([select_3, select_20, select_21, select_6, select_23], 2); select_3 = select_20 = select_21 = select_6 = select_23 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_25: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, -3)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_26: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_6, 2, -2); view_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_28: "bf16[1, 12, 192, 64]" = torch.ops.aten.select.int(view_14, 2, -1); view_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_9: "bf16[1, 12, 448, 64]" = torch.ops.aten.cat.default([select_8, select_25, select_26, select_11, select_28], 2); select_8 = select_25 = select_26 = select_11 = select_28 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_29: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_4, 2, -2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_61: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(select_29, [12, 64, 64]); select_29 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_62: "bf16[12, 448, 64]" = torch.ops.aten.reshape.default(cat_8, [-1, 448, 64]); cat_8 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_26: "bf16[12, 64, 448]" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_12: "bf16[12, 64, 448]" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_63: "bf16[1, 12, 64, 448]" = torch.ops.aten.reshape.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] slice_132: "f32[1, 1, 1, 192]" = torch.ops.aten.slice.Tensor(arg16_1, 3, -192, 9223372036854775807); arg16_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_10: "f32[1, 1, 1, 448]" = torch.ops.aten.cat.default([slice_68, slice_132, full_default], 3); slice_68 = slice_132 = full_default = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_30: "f32[1, 12, 64, 192]" = torch.ops.aten.select.int(mul, 2, -1); mul = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_11: "f32[1, 12, 64, 448]" = torch.ops.aten.cat.default([full_default_1, select_30], 3); full_default_1 = select_30 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_15: "bf16[1, 12, 64, 448]" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] minimum_1: "f32[1, 12, 64, 448]" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_9: "f32[1, 12, 64, 448]" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_16: "f32[1, 12, 64, 448]" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_11: "f32[1, 12, 64, 448]" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] amax_3: "f32[1, 12, 64, 1]" = torch.ops.aten.amax.default(add_11, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_10: "f32[1, 12, 64, 448]" = torch.ops.aten.sub.Tensor(add_11, amax_3); add_11 = amax_3 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] exp_3: "f32[1, 12, 64, 448]" = torch.ops.aten.exp.default(sub_10); sub_10 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sum_4: "f32[1, 12, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] div_5: "f32[1, 12, 64, 448]" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_41: "bf16[1, 12, 64, 448]" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_64: "bf16[12, 64, 448]" = torch.ops.aten.reshape.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_65: "bf16[12, 448, 64]" = torch.ops.aten.reshape.default(cat_9, [-1, 448, 64]); cat_9 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_13: "bf16[12, 64, 64]" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_66: "bf16[1, 12, 64, 64]" = torch.ops.aten.reshape.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_20: "bf16[1, 12, 1, 64, 64]" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] select_31: "bf16[1, 12, 64, 64]" = torch.ops.aten.select.int(view_4, 2, -1); view_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_67: "bf16[12, 64, 64]" = torch.ops.aten.reshape.default(select_31, [12, 64, 64]); select_31 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_14: "bf16[12, 64, 832]" = torch.ops.aten.bmm.default(view_67, permute_2); view_67 = permute_2 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_69: "bf16[1, 12, 64, 832]" = torch.ops.aten.reshape.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_17: "bf16[1, 12, 64, 832]" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] add_12: "f32[1, 12, 64, 832]" = torch.ops.aten.add.Tensor(mul_17, mul_4); mul_17 = mul_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] amax_4: "f32[1, 12, 64, 1]" = torch.ops.aten.amax.default(add_12, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sub_12: "f32[1, 12, 64, 832]" = torch.ops.aten.sub.Tensor(add_12, amax_4); add_12 = amax_4 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] exp_4: "f32[1, 12, 64, 832]" = torch.ops.aten.exp.default(sub_12); sub_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] sum_5: "f32[1, 12, 64, 1]" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] div_6: "f32[1, 12, 64, 832]" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] convert_element_type_48: "bf16[1, 12, 64, 832]" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_70: "bf16[12, 64, 832]" = torch.ops.aten.reshape.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] bmm_15: "bf16[12, 64, 64]" = torch.ops.aten.bmm.default(view_70, view_19); view_70 = view_19 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_72: "bf16[1, 12, 64, 64]" = torch.ops.aten.reshape.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] unsqueeze_21: "bf16[1, 12, 1, 64, 64]" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] cat_12: "bf16[1, 12, 13, 64, 64]" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] view_73: "bf16[1, 12, 832, 64]" = torch.ops.aten.reshape.default(cat_12, [1, 12, 832, -1]); cat_12 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] mul_19: "f32[1, 12, 832, 64]" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] permute_28: "f32[1, 832, 12, 64]" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0] return (permute_28, unsqueeze)
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
V0614 00:48:38.360000 140616046391680 torch/_inductor/freezing.py:118] [15/0]
I0614 00:48:38.377000 140616046391680 torch/_dynamo/logging.py:56] [15/0] Step 3: torchinductor compiling FORWARDS graph 8
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] TRACED GRAPH
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] ===== AFTER POST GRAD =====
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] /localdisk/leslie/torch_inductor_community/pytorch/torch/fx/_lazy_graph_module.py class <lambda>(torch.nn.Module):
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] def forward(self, arg0_1: "i32[11, 3][3, 1]cpu", arg1_1: "i32[11, 3][3, 1]cpu", arg2_1: "i32[11, 3][3, 1]cpu", arg3_1: "i32[11, 3][3, 1]cpu", arg4_1: "i32[11, 3][3, 1]cpu", arg5_1: "i32[11, 3][3, 1]cpu", arg6_1: "i32[11, 3][3, 1]cpu", arg7_1: "i32[11, 3][3, 1]cpu", arg8_1: "i32[11, 3][3, 1]cpu", arg9_1: "i32[11, 3][3, 1]cpu", arg10_1: "i32[11, 3][3, 1]cpu", arg11_1: "i32[11, 3][3, 1]cpu", arg12_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg13_1: "f32[1, 13, 64][832, 64, 1]cpu", arg14_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg15_1: "bf16[1, 12, 832, 64][638976, 64, 768, 1]cpu", arg16_1: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu", arg17_1: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu", arg18_1: "f32[1, 1, 832, 1][832, 832, 1, 1]cpu"):
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:602 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_4: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg12_1, [1, 12, 13, 64, -1]); arg12_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:621 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_2: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_15: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_2, [12, 64, 64]); select_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_16: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [12, 832, 64])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_2: "bf16[12, 64, 832][64, 1, 768]cpu" = torch.ops.aten.permute.default(view_16, [0, 2, 1]); view_16 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_15, permute_2); view_15 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm, [1, 12, 64, 832]); bmm = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:623 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product = first_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_3: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_17, 0.125); view_17 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:624 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_product += (1.0 - to_mask) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg16_1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_4: "f32[1, 1, 1, 832][832, 832, 832, 1]cpu" = torch.ops.aten.mul.Tensor(sub, -10000.0); sub = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_3, mul_4); mul_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:625 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_attn_weights = nn.functional.softmax(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] amax: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_2, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_1: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_2, amax); add_2 = amax = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] exp: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_1); sub_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sum_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] div_2: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp, sum_1); exp = sum_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_5: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_2, torch.bfloat16); div_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_18: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_5, [-1, 64, 832]); convert_element_type_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_19: "bf16[12, 832, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [12, 832, 64])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_1: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_18, view_19); view_18 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_20: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_1, [1, 12, 64, 64]); bmm_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:631 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_context_layer.unsqueeze_(2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_4: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_20, 2); view_20 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:661 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_13: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, 1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_21: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_13, [12, 64, 64]); select_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:603 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_5: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg14_1, [1, 12, 13, 64, -1]); arg14_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:641 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_3: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:642 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_4: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:643 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, 2],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_5: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:644 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_6: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] clone_2: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_5, memory_format = torch.contiguous_format)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_8: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_2, [156, 64, 64]); clone_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:593 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = np.stack(rand_attn, axis=0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat: "i32[132, 3][3, 1]cpu" = torch.ops.aten.cat.default([arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1, arg6_1, arg7_1, arg8_1, arg9_1, arg10_1, arg11_1]); arg0_1 = arg1_1 = arg2_1 = arg3_1 = arg4_1 = arg5_1 = arg6_1 = arg7_1 = arg8_1 = arg9_1 = arg10_1 = arg11_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view: "i32[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.reshape.default(cat, [12, 11, 3]); cat = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:594 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.prims.convert_element_type.default(view, torch.int64); view = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:595 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_attn.unsqueeze_(0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze: "i64[1, 12, 11, 3][396, 33, 3, 1]cpu" = torch.ops.aten.unsqueeze.default(convert_element_type, 0); convert_element_type = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_7: "i64[396][1]cpu" = torch.ops.aten.reshape.default(unsqueeze, [-1])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:972 in torch_gather_b2, code: shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] iota: "i64[396][1]cpu" = torch.ops.prims.iota.default(396, start = 0, step = 1, dtype = torch.int64, device = device(type='cpu'), requires_grad = False)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:973 in torch_gather_b2, code: indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] div: "i64[396][1]cpu" = torch.ops.aten.div.Tensor_mode(iota, 33, rounding_mode = 'floor'); iota = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_1: "i64[396][1]cpu" = torch.ops.aten.mul.Tensor(div, 13); div = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:975 in torch_gather_b2, code: flattened_indices = indices.view(-1) + indices_shift
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add: "i64[396][1]cpu" = torch.ops.aten.add.Tensor(view_7, mul_1); view_7 = mul_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] index_1: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_8, [add]); view_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_9: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_1, [1, 12, 33, 64, 64]); index_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:608 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key = gathered_key.view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_10: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_9, [1, 12, 11, 192, -1]); view_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:645 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_7: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:639 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_key_mat = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_1: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_4, select_5, select_6, select_7], 2); select_4 = select_5 = select_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_22: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_1, [-1, 448, 64]); cat_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_3: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_22, [0, 2, 1]); view_22 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_2: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_21, permute_3); view_21 = permute_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_23: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_2, [1, 12, 64, 448]); bmm_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:677 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product = second_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_5: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_23, 0.125); view_23 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:664 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, : 3 * to_block_size],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_30: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 192)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:665 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -to_block_size:],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_34: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -64, 9223372036854775807)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:666 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] full_default: "f32[1, 1, 1, 192][192, 192, 192, 1]cpu" = torch.ops.aten.full.default([1, 1, 1, 192], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:662 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_seq_pad = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_3: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_30, slice_34, full_default], 3); slice_30 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:672 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] full_default_1: "f32[1, 12, 64, 256][196608, 16384, 256, 1]cpu" = torch.ops.aten.full.default([1, 12, 64, 256], 1, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_2: "f32[1, 11, 64][832, 64, 1]cpu" = torch.ops.aten.slice.Tensor(arg13_1, 1, 1, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_1: "f32[1, 11, 64, 1][832, 64, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_2, 3); slice_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_2: "f32[1, 11, 64, 1, 1][832, 64, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_1, 4); unsqueeze_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute: "f32[1, 1, 11, 64, 1][832, 1, 64, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_2, [0, 3, 1, 2, 4]); unsqueeze_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select: "f32[13, 64][64, 1]cpu" = torch.ops.aten.select.int(arg13_1, 0, 0); arg13_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_1: "i64[12, 11, 3][33, 3, 1]cpu" = torch.ops.aten.select.int(unsqueeze, 0, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in <listcomp>, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_1: "i64[396][1]cpu" = torch.ops.aten.reshape.default(select_1, [396]); select_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] index: "f32[396, 64][64, 1]cpu" = torch.ops.aten.index.Tensor(select, [view_1]); select = view_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1015 in _create_rand_mask_from_inputs, code: rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_2: "f32[1, 396, 64][25344, 64, 1]cpu" = torch.ops.aten.reshape.default(index, [1, 396, 64]); index = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1016 in _create_rand_mask_from_inputs, code: rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_3: "f32[1, 12, 11, 192][25344, 2112, 192, 1]cpu" = torch.ops.aten.reshape.default(view_2, [1, 12, 11, 192]); view_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:1017 in _create_rand_mask_from_inputs, code: rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_3: "f32[1, 12, 11, 192, 1][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(view_3, 4); view_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_1: "f32[1, 12, 11, 1, 192][25344, 2112, 192, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_3, [0, 1, 2, 4, 3]); unsqueeze_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul: "f32[1, 12, 11, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(permute, permute_1); permute = permute_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:673 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_14: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:670 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_rand_pad = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_4: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_14], 3); select_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:678 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] minimum: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_3, cat_4); cat_3 = cat_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_2: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum); minimum = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_6: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_2, -10000.0); sub_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_5, mul_6); mul_5 = mul_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:679 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_attn_weights = nn.functional.softmax(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] amax_1: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_3, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_3, amax_1); add_3 = amax_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] exp_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_3); sub_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sum_2: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_1, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] div_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_1, sum_2); exp_1 = sum_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_12: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_3, torch.bfloat16); div_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_24: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_12, [-1, 64, 448]); convert_element_type_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:604 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_6: "bf16[1, 12, 13, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.reshape.default(arg15_1, [1, 12, 13, 64, -1]); arg15_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:651 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_8: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:652 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_9: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:653 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, 2],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_10: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:654 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_11: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:976 in torch_gather_b2, code: flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] clone_3: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(view_6, memory_format = torch.contiguous_format)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_12: "bf16[156, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_3, [156, 64, 64]); clone_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:978 in torch_gather_b2, code: out_flattened = flattened_params.index_select(0, flattened_indices)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] index_2: "bf16[396, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.index.Tensor(view_12, [add]); view_12 = add = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:980 in torch_gather_b2, code: out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_13: "bf16[1, 12, 33, 64, 64][1622016, 135168, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(index_2, [1, 12, 33, 64, 64]); index_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:612 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value = gathered_value.view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_14: "bf16[1, 12, 11, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.reshape.default(view_13, [1, 12, 11, 192, -1]); view_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:655 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, 0],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_12: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, 0)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:649 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_value_mat = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_2: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_9, select_10, select_11, select_12], 2); select_9 = select_10 = select_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_25: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_2, [-1, 448, 64]); cat_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_3: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_24, view_25); view_24 = view_25 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_26: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_3, [1, 12, 64, 64]); bmm_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:686 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_context_layer.unsqueeze_(2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_5: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_26, 2); view_26 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:702 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_57: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_4, 2, 2, -2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:717 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = torch.einsum(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_6: "bf16[1, 12, 9, 64, 64, 1][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_57, 5)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_6: "bf16[1, 12, 9, 64, 1, 64][638976, 64, 49152, 768, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_6, [0, 1, 2, 3, 5, 4]); unsqueeze_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_8: "bf16[12, 9, 64, 64, 1, 1][64, 49152, 768, 1, 638976, 1]cpu" = torch.ops.aten.permute.default(permute_6, [1, 2, 3, 5, 0, 4]); permute_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_33: "bf16[12, 576, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_8, [12, 576, 64]); permute_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_7: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_3, 4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_8: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_7, 5); unsqueeze_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_7: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_8, [0, 1, 4, 5, 2, 3]); unsqueeze_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_9: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_7, [1, 5, 0, 4, 2, 3]); permute_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_34: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_9, [12, 64, 64]); permute_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_6: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_34); view_34 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_35: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_6, [12, 9, 64, 1, 1, 64]); bmm_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_10: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_35, [4, 0, 1, 2, 5, 3]); view_35 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_36: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_10, [1, 12, 9, 64, 64]); permute_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:720 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product = first_band_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_36, 0.125); view_36 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:730 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_68: "f32[1, 1, 1, 64][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, 0, 64)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_12: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_68, 3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_5: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_12); unsqueeze_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_12: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_5, -10000.0); sub_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_5: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_9, mul_12); mul_9 = mul_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_24: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_5, torch.bfloat16); add_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] clone_4: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.clone.default(slice_57, memory_format = torch.contiguous_format); slice_57 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_27: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_4, [108, 64, 64]); clone_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:696 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_39: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 1, -3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_42: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 2, -2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_45: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_5, 2, 3, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:695 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_key_matrix = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_5: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_39, slice_42, slice_45], 3); slice_39 = slice_42 = slice_45 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_28: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_5, [-1, 192, 64]); cat_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_4: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_28, [0, 2, 1]); view_28 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_4: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_4); permute_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_29: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_4, [1, 12, 9, 64, 192]); bmm_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:708 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product = inner_band_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_7: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_29, 0.125); view_29 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:729 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: inner_band_product += (1.0 - band_mask) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_4: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, arg17_1); arg17_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_11: "f32[1, 1, 9, 64, 192][110592, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_4, -10000.0); sub_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_4: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_7, mul_11); mul_7 = mul_11 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_23: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_4, torch.bfloat16); add_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:712 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_60: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_10, 2, 1, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] clone_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_60, memory_format = torch.contiguous_format); slice_60 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_31: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_6, [108, 192, 64]); clone_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_5: "bf16[108, 64, 192][12288, 1, 64]cpu" = torch.ops.aten.permute.default(view_31, [0, 2, 1]); view_31 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_5: "bf16[108, 64, 192][12288, 192, 1]cpu" = torch.ops.aten.bmm.default(view_27, permute_5); view_27 = permute_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_32: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.reshape.default(bmm_5, [1, 12, 9, 64, 192]); bmm_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:714 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product = rand_band_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_8: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(view_32, 0.125); view_32 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:732 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_75: "f32[1, 12, 9, 64, 192][1622016, 135168, 12288, 192, 1]cpu" = torch.ops.aten.slice.Tensor(mul, 2, 1, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, slice_75); slice_75 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_14: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.mul.Tensor(sub_7, -10000.0); sub_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_7: "f32[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.aten.add.Tensor(mul_8, mul_14); mul_8 = mul_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_26: "bf16[1, 12, 9, 64, 192][1327104, 110592, 12288, 192, 1]cpu" = torch.ops.prims.convert_element_type.default(add_7, torch.bfloat16); add_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:723 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = torch.einsum(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_10: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_6, 4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_11: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_10, 5); unsqueeze_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_12: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 768, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_11, [0, 1, 4, 5, 2, 3]); unsqueeze_11 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_14: "bf16[12, 64, 1, 64, 1, 1][64, 1, 638976, 768, 1, 1]cpu" = torch.ops.aten.permute.default(permute_12, [1, 5, 0, 4, 2, 3]); permute_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_38: "bf16[12, 64, 64][64, 1, 768]cpu" = torch.ops.aten.reshape.default(permute_14, [12, 64, 64]); permute_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_7: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_33, view_38); view_33 = view_38 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_39: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_7, [12, 9, 64, 1, 1, 64]); bmm_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_15: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_39, [4, 0, 1, 2, 5, 3]); view_39 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_40: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_15, [1, 12, 9, 64, 64]); permute_15 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:726 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product = last_band_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_40, 0.125); view_40 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:731 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_13: "f32[1, 1, 1, 1, 64][832, 832, 832, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_34, 3); slice_34 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_6: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, unsqueeze_13); unsqueeze_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_13: "f32[1, 1, 1, 1, 64][64, 64, 64, 64, 1]cpu" = torch.ops.aten.mul.Tensor(sub_6, -10000.0); sub_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_6: "f32[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(mul_10, mul_13); mul_10 = mul_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_25: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.prims.convert_element_type.default(add_6, torch.bfloat16); add_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:735 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: band_product = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_7: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.cat.default([convert_element_type_24, convert_element_type_23, convert_element_type_26, convert_element_type_25], -1); convert_element_type_24 = convert_element_type_23 = convert_element_type_26 = convert_element_type_25 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:740 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights = nn.functional.softmax(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_27: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(cat_7, torch.float32); cat_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] amax_2: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.amax.default(convert_element_type_27, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_8: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.sub.Tensor(convert_element_type_27, amax_2); convert_element_type_27 = amax_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] exp_2: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.exp.default(sub_8); sub_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sum_3: "f32[1, 12, 9, 64, 1][6912, 576, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_2, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] div_4: "f32[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.div.Tensor(exp_2, sum_3); exp_2 = sum_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_28: "bf16[1, 12, 9, 64, 512][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.prims.convert_element_type.default(div_4, torch.bfloat16); div_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:747 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_80: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 64, 256)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_41: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_80, [108, 64, 192]); slice_80 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:699 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_48: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 1, -3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_51: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 2, -2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_54: "bf16[1, 12, 9, 64, 64][638976, 64, 49152, 768, 1]cpu" = torch.ops.aten.slice.Tensor(view_6, 2, 3, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:698 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: exp_blocked_value_matrix = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_6: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.cat.default([slice_48, slice_51, slice_54], 3); slice_48 = slice_51 = slice_54 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_42: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_6, [-1, 192, 64]); cat_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_8: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_41, view_42); view_41 = view_42 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_43: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_8, [1, 12, 9, 64, 64]); bmm_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_85: "bf16[1, 12, 9, 64, 192][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 256, -64)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_44: "bf16[108, 64, 192][32768, 512, 1]cpu" = torch.ops.aten.reshape.default(slice_85, [108, 64, 192]); slice_85 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:754 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_88: "bf16[1, 12, 9, 192, 64][1622016, 135168, 12288, 64, 1]cpu" = torch.ops.aten.slice.Tensor(view_14, 2, 1, -1)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] clone_7: "bf16[1, 12, 9, 192, 64][1327104, 110592, 12288, 64, 1]cpu" = torch.ops.aten.clone.default(slice_88, memory_format = torch.contiguous_format); slice_88 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_45: "bf16[108, 192, 64][12288, 64, 1]cpu" = torch.ops.aten.reshape.default(clone_7, [108, 192, 64]); clone_7 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_9: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_44, view_45); view_44 = view_45 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_46: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_9, [1, 12, 9, 64, 64]); bmm_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:753 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += self.torch_bmm_nd(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_8: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_43, view_46); view_43 = view_46 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_47: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_8, [108, 64, 64]); add_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_48: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_47, [1, 12, 9, 64, 64]); view_47 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:760 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_93: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, 0, 64)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:759 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_14: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_93, 5); slice_93 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_16: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_14, [0, 1, 2, 3, 5, 4]); unsqueeze_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_18: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_16, [1, 2, 3, 5, 0, 4]); permute_16 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_49: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_18, [12, 576, 64]); permute_18 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_15: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_8, 4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_16: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_15, 5); unsqueeze_15 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_17: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_16, [0, 1, 4, 5, 3, 2]); unsqueeze_16 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_19: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_17, [1, 5, 0, 4, 2, 3]); permute_17 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_50: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_19, [12, 64, 64]); permute_19 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_10: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_49, view_50); view_49 = view_50 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_51: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_10, [12, 9, 64, 1, 1, 64]); bmm_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_20: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_51, [4, 0, 1, 2, 5, 3]); view_51 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_52: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_20, [1, 12, 9, 64, 64]); permute_20 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_9: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_48, view_52); view_48 = view_52 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_53: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_9, [108, 64, 64]); add_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_54: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_53, [1, 12, 9, 64, 64]); view_53 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:763 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_100: "bf16[1, 12, 9, 64, 64][3538944, 294912, 32768, 512, 1]cpu" = torch.ops.aten.slice.Tensor(convert_element_type_28, 4, -64, 9223372036854775807); convert_element_type_28 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:762 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer += torch.einsum(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_17: "bf16[1, 12, 9, 64, 64, 1][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(slice_100, 5); slice_100 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_21: "bf16[1, 12, 9, 64, 1, 64][3538944, 294912, 32768, 512, 1, 1]cpu" = torch.ops.aten.permute.default(unsqueeze_17, [0, 1, 2, 3, 5, 4]); unsqueeze_17 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_23: "bf16[12, 9, 64, 64, 1, 1][294912, 32768, 512, 1, 3538944, 1]cpu" = torch.ops.aten.permute.default(permute_21, [1, 2, 3, 5, 0, 4]); permute_21 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_55: "bf16[12, 576, 64][294912, 512, 1]cpu" = torch.ops.aten.reshape.default(permute_23, [12, 576, 64]); permute_23 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_18: "bf16[1, 12, 64, 64, 1][638976, 64, 768, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(select_11, 4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_19: "bf16[1, 12, 64, 64, 1, 1][638976, 64, 768, 1, 1, 1]cpu" = torch.ops.aten.unsqueeze.default(unsqueeze_18, 5); unsqueeze_18 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_22: "bf16[1, 12, 1, 1, 64, 64][638976, 64, 1, 1, 1, 768]cpu" = torch.ops.aten.permute.default(unsqueeze_19, [0, 1, 4, 5, 3, 2]); unsqueeze_19 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_24: "bf16[12, 64, 1, 64, 1, 1][64, 768, 638976, 1, 1, 1]cpu" = torch.ops.aten.permute.default(permute_22, [1, 5, 0, 4, 2, 3]); permute_22 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_56: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(permute_24, [12, 64, 64]); permute_24 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_11: "bf16[12, 576, 64][36864, 64, 1]cpu" = torch.ops.aten.bmm.default(view_55, view_56); view_55 = view_56 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_57: "bf16[12, 9, 64, 1, 1, 64][36864, 4096, 64, 64, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_11, [12, 9, 64, 1, 1, 64]); bmm_11 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_25: "bf16[1, 12, 9, 64, 64, 1][64, 36864, 4096, 64, 1, 64]cpu" = torch.ops.aten.permute.default(view_57, [4, 0, 1, 2, 5, 3]); view_57 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_58: "bf16[1, 12, 9, 64, 64][64, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(permute_25, [1, 12, 9, 64, 64]); permute_25 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_10: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.add.Tensor(view_54, view_58); view_54 = view_58 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_59: "bf16[108, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.reshape.default(add_10, [108, 64, 64]); add_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_60: "bf16[1, 12, 9, 64, 64][442368, 36864, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(view_59, [1, 12, 9, 64, 64]); view_59 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:795 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_29: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_61: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_29, [12, 64, 64]); select_29 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:776 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -3],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_20: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:777 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_key_matrix[:, :, -2],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_21: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_5, 2, -2); view_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:779 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_key[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_23: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_10, 2, -1); view_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:773 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_key_mat = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_8: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_3, select_20, select_21, select_6, select_23], 2); select_3 = select_20 = select_21 = select_6 = select_23 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_62: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_8, [-1, 448, 64]); cat_8 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_26: "bf16[12, 64, 448][28672, 1, 64]cpu" = torch.ops.aten.permute.default(view_62, [0, 2, 1]); view_62 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_12: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.bmm.default(view_61, permute_26); view_61 = permute_26 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_63: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.reshape.default(bmm_12, [1, 12, 64, 448]); bmm_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:811 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product = second_last_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_15: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(view_63, 0.125); view_63 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:799 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: to_mask[:, :, :, -3 * to_block_size :],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] slice_132: "f32[1, 1, 1, 192][832, 832, 832, 1]cpu" = torch.ops.aten.slice.Tensor(arg16_1, 3, -192, 9223372036854775807); arg16_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:796 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_seq_pad = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_10: "f32[1, 1, 1, 448][448, 448, 448, 1]cpu" = torch.ops.aten.cat.default([slice_68, slice_132, full_default], 3); slice_68 = slice_132 = full_default = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:807 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: rand_mask[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_30: "f32[1, 12, 64, 192][1622016, 135168, 192, 1]cpu" = torch.ops.aten.select.int(mul, 2, -1); mul = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:804 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_rand_pad = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.cat.default([full_default_1, select_30], 3); full_default_1 = select_30 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:812 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] minimum_1: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.minimum.default(cat_10, cat_11); cat_10 = cat_11 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_9: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(1.0, minimum_1); minimum_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_16: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.mul.Tensor(sub_9, -10000.0); sub_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_11: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.add.Tensor(mul_15, mul_16); mul_15 = mul_16 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:813 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_attn_weights = nn.functional.softmax(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] amax_3: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_11, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_10: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.sub.Tensor(add_11, amax_3); add_11 = amax_3 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] exp_3: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.exp.default(sub_10); sub_10 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sum_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_3, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] div_5: "f32[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.aten.div.Tensor(exp_3, sum_4); exp_3 = sum_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_41: "bf16[1, 12, 64, 448][344064, 28672, 448, 1]cpu" = torch.ops.prims.convert_element_type.default(div_5, torch.bfloat16); div_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_64: "bf16[12, 64, 448][28672, 448, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_41, [-1, 64, 448]); convert_element_type_41 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:786 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -3],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_25: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -3)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:787 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: blocked_value_matrix[:, :, -2],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_26: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_6, 2, -2); view_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:789 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: gathered_value[:, :, -1],
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_28: "bf16[1, 12, 192, 64][1622016, 135168, 64, 1]cpu" = torch.ops.aten.select.int(view_14, 2, -1); view_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:783 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_value_mat = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_9: "bf16[1, 12, 448, 64][344064, 28672, 64, 1]cpu" = torch.ops.aten.cat.default([select_8, select_25, select_26, select_11, select_28], 2); select_8 = select_25 = select_26 = select_11 = select_28 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_65: "bf16[12, 448, 64][28672, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_9, [-1, 448, 64]); cat_9 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_13: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_64, view_65); view_64 = view_65 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_66: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_13, [1, 12, 64, 64]); bmm_13 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:819 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: second_last_context_layer.unsqueeze_(2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_20: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_66, 2); view_66 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:826 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] select_31: "bf16[1, 12, 64, 64][638976, 64, 768, 1]cpu" = torch.ops.aten.select.int(view_4, 2, -1); view_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:513 in torch_bmm_nd_transpose, code: inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_67: "bf16[12, 64, 64][64, 768, 1]cpu" = torch.ops.aten.reshape.default(select_31, [12, 64, 64]); select_31 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:512 in torch_bmm_nd_transpose, code: return torch.bmm(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_14: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.bmm.default(view_67, permute_2); view_67 = permute_2 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:514 in torch_bmm_nd_transpose, code: ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_69: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.reshape.default(bmm_14, [1, 12, 64, 832]); bmm_14 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:827 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product = last_product * rsqrt_d
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_17: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.mul.Tensor(view_69, 0.125); view_69 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:828 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_product += (1.0 - to_mask) * attn_mask_penalty
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] add_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.add.Tensor(mul_17, mul_4); mul_17 = mul_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:829 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] amax_4: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.amax.default(add_12, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sub_12: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.sub.Tensor(add_12, amax_4); add_12 = amax_4 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] exp_4: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.exp.default(sub_12); sub_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] sum_5: "f32[1, 12, 64, 1][768, 64, 1, 1]cpu" = torch.ops.aten.sum.dim_IntList(exp_4, [-1], True)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] div_6: "f32[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.aten.div.Tensor(exp_4, sum_5); exp_4 = sum_5 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] convert_element_type_48: "bf16[1, 12, 64, 832][638976, 53248, 832, 1]cpu" = torch.ops.prims.convert_element_type.default(div_6, torch.bfloat16); div_6 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:504 in torch_bmm_nd, code: return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_70: "bf16[12, 64, 832][53248, 832, 1]cpu" = torch.ops.aten.reshape.default(convert_element_type_48, [-1, 64, 832]); convert_element_type_48 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] bmm_15: "bf16[12, 64, 64][4096, 64, 1]cpu" = torch.ops.aten.bmm.default(view_70, view_19); view_70 = view_19 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_72: "bf16[1, 12, 64, 64][49152, 4096, 64, 1]cpu" = torch.ops.aten.reshape.default(bmm_15, [1, 12, 64, 64]); bmm_15 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:833 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: last_context_layer.unsqueeze_(2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] unsqueeze_21: "bf16[1, 12, 1, 64, 64][49152, 4096, 4096, 64, 1]cpu" = torch.ops.aten.unsqueeze.default(view_72, 2); view_72 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:836 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.cat(
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] cat_12: "bf16[1, 12, 13, 64, 64][638976, 53248, 4096, 64, 1]cpu" = torch.ops.aten.cat.default([unsqueeze_4, unsqueeze_5, view_60, unsqueeze_20, unsqueeze_21], 2); unsqueeze_4 = unsqueeze_5 = view_60 = unsqueeze_20 = unsqueeze_21 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:840 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] view_73: "bf16[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.reshape.default(cat_12, [1, 12, 832, -1]); cat_12 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] mul_19: "f32[1, 12, 832, 64][638976, 53248, 64, 1]cpu" = torch.ops.aten.mul.Tensor(view_73, arg18_1); view_73 = arg18_1 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] # File: /localdisk/leslie/miniconda/envs/pytorch_community/lib/python3.10/site-packages/transformers/models/big_bird/modeling_big_bird.py:841 in torch_dynamo_resume_in_bigbird_block_sparse_attention_at_583, code: context_layer = torch.transpose(context_layer, 1, 2)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] permute_28: "f32[1, 832, 12, 64][638976, 64, 53248, 1]cpu" = torch.ops.aten.permute.default(mul_19, [0, 2, 1, 3]); mul_19 = None
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs] return (permute_28, unsqueeze)
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.458000 140616046391680 torch/_inductor/compile_fx.py:748] [15/0] [__post_grad_graphs]
V0614 00:48:38.471000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
V0614 00:48:38.471000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
V0614 00:48:38.471000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
V0614 00:48:38.472000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg3_1 : [num_users=1] = placeholder[target=arg3_1]
V0614 00:48:38.472000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg4_1 : [num_users=1] = placeholder[target=arg4_1]
V0614 00:48:38.472000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg5_1 : [num_users=1] = placeholder[target=arg5_1]
V0614 00:48:38.473000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg6_1 : [num_users=1] = placeholder[target=arg6_1]
V0614 00:48:38.473000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg7_1 : [num_users=1] = placeholder[target=arg7_1]
V0614 00:48:38.473000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg8_1 : [num_users=1] = placeholder[target=arg8_1]
V0614 00:48:38.473000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg9_1 : [num_users=1] = placeholder[target=arg9_1]
V0614 00:48:38.474000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg10_1 : [num_users=1] = placeholder[target=arg10_1]
V0614 00:48:38.474000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg11_1 : [num_users=1] = placeholder[target=arg11_1]
V0614 00:48:38.474000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg12_1 : [num_users=1] = placeholder[target=arg12_1]
V0614 00:48:38.474000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg13_1 : [num_users=2] = placeholder[target=arg13_1]
V0614 00:48:38.475000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg14_1 : [num_users=2] = placeholder[target=arg14_1]
V0614 00:48:38.475000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg15_1 : [num_users=2] = placeholder[target=arg15_1]
V0614 00:48:38.475000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg16_1 : [num_users=5] = placeholder[target=arg16_1]
V0614 00:48:38.476000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg17_1 : [num_users=1] = placeholder[target=arg17_1]
V0614 00:48:38.476000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %arg18_1 : [num_users=1] = placeholder[target=arg18_1]
V0614 00:48:38.476000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_4 : [num_users=5] = call_function[target=torch.ops.aten.reshape.default](args = (%arg12_1, [1, 12, 13, 64, -1]), kwargs = {})
V0614 00:48:38.476000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.477000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_2 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_4, 2, 0), kwargs = {})
V0614 00:48:38.478000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.479000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_15 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_2, [12, 64, 64]), kwargs = {})
V0614 00:48:38.479000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.479000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_16 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%arg14_1, [12, 832, 64]), kwargs = {})
V0614 00:48:38.479000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.480000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_2 : [num_users=2] = call_function[target=torch.ops.aten.permute.default](args = (%view_16, [0, 2, 1]), kwargs = {})
V0614 00:48:38.480000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.480000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_15, %permute_2), kwargs = {})
V0614 00:48:38.480000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.482000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.483000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_17 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm, [1, 12, 64, 832]), kwargs = {})
V0614 00:48:38.483000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.484000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_17, 0.125), kwargs = {})
V0614 00:48:38.484000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.485000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %arg16_1), kwargs = {})
V0614 00:48:38.485000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.486000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_4 : [num_users=2] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub, -10000.0), kwargs = {})
V0614 00:48:38.486000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.487000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_2 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_3, %mul_4), kwargs = {})
V0614 00:48:38.487000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.488000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%add_2, [-1], True), kwargs = {})
V0614 00:48:38.489000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.489000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_1 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_2, %amax), kwargs = {})
V0614 00:48:38.489000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.492000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_1,), kwargs = {})
V0614 00:48:38.492000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.494000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp, [-1], True), kwargs = {})
V0614 00:48:38.494000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.495000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_2 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp, %sum_1), kwargs = {})
V0614 00:48:38.495000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function div at 0x7fe2165656c0>
V0614 00:48:38.497000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_5 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_2, torch.bfloat16), kwargs = {})
V0614 00:48:38.497000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.498000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_18 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_5, [-1, 64, 832]), kwargs = {})
V0614 00:48:38.498000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.498000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_19 : [num_users=2] = call_function[target=torch.ops.aten.reshape.default](args = (%arg15_1, [12, 832, 64]), kwargs = {})
V0614 00:48:38.499000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.499000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_1 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_18, %view_19), kwargs = {})
V0614 00:48:38.499000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.500000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.501000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_20 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_1, [1, 12, 64, 64]), kwargs = {})
V0614 00:48:38.501000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.501000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_4 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_20, 2), kwargs = {})
V0614 00:48:38.501000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.502000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_13 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_4, 2, 1), kwargs = {})
V0614 00:48:38.502000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.503000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_21 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_13, [12, 64, 64]), kwargs = {})
V0614 00:48:38.503000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.503000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_5 : [num_users=10] = call_function[target=torch.ops.aten.reshape.default](args = (%arg14_1, [1, 12, 13, 64, -1]), kwargs = {})
V0614 00:48:38.503000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.504000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_3 : [num_users=3] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, 0), kwargs = {})
V0614 00:48:38.504000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.505000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_4 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, 1), kwargs = {})
V0614 00:48:38.505000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.506000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_5 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, 2), kwargs = {})
V0614 00:48:38.506000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.507000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_6 : [num_users=3] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, -1), kwargs = {})
V0614 00:48:38.507000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.508000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_2 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%view_5,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.508000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function clone at 0x7fe2179aac20>
V0614 00:48:38.510000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_8 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_2, [156, 64, 64]), kwargs = {})
V0614 00:48:38.510000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.511000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%arg0_1, %arg1_1, %arg2_1, %arg3_1, %arg4_1, %arg5_1, %arg6_1, %arg7_1, %arg8_1, %arg9_1, %arg10_1, %arg11_1],), kwargs = {})
V0614 00:48:38.511000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.514000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat, [12, 11, 3]), kwargs = {})
V0614 00:48:38.514000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.515000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%view, torch.int64), kwargs = {})
V0614 00:48:38.515000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.516000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze : [num_users=3] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%convert_element_type, 0), kwargs = {})
V0614 00:48:38.516000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.517000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_7 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%unsqueeze, [-1]), kwargs = {})
V0614 00:48:38.518000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.518000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %iota : [num_users=1] = call_function[target=torch.ops.prims.iota.default](args = (396,), kwargs = {start: 0, step: 1, dtype: torch.int64, device: cpu, requires_grad: False})
V0614 00:48:38.518000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function iota at 0x7fe2179aae60>
V0614 00:48:38.518000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor_mode](args = (%iota, 33), kwargs = {rounding_mode: floor})
V0614 00:48:38.519000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function div_mode at 0x7fe2165652d0>
V0614 00:48:38.519000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%div, 13), kwargs = {})
V0614 00:48:38.519000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.520000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_7, %mul_1), kwargs = {})
V0614 00:48:38.520000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.521000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %index_1 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_8, [%add]), kwargs = {})
V0614 00:48:38.521000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function index at 0x7fe21799ca60>
V0614 00:48:38.523000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_9 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_1, [1, 12, 33, 64, 64]), kwargs = {})
V0614 00:48:38.524000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.525000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_10 : [num_users=3] = call_function[target=torch.ops.aten.reshape.default](args = (%view_9, [1, 12, 11, 192, -1]), kwargs = {})
V0614 00:48:38.525000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.534000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_7 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_10, 2, 0), kwargs = {})
V0614 00:48:38.534000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.540000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_1 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%select_3, %select_4, %select_5, %select_6, %select_7], 2), kwargs = {})
V0614 00:48:38.540000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.541000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_22 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_1, [-1, 448, 64]), kwargs = {})
V0614 00:48:38.542000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.542000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_3 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_22, [0, 2, 1]), kwargs = {})
V0614 00:48:38.542000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.543000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_2 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_21, %permute_3), kwargs = {})
V0614 00:48:38.543000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.544000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.544000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_23 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_2, [1, 12, 64, 448]), kwargs = {})
V0614 00:48:38.544000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.545000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_23, 0.125), kwargs = {})
V0614 00:48:38.545000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.546000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_30 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg16_1, 3, 0, 192), kwargs = {})
V0614 00:48:38.546000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.547000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_34 : [num_users=2] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg16_1, 3, -64, 9223372036854775807), kwargs = {})
V0614 00:48:38.547000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.548000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %full_default : [num_users=2] = call_function[target=torch.ops.aten.full.default](args = ([1, 1, 1, 192], 1), kwargs = {dtype: torch.float32, layout: torch.strided, device: cpu, pin_memory: False})
V0614 00:48:38.548000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function full at 0x7fe21799c550>
V0614 00:48:38.548000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_3 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_30, %slice_34, %full_default], 3), kwargs = {})
V0614 00:48:38.548000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.549000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %full_default_1 : [num_users=2] = call_function[target=torch.ops.aten.full.default](args = ([1, 12, 64, 256], 1), kwargs = {dtype: torch.float32, layout: torch.strided, device: cpu, pin_memory: False})
V0614 00:48:38.549000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function full at 0x7fe21799c550>
V0614 00:48:38.550000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_2 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg13_1, 1, 1, -1), kwargs = {})
V0614 00:48:38.550000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.551000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_1 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_2, 3), kwargs = {})
V0614 00:48:38.551000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.551000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_2 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_1, 4), kwargs = {})
V0614 00:48:38.551000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.552000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_2, [0, 3, 1, 2, 4]), kwargs = {})
V0614 00:48:38.552000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.552000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%arg13_1, 0, 0), kwargs = {})
V0614 00:48:38.552000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.553000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_1 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%unsqueeze, 0, 0), kwargs = {})
V0614 00:48:38.553000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.554000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_1 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_1, [396]), kwargs = {})
V0614 00:48:38.554000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.554000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %index : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%select, [%view_1]), kwargs = {})
V0614 00:48:38.554000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function index at 0x7fe21799ca60>
V0614 00:48:38.555000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_2 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index, [1, 396, 64]), kwargs = {})
V0614 00:48:38.555000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.556000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_3 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%view_2, [1, 12, 11, 192]), kwargs = {})
V0614 00:48:38.556000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.559000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_3 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_3, 4), kwargs = {})
V0614 00:48:38.559000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.559000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_3, [0, 1, 2, 4, 3]), kwargs = {})
V0614 00:48:38.560000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.560000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul : [num_users=3] = call_function[target=torch.ops.aten.mul.Tensor](args = (%permute, %permute_1), kwargs = {})
V0614 00:48:38.560000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.566000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_14 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%mul, 2, 0), kwargs = {})
V0614 00:48:38.566000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.568000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_4 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%full_default_1, %select_14], 3), kwargs = {})
V0614 00:48:38.568000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.569000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %minimum : [num_users=1] = call_function[target=torch.ops.aten.minimum.default](args = (%cat_3, %cat_4), kwargs = {})
V0614 00:48:38.570000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216577130>
V0614 00:48:38.571000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_2 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %minimum), kwargs = {})
V0614 00:48:38.571000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.572000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_6 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_2, -10000.0), kwargs = {})
V0614 00:48:38.572000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.573000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_3 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_5, %mul_6), kwargs = {})
V0614 00:48:38.573000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.575000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax_1 : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%add_3, [-1], True), kwargs = {})
V0614 00:48:38.575000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.576000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_3 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_3, %amax_1), kwargs = {})
V0614 00:48:38.576000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.578000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp_1 : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_3,), kwargs = {})
V0614 00:48:38.579000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.581000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp_1, [-1], True), kwargs = {})
V0614 00:48:38.581000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.582000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_3 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp_1, %sum_2), kwargs = {})
V0614 00:48:38.582000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function div at 0x7fe2165656c0>
V0614 00:48:38.584000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_12 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_3, torch.bfloat16), kwargs = {})
V0614 00:48:38.584000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.585000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_24 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_12, [-1, 64, 448]), kwargs = {})
V0614 00:48:38.585000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.585000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_6 : [num_users=10] = call_function[target=torch.ops.aten.reshape.default](args = (%arg15_1, [1, 12, 13, 64, -1]), kwargs = {})
V0614 00:48:38.585000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.586000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_8 : [num_users=3] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, 0), kwargs = {})
V0614 00:48:38.586000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.587000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_9 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, 1), kwargs = {})
V0614 00:48:38.587000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.588000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_10 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, 2), kwargs = {})
V0614 00:48:38.588000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.589000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_11 : [num_users=3] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, -1), kwargs = {})
V0614 00:48:38.589000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.590000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_3 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%view_6,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.590000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function clone at 0x7fe2179aac20>
V0614 00:48:38.591000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_12 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_3, [156, 64, 64]), kwargs = {})
V0614 00:48:38.591000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.591000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %index_2 : [num_users=1] = call_function[target=torch.ops.aten.index.Tensor](args = (%view_12, [%add]), kwargs = {})
V0614 00:48:38.592000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function index at 0x7fe21799ca60>
V0614 00:48:38.593000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_13 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%index_2, [1, 12, 33, 64, 64]), kwargs = {})
V0614 00:48:38.593000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.594000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_14 : [num_users=3] = call_function[target=torch.ops.aten.reshape.default](args = (%view_13, [1, 12, 11, 192, -1]), kwargs = {})
V0614 00:48:38.594000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.595000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_12 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_14, 2, 0), kwargs = {})
V0614 00:48:38.595000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.597000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_2 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%select_8, %select_9, %select_10, %select_11, %select_12], 2), kwargs = {})
V0614 00:48:38.597000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.598000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_25 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_2, [-1, 448, 64]), kwargs = {})
V0614 00:48:38.598000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.599000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_3 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_24, %view_25), kwargs = {})
V0614 00:48:38.599000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.600000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.600000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_26 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_3, [1, 12, 64, 64]), kwargs = {})
V0614 00:48:38.600000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.600000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_5 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_26, 2), kwargs = {})
V0614 00:48:38.600000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.601000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_57 : [num_users=2] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_4, 2, 2, -2), kwargs = {})
V0614 00:48:38.601000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.602000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_6 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_57, 5), kwargs = {})
V0614 00:48:38.602000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_6 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_6, [0, 1, 2, 3, 5, 4]), kwargs = {})
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_8 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_6, [1, 2, 3, 5, 0, 4]), kwargs = {})
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.603000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_33 : [num_users=2] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_8, [12, 576, 64]), kwargs = {})
V0614 00:48:38.604000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.605000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_7 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%select_3, 4), kwargs = {})
V0614 00:48:38.605000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.606000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_8 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_7, 5), kwargs = {})
V0614 00:48:38.606000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.606000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_7 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_8, [0, 1, 4, 5, 2, 3]), kwargs = {})
V0614 00:48:38.606000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.607000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_9 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_7, [1, 5, 0, 4, 2, 3]), kwargs = {})
V0614 00:48:38.607000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.607000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_34 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_9, [12, 64, 64]), kwargs = {})
V0614 00:48:38.607000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.608000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_6 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_33, %view_34), kwargs = {})
V0614 00:48:38.608000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.620000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.620000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_35 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_6, [12, 9, 64, 1, 1, 64]), kwargs = {})
V0614 00:48:38.620000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.621000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_10 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_35, [4, 0, 1, 2, 5, 3]), kwargs = {})
V0614 00:48:38.621000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.621000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_36 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_10, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.621000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.622000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_9 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_36, 0.125), kwargs = {})
V0614 00:48:38.622000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.623000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_68 : [num_users=2] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg16_1, 3, 0, 64), kwargs = {})
V0614 00:48:38.623000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.624000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_12 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_68, 3), kwargs = {})
V0614 00:48:38.624000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.624000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_5 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %unsqueeze_12), kwargs = {})
V0614 00:48:38.624000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.625000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_12 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_5, -10000.0), kwargs = {})
V0614 00:48:38.625000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.626000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_5 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_9, %mul_12), kwargs = {})
V0614 00:48:38.626000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.627000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_24 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_5, torch.bfloat16), kwargs = {})
V0614 00:48:38.627000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.628000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_4 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%slice_57,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.628000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function clone at 0x7fe2179aac20>
V0614 00:48:38.630000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_27 : [num_users=2] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_4, [108, 64, 64]), kwargs = {})
V0614 00:48:38.630000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.631000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_39 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_5, 2, 1, -3), kwargs = {})
V0614 00:48:38.631000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.632000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_42 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_5, 2, 2, -2), kwargs = {})
V0614 00:48:38.632000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.633000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_45 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_5, 2, 3, -1), kwargs = {})
V0614 00:48:38.633000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.634000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_5 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_39, %slice_42, %slice_45], 3), kwargs = {})
V0614 00:48:38.634000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.635000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_28 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_5, [-1, 192, 64]), kwargs = {})
V0614 00:48:38.635000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.636000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_4 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_28, [0, 2, 1]), kwargs = {})
V0614 00:48:38.636000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.636000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_4 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_27, %permute_4), kwargs = {})
V0614 00:48:38.636000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.646000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.646000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_29 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_4, [1, 12, 9, 64, 192]), kwargs = {})
V0614 00:48:38.646000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.647000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_7 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_29, 0.125), kwargs = {})
V0614 00:48:38.647000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.648000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_4 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %arg17_1), kwargs = {})
V0614 00:48:38.648000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.649000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_11 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_4, -10000.0), kwargs = {})
V0614 00:48:38.649000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.650000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_4 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_7, %mul_11), kwargs = {})
V0614 00:48:38.650000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.652000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_23 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_4, torch.bfloat16), kwargs = {})
V0614 00:48:38.652000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.654000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_60 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_10, 2, 1, -1), kwargs = {})
V0614 00:48:38.654000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.660000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_6 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%slice_60,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.660000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function clone at 0x7fe2179aac20>
V0614 00:48:38.669000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_31 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_6, [108, 192, 64]), kwargs = {})
V0614 00:48:38.670000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.677000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_5 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_31, [0, 2, 1]), kwargs = {})
V0614 00:48:38.677000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.684000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_5 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_27, %permute_5), kwargs = {})
V0614 00:48:38.684000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.693000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.693000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_32 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_5, [1, 12, 9, 64, 192]), kwargs = {})
V0614 00:48:38.693000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.693000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_8 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_32, 0.125), kwargs = {})
V0614 00:48:38.693000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.694000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_75 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%mul, 2, 1, -1), kwargs = {})
V0614 00:48:38.694000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.697000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_7 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %slice_75), kwargs = {})
V0614 00:48:38.697000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.701000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_14 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_7, -10000.0), kwargs = {})
V0614 00:48:38.701000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.702000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_7 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_8, %mul_14), kwargs = {})
V0614 00:48:38.702000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.705000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_26 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_7, torch.bfloat16), kwargs = {})
V0614 00:48:38.705000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.706000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_10 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%select_6, 4), kwargs = {})
V0614 00:48:38.706000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.707000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_11 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_10, 5), kwargs = {})
V0614 00:48:38.707000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.708000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_12 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_11, [0, 1, 4, 5, 2, 3]), kwargs = {})
V0614 00:48:38.708000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.708000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_14 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_12, [1, 5, 0, 4, 2, 3]), kwargs = {})
V0614 00:48:38.708000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.709000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_38 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_14, [12, 64, 64]), kwargs = {})
V0614 00:48:38.709000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.710000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_7 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_33, %view_38), kwargs = {})
V0614 00:48:38.710000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.711000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.711000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_39 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_7, [12, 9, 64, 1, 1, 64]), kwargs = {})
V0614 00:48:38.711000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.711000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_15 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_39, [4, 0, 1, 2, 5, 3]), kwargs = {})
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_40 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_15, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_10 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_40, 0.125), kwargs = {})
V0614 00:48:38.712000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.713000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_13 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_34, 3), kwargs = {})
V0614 00:48:38.713000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.714000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_6 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %unsqueeze_13), kwargs = {})
V0614 00:48:38.714000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.715000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_13 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_6, -10000.0), kwargs = {})
V0614 00:48:38.715000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.715000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_6 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_10, %mul_13), kwargs = {})
V0614 00:48:38.716000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.717000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_25 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%add_6, torch.bfloat16), kwargs = {})
V0614 00:48:38.717000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.718000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_7 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%convert_element_type_24, %convert_element_type_23, %convert_element_type_26, %convert_element_type_25], -1), kwargs = {})
V0614 00:48:38.718000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.720000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_27 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%cat_7, torch.float32), kwargs = {})
V0614 00:48:38.721000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.722000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax_2 : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%convert_element_type_27, [-1], True), kwargs = {})
V0614 00:48:38.722000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.724000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_8 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%convert_element_type_27, %amax_2), kwargs = {})
V0614 00:48:38.724000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.727000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp_2 : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_8,), kwargs = {})
V0614 00:48:38.727000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.728000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_3 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp_2, [-1], True), kwargs = {})
V0614 00:48:38.728000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.729000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_4 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp_2, %sum_3), kwargs = {})
V0614 00:48:38.729000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function div at 0x7fe2165656c0>
V0614 00:48:38.731000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_28 : [num_users=4] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_4, torch.bfloat16), kwargs = {})
V0614 00:48:38.731000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.732000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_80 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%convert_element_type_28, 4, 64, 256), kwargs = {})
V0614 00:48:38.732000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.733000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_41 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%slice_80, [108, 64, 192]), kwargs = {})
V0614 00:48:38.734000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.735000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_48 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_6, 2, 1, -3), kwargs = {})
V0614 00:48:38.735000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.736000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_51 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_6, 2, 2, -2), kwargs = {})
V0614 00:48:38.736000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.736000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_54 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_6, 2, 3, -1), kwargs = {})
V0614 00:48:38.737000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.737000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_6 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_48, %slice_51, %slice_54], 3), kwargs = {})
V0614 00:48:38.737000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.738000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_42 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_6, [-1, 192, 64]), kwargs = {})
V0614 00:48:38.738000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.739000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_8 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_41, %view_42), kwargs = {})
V0614 00:48:38.739000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.748000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.748000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_43 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_8, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.748000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.748000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_85 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%convert_element_type_28, 4, 256, -64), kwargs = {})
V0614 00:48:38.748000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.749000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_44 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%slice_85, [108, 64, 192]), kwargs = {})
V0614 00:48:38.749000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.750000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_88 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%view_14, 2, 1, -1), kwargs = {})
V0614 00:48:38.750000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.751000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %clone_7 : [num_users=1] = call_function[target=torch.ops.aten.clone.default](args = (%slice_88,), kwargs = {memory_format: torch.contiguous_format})
V0614 00:48:38.751000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function clone at 0x7fe2179aac20>
V0614 00:48:38.754000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_45 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%clone_7, [108, 192, 64]), kwargs = {})
V0614 00:48:38.754000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.755000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_9 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_44, %view_45), kwargs = {})
V0614 00:48:38.755000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.767000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.767000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_46 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_9, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.767000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.768000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_43, %view_46), kwargs = {})
V0614 00:48:38.768000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.769000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_47 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%add_8, [108, 64, 64]), kwargs = {})
V0614 00:48:38.769000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.770000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_48 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%view_47, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.770000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.772000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_93 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%convert_element_type_28, 4, 0, 64), kwargs = {})
V0614 00:48:38.772000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.773000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_14 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_93, 5), kwargs = {})
V0614 00:48:38.773000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_16 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_14, [0, 1, 2, 3, 5, 4]), kwargs = {})
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_18 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_16, [1, 2, 3, 5, 0, 4]), kwargs = {})
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_49 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_18, [12, 576, 64]), kwargs = {})
V0614 00:48:38.774000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.775000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_15 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%select_8, 4), kwargs = {})
V0614 00:48:38.775000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.776000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_16 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_15, 5), kwargs = {})
V0614 00:48:38.776000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_17 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_16, [0, 1, 4, 5, 3, 2]), kwargs = {})
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_19 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_17, [1, 5, 0, 4, 2, 3]), kwargs = {})
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.777000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_50 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_19, [12, 64, 64]), kwargs = {})
V0614 00:48:38.778000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.778000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_10 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_49, %view_50), kwargs = {})
V0614 00:48:38.778000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.785000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.785000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_51 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_10, [12, 9, 64, 1, 1, 64]), kwargs = {})
V0614 00:48:38.785000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_20 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_51, [4, 0, 1, 2, 5, 3]), kwargs = {})
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_52 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_20, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.786000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_9 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_48, %view_52), kwargs = {})
V0614 00:48:38.787000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.790000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_53 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%add_9, [108, 64, 64]), kwargs = {})
V0614 00:48:38.790000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.792000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_54 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%view_53, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.792000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.795000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_100 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%convert_element_type_28, 4, -64, 9223372036854775807), kwargs = {})
V0614 00:48:38.795000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.796000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_17 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%slice_100, 5), kwargs = {})
V0614 00:48:38.796000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.797000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_21 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_17, [0, 1, 2, 3, 5, 4]), kwargs = {})
V0614 00:48:38.797000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.797000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_23 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_21, [1, 2, 3, 5, 0, 4]), kwargs = {})
V0614 00:48:38.797000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.798000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_55 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_23, [12, 576, 64]), kwargs = {})
V0614 00:48:38.798000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.798000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_18 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%select_11, 4), kwargs = {})
V0614 00:48:38.798000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.799000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_19 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%unsqueeze_18, 5), kwargs = {})
V0614 00:48:38.799000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.800000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_22 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%unsqueeze_19, [0, 1, 4, 5, 3, 2]), kwargs = {})
V0614 00:48:38.800000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.800000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_24 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%permute_22, [1, 5, 0, 4, 2, 3]), kwargs = {})
V0614 00:48:38.800000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.801000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_56 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_24, [12, 64, 64]), kwargs = {})
V0614 00:48:38.801000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.801000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_11 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_55, %view_56), kwargs = {})
V0614 00:48:38.801000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.808000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.808000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_57 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_11, [12, 9, 64, 1, 1, 64]), kwargs = {})
V0614 00:48:38.808000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_25 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_57, [4, 0, 1, 2, 5, 3]), kwargs = {})
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_58 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%permute_25, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.809000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_10 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%view_54, %view_58), kwargs = {})
V0614 00:48:38.810000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.814000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_59 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%add_10, [108, 64, 64]), kwargs = {})
V0614 00:48:38.814000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.818000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_60 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%view_59, [1, 12, 9, 64, 64]), kwargs = {})
V0614 00:48:38.818000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.822000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_29 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_4, 2, -2), kwargs = {})
V0614 00:48:38.822000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.824000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_61 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_29, [12, 64, 64]), kwargs = {})
V0614 00:48:38.824000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.824000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_20 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, -3), kwargs = {})
V0614 00:48:38.824000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.825000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_21 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_5, 2, -2), kwargs = {})
V0614 00:48:38.826000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.826000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_23 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_10, 2, -1), kwargs = {})
V0614 00:48:38.827000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.832000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_8 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%select_3, %select_20, %select_21, %select_6, %select_23], 2), kwargs = {})
V0614 00:48:38.832000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.833000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_62 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_8, [-1, 448, 64]), kwargs = {})
V0614 00:48:38.833000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.834000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_26 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%view_62, [0, 2, 1]), kwargs = {})
V0614 00:48:38.834000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.834000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_12 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_61, %permute_26), kwargs = {})
V0614 00:48:38.834000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.835000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.835000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_63 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_12, [1, 12, 64, 448]), kwargs = {})
V0614 00:48:38.835000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.836000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_63, 0.125), kwargs = {})
V0614 00:48:38.836000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.837000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %slice_132 : [num_users=1] = call_function[target=torch.ops.aten.slice.Tensor](args = (%arg16_1, 3, -192, 9223372036854775807), kwargs = {})
V0614 00:48:38.837000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function slice_ at 0x7fe2179c3490>
V0614 00:48:38.838000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_10 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%slice_68, %slice_132, %full_default], 3), kwargs = {})
V0614 00:48:38.838000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.839000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_30 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%mul, 2, -1), kwargs = {})
V0614 00:48:38.839000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.842000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_11 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%full_default_1, %select_30], 3), kwargs = {})
V0614 00:48:38.842000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.843000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %minimum_1 : [num_users=1] = call_function[target=torch.ops.aten.minimum.default](args = (%cat_10, %cat_11), kwargs = {})
V0614 00:48:38.843000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216577130>
V0614 00:48:38.844000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_9 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (1.0, %minimum_1), kwargs = {})
V0614 00:48:38.844000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.845000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_16 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%sub_9, -10000.0), kwargs = {})
V0614 00:48:38.845000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.846000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_11 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_15, %mul_16), kwargs = {})
V0614 00:48:38.846000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.848000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax_3 : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%add_11, [-1], True), kwargs = {})
V0614 00:48:38.848000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.849000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_10 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_11, %amax_3), kwargs = {})
V0614 00:48:38.849000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.852000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp_3 : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_10,), kwargs = {})
V0614 00:48:38.852000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.854000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_4 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp_3, [-1], True), kwargs = {})
V0614 00:48:38.854000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.855000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_5 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp_3, %sum_4), kwargs = {})
V0614 00:48:38.855000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function div at 0x7fe2165656c0>
V0614 00:48:38.857000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_41 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_5, torch.bfloat16), kwargs = {})
V0614 00:48:38.857000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.858000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_64 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_41, [-1, 64, 448]), kwargs = {})
V0614 00:48:38.858000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.859000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_25 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, -3), kwargs = {})
V0614 00:48:38.859000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.860000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_26 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_6, 2, -2), kwargs = {})
V0614 00:48:38.860000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.861000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_28 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_14, 2, -1), kwargs = {})
V0614 00:48:38.861000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.862000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_9 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%select_8, %select_25, %select_26, %select_11, %select_28], 2), kwargs = {})
V0614 00:48:38.862000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.863000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_65 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_9, [-1, 448, 64]), kwargs = {})
V0614 00:48:38.863000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.864000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_13 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_64, %view_65), kwargs = {})
V0614 00:48:38.864000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.864000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.864000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_66 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_13, [1, 12, 64, 64]), kwargs = {})
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_20 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_66, 2), kwargs = {})
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %select_31 : [num_users=1] = call_function[target=torch.ops.aten.select.int](args = (%view_4, 2, -1), kwargs = {})
V0614 00:48:38.865000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function select at 0x7fe2179c4550>
V0614 00:48:38.866000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_67 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%select_31, [12, 64, 64]), kwargs = {})
V0614 00:48:38.867000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.867000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_14 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_67, %permute_2), kwargs = {})
V0614 00:48:38.867000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.868000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.868000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_69 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_14, [1, 12, 64, 832]), kwargs = {})
V0614 00:48:38.868000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.869000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_17 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_69, 0.125), kwargs = {})
V0614 00:48:38.869000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.870000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %add_12 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_17, %mul_4), kwargs = {})
V0614 00:48:38.870000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216566ef0>
V0614 00:48:38.872000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %amax_4 : [num_users=1] = call_function[target=torch.ops.aten.amax.default](args = (%add_12, [-1], True), kwargs = {})
V0614 00:48:38.872000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_reduction.<locals>.inner at 0x7fe216566950>
V0614 00:48:38.872000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sub_12 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%add_12, %amax_4), kwargs = {})
V0614 00:48:38.873000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe216574280>
V0614 00:48:38.875000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %exp_4 : [num_users=2] = call_function[target=torch.ops.aten.exp.default](args = (%sub_12,), kwargs = {})
V0614 00:48:38.875000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function make_pointwise.<locals>.inner at 0x7fe2165672e0>
V0614 00:48:38.876000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %sum_5 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%exp_4, [-1], True), kwargs = {})
V0614 00:48:38.877000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function sum_ at 0x7fe216565a20>
V0614 00:48:38.878000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %div_6 : [num_users=1] = call_function[target=torch.ops.aten.div.Tensor](args = (%exp_4, %sum_5), kwargs = {})
V0614 00:48:38.878000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function div at 0x7fe2165656c0>
V0614 00:48:38.879000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %convert_element_type_48 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%div_6, torch.bfloat16), kwargs = {})
V0614 00:48:38.879000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function _convert_element_type at 0x7fe2179c13f0>
V0614 00:48:38.880000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_70 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%convert_element_type_48, [-1, 64, 832]), kwargs = {})
V0614 00:48:38.880000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.881000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %bmm_15 : [num_users=1] = call_function[target=torch.ops.aten.bmm.default](args = (%view_70, %view_19), kwargs = {})
V0614 00:48:38.881000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function tuned_bmm at 0x7fe1849af250>
V0614 00:48:38.882000 140616046391680 torch/_inductor/select_algorithm.py:1140] [15/0] Max autotune selects from 1 choices.
V0614 00:48:38.882000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_72 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%bmm_15, [1, 12, 64, 64]), kwargs = {})
V0614 00:48:38.882000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.883000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %unsqueeze_21 : [num_users=1] = call_function[target=torch.ops.aten.unsqueeze.default](args = (%view_72, 2), kwargs = {})
V0614 00:48:38.883000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function unsqueeze at 0x7fe2179c4af0>
V0614 00:48:38.883000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %cat_12 : [num_users=1] = call_function[target=torch.ops.aten.cat.default](args = ([%unsqueeze_4, %unsqueeze_5, %view_60, %unsqueeze_20, %unsqueeze_21], 2), kwargs = {})
V0614 00:48:38.883000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function cat at 0x7fe2179c40d0>
V0614 00:48:38.885000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %view_73 : [num_users=1] = call_function[target=torch.ops.aten.reshape.default](args = (%cat_12, [1, 12, 832, -1]), kwargs = {})
V0614 00:48:38.885000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function view at 0x7fe2179c3130>
V0614 00:48:38.885000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %mul_19 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%view_73, %arg18_1), kwargs = {})
V0614 00:48:38.886000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function mul at 0x7fe2165653f0>
V0614 00:48:38.887000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering %permute_28 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%mul_19, [0, 2, 1, 3]), kwargs = {})
V0614 00:48:38.887000 140616046391680 torch/_inductor/graph.py:976] [15/0] via <function permute at 0x7fe2179c3370>
V0614 00:48:38.889000 140616046391680 torch/_inductor/graph.py:1173] [15/0] lowering return (permute_28, unsqueeze)
V0614 00:48:38.894000 140616046391680 torch/_inductor/graph.py:1097] [15/0] Force channels last inputs for 0 conv for the current graph with id 8
W0614 00:48:39.040000 140616046391680 torch/fx/experimental/symbolic_shapes.py:4449] [15/0] indirect0 is not in var_ranges, defaulting to unknown range.
W0614 00:48:39.043000 140616046391680 torch/fx/experimental/symbolic_shapes.py:4449] [15/0] indirect0 is not in var_ranges, defaulting to unknown range.
W0614 00:48:40.026000 140616046391680 torch/fx/experimental/symbolic_shapes.py:4449] [15/0] q0 is not in var_ranges, defaulting to unknown range.
W0614 00:48:40.026000 140616046391680 torch/fx/experimental/symbolic_shapes.py:4449] [15/0] q1 is not in var_ranges, defaulting to unknown range.
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf0,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[53248, 832, 1]),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1]),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm}
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg14_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[64, 1, 768]),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm}
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm,
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm}
V0614 00:48:40.088000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, _ = index
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf0, r0 + 832 * i2 + 53248 * i1)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg16_1, r0)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp9
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 1],
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[832],
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=max,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=amax,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_4, amax, mul_3, add_2, sub}
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf0, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg16_1, i3)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.load(buf1, i2 + 64 * i1)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp11 = tmp9 - tmp10
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp12 = ops.exp(tmp11)
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp12
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 832],
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=exp,
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={add_2, sub_1, mul_4, mul_3, exp, sub}
V0614 00:48:40.089000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf3', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, _ = index
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf2, r0 + 832 * i2 + 53248 * i1)
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 1],
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[832],
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=sum,
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=sum_1,
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sum_1}
V0614 00:48:40.090000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf4', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf2, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf3, i2 + 64 * i1)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 832],
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_5,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={convert_element_type_5, div_2}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf5,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf4', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf2, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf3, i2 + 64 * i1)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 832],
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_5,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={convert_element_type_5, div_2}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[53248, 832, 1]),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_1}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg15_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 832, 64], stride=[64, 768, 1]),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_1}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_1,
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_1}
V0614 00:48:40.091000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf6', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg0_1, i1 + 3 * i0)
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf7', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg1_1, i1 + 3 * i0)
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf8', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg2_1, i1 + 3 * i0)
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.092000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf9', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg3_1, i1 + 3 * i0)
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf10', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg4_1, i1 + 3 * i0)
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf11', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg5_1, i1 + 3 * i0)
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.093000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf12', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg6_1, i1 + 3 * i0)
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf13', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg7_1, i1 + 3 * i0)
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf14', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg8_1, i1 + 3 * i0)
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf15', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg9_1, i1 + 3 * i0)
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.094000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf16', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg10_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf17', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg11_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf18', layout=FixedLayout('cpu', torch.int32, size=[132, 3], stride=[3, 1]), inputs=[ComputedBuffer(name='buf6', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg0_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf7', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg1_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf8', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg2_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf9', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg3_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf10', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg4_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf11', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg5_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf12', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg6_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf13', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg7_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf14', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg8_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf15', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg9_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf16', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg10_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf17', layout=NonOwningLayout('cpu', torch.int32, size=[11, 3], stride=[3, 1]), data=Pointwise(
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int32,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1 = index
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg11_1, i1 + 3 * i0)
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[11, 3],
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat}
V0614 00:48:40.095000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf19', layout=FixedLayout('cpu', torch.int64, size=[12, 11, 3], stride=[33, 3, 1]), data=Pointwise(
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.int64,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] i0, i1, i2 = index
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf18, i2 + 3 * i1 + 33 * i0)
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.to_dtype(tmp0, torch.int64, src_dtype=torch.int32)
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp1
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[12, 11, 3],
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={convert_element_type}
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf20', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.097000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf21', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf22', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.098000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf23', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.099000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf24', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.100000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf25', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf20', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf21', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf22', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf23', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf24', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.106000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf26,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 448, 1]),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1], offset=49152),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_2}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ConcatKernel(name='buf25', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf20', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf21', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf22', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf23', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf24', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_1}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 1, 64]),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_2}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_2,
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_2}
V0614 00:48:40.107000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf27', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg16_1, i3)
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 192],
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_3}
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf28', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 64], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg16_1, 768 + i3)
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 64],
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_3}
V0614 00:48:40.109000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf29', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 192],
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=full_default,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={full_default}
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf30', layout=FixedLayout('cpu', torch.float32, size=[1, 1, 1, 448], stride=[448, 448, 448, 1]), inputs=[ComputedBuffer(name='buf27', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg16_1, i3)
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 192],
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_3}
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf28', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 64], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg16_1, 768 + i3)
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 64],
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_3}
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf29', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 192],
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=full_default,
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={full_default}
V0614 00:48:40.110000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf31', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 256], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 256],
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=full_default_1,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={full_default_1}
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf32', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 192], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg13_1, 64 + i2)
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf19, ModularIndexing(i3 + 2112 * i1, 64, 396))
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.load(arg13_1, 64 * tmp1 + ModularIndexing(i3, 1, 64))
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp0 * tmp2
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 192],
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_4}
V0614 00:48:40.111000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf33', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), inputs=[ComputedBuffer(name='buf31', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 256], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 256],
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=full_default_1,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={full_default_1}
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf32', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 192], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg13_1, 64 + i2)
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf19, ModularIndexing(i3 + 2112 * i1, 64, 396))
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.load(arg13_1, 64 * tmp1 + ModularIndexing(i3, 1, 64))
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp0 * tmp2
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 192],
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_4}
V0614 00:48:40.113000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf34', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, _ = index
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf26, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(buf30, r0)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = ops.load(buf33, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.minimum(tmp5, tmp6)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp4 - tmp7
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = tmp8 * tmp9
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp11 = tmp3 + tmp10
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp11
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 1],
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[448],
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=max,
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=amax_1,
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_6, sub_2, minimum, amax_1, mul_5, add_3}
V0614 00:48:40.114000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf35', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf26, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(buf30, i3)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = ops.load(buf33, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.minimum(tmp5, tmp6)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp4 - tmp7
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = tmp8 * tmp9
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp11 = tmp3 + tmp10
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp12 = ops.load(buf34, i2 + 64 * i1)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp13 = tmp11 - tmp12
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp14 = ops.exp(tmp13)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp14
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 448],
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=exp_1,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_6, sub_2, minimum, exp_1, mul_5, add_3, sub_3}
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf36', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, _ = index
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf35, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 1],
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[448],
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=sum,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=sum_2,
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sum_2}
V0614 00:48:40.115000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf37', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf38', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.116000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf39', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf40', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf41', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.117000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf42', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf37', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf38', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf39', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf40', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf41', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.118000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf43', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf35, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf36, i2 + 64 * i1)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 448],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_12,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={convert_element_type_12, div_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf44,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf43', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf35, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf36, i2 + 64 * i1)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 448],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_12,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={convert_element_type_12, div_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 448, 1]),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ConcatKernel(name='buf42', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf37', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf38', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 49152 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf39', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 98304 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf40', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf41', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33))
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_2}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 448, 64], stride=[28672, 64, 1]),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_3,
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_3}
V0614 00:48:40.120000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf45,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[36864, 64, 1]),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[64, 768, 1], offset=98304),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_6}
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg14_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 1, 768]),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_6}
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_6,
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_6}
V0614 00:48:40.122000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf46', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf47', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.123000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf48', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.124000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf49', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), inputs=[ComputedBuffer(name='buf46', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf47', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf48', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf50', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[442368, 36864, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg12_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=clone_4,
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={clone_4}
V0614 00:48:40.125000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf51,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[12288, 192, 1]),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf50', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[442368, 36864, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg12_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=clone_4,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={clone_4}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[108, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_4}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ConcatKernel(name='buf49', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), inputs=[ComputedBuffer(name='buf46', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf47', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf48', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_5}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[12288, 1, 64]),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_4}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_4,
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_4}
V0614 00:48:40.126000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf52', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33))
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i4 + 64 * i3, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i4, 1, 64))
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 192, 64],
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=clone_6,
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={clone_6}
V0614 00:48:40.127000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf53,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[12288, 192, 1]),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf50', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[442368, 36864, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg12_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=clone_4,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={clone_4}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[108, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_5}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf52', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i4 + 64 * i3, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i4, 1, 64))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 192, 64],
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=clone_6,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={clone_6}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[12288, 1, 64]),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_5}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_5,
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_5}
V0614 00:48:40.129000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf54,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[36864, 64, 1]),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[64, 768, 1], offset=98304),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_7}
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg14_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 1, 768], offset=589824),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_7}
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_7,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_7}
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf55', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf45, i4 + 64 * i3 + 4096 * i2 + 36864 * i1)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg16_1, i4)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp10
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_24,
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sub_5, mul_12, mul_9, convert_element_type_24, add_5}
V0614 00:48:40.130000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf56', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 192], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf51, i4 + 192 * i3 + 12288 * i2 + 110592 * i1)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg17_1, i4 + 192 * i3 + 12288 * i2)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp10
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 192],
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_23,
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_11, convert_element_type_23, sub_4, mul_7, add_4}
V0614 00:48:40.131000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf57', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 192], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf53, i4 + 192 * i3 + 12288 * i2 + 110592 * i1)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg13_1, 128 + i3 + 64 * i2)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = ops.load(buf19, ModularIndexing(192 + i4 + 192 * i2 + 2112 * i1, 64, 396))
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg13_1, 64 * tmp6 + ModularIndexing(i4, 1, 64))
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp5 * tmp7
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp4 - tmp8
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp11 = tmp9 * tmp10
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp12 = tmp3 + tmp11
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp13 = ops.to_dtype(tmp12, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp13
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 192],
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_26,
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sub_7, convert_element_type_26, mul_8, add_7, mul_14}
V0614 00:48:40.132000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf58', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf54, i4 + 64 * i3 + 4096 * i2 + 36864 * i1)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg16_1, 768 + i4)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp10
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_25,
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_10, add_6, sub_6, mul_13, convert_element_type_25}
V0614 00:48:40.133000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf59', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), inputs=[ComputedBuffer(name='buf55', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf45, i4 + 64 * i3 + 4096 * i2 + 36864 * i1)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg16_1, i4)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp10
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_24,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sub_5, mul_12, mul_9, convert_element_type_24, add_5}
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf56', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 192], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf51, i4 + 192 * i3 + 12288 * i2 + 110592 * i1)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg17_1, i4 + 192 * i3 + 12288 * i2)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp10
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 192],
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_23,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_11, convert_element_type_23, sub_4, mul_7, add_4}
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf57', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 192], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf53, i4 + 192 * i3 + 12288 * i2 + 110592 * i1)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg13_1, 128 + i3 + 64 * i2)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = ops.load(buf19, ModularIndexing(192 + i4 + 192 * i2 + 2112 * i1, 64, 396))
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg13_1, 64 * tmp6 + ModularIndexing(i4, 1, 64))
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp5 * tmp7
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp4 - tmp8
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp11 = tmp9 * tmp10
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp12 = tmp3 + tmp11
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp13 = ops.to_dtype(tmp12, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp13
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 192],
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_26,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sub_7, convert_element_type_26, mul_8, add_7, mul_14}
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf58', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf54, i4 + 64 * i3 + 4096 * i2 + 36864 * i1)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg16_1, 768 + i4)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.to_dtype(tmp9, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp10
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_25,
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_10, add_6, sub_6, mul_13, convert_element_type_25}
V0614 00:48:40.134000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf60', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 9, 64, 1], stride=[6912, 576, 64, 1, 6912]), data=Reduction(
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, _ = index
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf59, r0 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp1
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 1],
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[512],
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=max,
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=amax_2,
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={amax_2, convert_element_type_27}
V0614 00:48:40.136000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf61', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf59, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.load(buf60, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 - tmp2
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.exp(tmp3)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp4
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=exp_2,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={exp_2, sub_8, convert_element_type_27}
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf62', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 9, 64, 1], stride=[6912, 576, 64, 1, 6912]), data=Reduction(
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, _ = index
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf61, r0 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 1],
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[512],
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=sum,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=sum_3,
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sum_3}
V0614 00:48:40.137000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf63', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf64', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.138000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf65', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf66', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), inputs=[ComputedBuffer(name='buf63', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf64', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf65', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.139000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_28,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_4, convert_element_type_28}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf68,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[108, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_28,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_4, convert_element_type_28}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[32768, 512, 1], offset=64),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_8}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ConcatKernel(name='buf66', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), inputs=[ComputedBuffer(name='buf63', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 49152 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf64', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 98304 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf65', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 147456 + i4 + 64 * i1 + 768 * i3 + 49152 * i2)
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 64],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_6}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[108, 192, 64], stride=[12288, 64, 1]),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_8}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_8,
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_8}
V0614 00:48:40.140000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf69', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33))
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i4 + 64 * i3, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i4, 1, 64))
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 192, 64],
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=clone_7,
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={clone_7}
V0614 00:48:40.142000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf70,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[108, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_28,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_4, convert_element_type_28}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[108, 64, 192], stride=[32768, 512, 1], offset=256),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_9}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf69', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 192, 64], stride=[1327104, 110592, 12288, 64, 1]), data=Pointwise(
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(12288 + i4 + 64 * i3 + 12288 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i4 + 64 * i3, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i4, 1, 64))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 192, 64],
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=clone_7,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={clone_7}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[108, 192, 64], stride=[12288, 64, 1]),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_9}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_9,
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_9}
V0614 00:48:40.143000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf71,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[36864, 64, 1]),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_28,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_4, convert_element_type_28}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[294912, 512, 1]),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_10}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg15_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1]),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_10}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_10,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_10}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf72,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[36864, 64, 1]),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf67', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 512], stride=[3538944, 294912, 32768, 512, 1]), data=Pointwise(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf61, i4 + 512 * i3 + 32768 * i2 + 294912 * i1)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf62, i3 + 64 * i2 + 576 * i1)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 9, 64, 512],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_28,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_4, convert_element_type_28}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 576, 64], stride=[294912, 512, 1], offset=448),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_11}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg15_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1], offset=589824),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_11}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_11,
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_11}
V0614 00:48:40.144000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf73', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf74', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.145000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf75', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf76', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.146000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf77', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.147000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf78', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf73', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf74', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf75', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf76', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf77', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.152000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf79,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 448, 1]),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1], offset=540672),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_12}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ConcatKernel(name='buf78', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf73', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf74', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf75', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf76', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg14_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf77', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg14_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_8}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 1, 64]),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_12}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_12,
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_12}
V0614 00:48:40.153000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf80', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 64], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg16_1, i3)
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 64],
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_10}
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf81', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg16_1, 640 + i3)
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 192],
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_10}
V0614 00:48:40.155000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf82', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 192],
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_10}
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf83', layout=FixedLayout('cpu', torch.float32, size=[1, 1, 1, 448], stride=[448, 448, 448, 1]), inputs=[ComputedBuffer(name='buf80', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 64], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg16_1, i3)
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 64],
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_10}
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf81', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg16_1, 640 + i3)
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 192],
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_10}
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf82', layout=NonOwningLayout('cpu', torch.float32, size=[1, 1, 1, 192], stride=[448, 448, 448, 1]), data=Pointwise(
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, _, _, i3 = index
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 1, 1, 192],
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_10}
V0614 00:48:40.156000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf84', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 256], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 256],
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_11}
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf85', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 192], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg13_1, 704 + i2)
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf19, ModularIndexing(1920 + i3 + 2112 * i1, 64, 396))
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.load(arg13_1, 64 * tmp1 + ModularIndexing(i3, 1, 64))
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp0 * tmp2
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 192],
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_11}
V0614 00:48:40.157000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf86', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), inputs=[ComputedBuffer(name='buf84', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 256], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.constant(1, torch.float32)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 256],
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_11}
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf85', layout=NonOwningLayout('cpu', torch.float32, size=[1, 12, 64, 192], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg13_1, 704 + i2)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf19, ModularIndexing(1920 + i3 + 2112 * i1, 64, 396))
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.load(arg13_1, 64 * tmp1 + ModularIndexing(i3, 1, 64))
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp0 * tmp2
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 192],
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_11}
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf87', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, _ = index
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf79, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(buf83, r0)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = ops.load(buf86, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.minimum(tmp5, tmp6)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp4 - tmp7
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = tmp8 * tmp9
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp11 = tmp3 + tmp10
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp11
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 1],
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[448],
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=max,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=amax_3,
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_16, sub_9, mul_15, add_11, amax_3, minimum_1}
V0614 00:48:40.159000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf88', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf79, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(buf83, i3)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = ops.load(buf86, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.minimum(tmp5, tmp6)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp4 - tmp7
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = tmp8 * tmp9
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp11 = tmp3 + tmp10
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp12 = ops.load(buf87, i2 + 64 * i1)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp13 = tmp11 - tmp12
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp14 = ops.exp(tmp13)
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp14
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 448],
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=exp_3,
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={exp_3, mul_16, sub_9, mul_15, sub_10, add_11, minimu...
V0614 00:48:40.160000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf89', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, _ = index
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf88, r0 + 448 * i2 + 28672 * i1)
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 1],
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[448],
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=sum,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=sum_4,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sum_4}
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf90', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.161000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf91', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf92', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf93', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.162000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf94', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.163000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ConcatKernel(name='buf95', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf90', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf91', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf92', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf93', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf94', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.164000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf96', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf88, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf89, i2 + 64 * i1)
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 448],
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_41,
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_5, convert_element_type_41}
V0614 00:48:40.165000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf97,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf96', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 448], stride=[344064, 28672, 448, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf88, i3 + 448 * i2 + 28672 * i1)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf89, i2 + 64 * i1)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 448],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_41,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_5, convert_element_type_41}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 448], stride=[28672, 448, 1]),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_13}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ConcatKernel(name='buf95', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 448, 64], stride=[344064, 28672, 64, 1]), inputs=[ComputedBuffer(name='buf90', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf91', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 491520 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf92', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 540672 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf93', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 64, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(arg15_1, 589824 + i3 + 64 * i1 + 768 * i2)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )), ComputedBuffer(name='buf94', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 192, 64], stride=[344064, 28672, 64, 1]), data=Pointwise(
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf19, 33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33))
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.index_expr(33 * ModularIndexing(122880 + i3 + 64 * i2 + 135168 * i1, 135168, 12) + ModularIndexing(122880 + i3 + 64 * i2, 4096, 33), dtype=torch.int64)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = ops.constant(33, torch.int64)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = tmp1 // tmp2
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(13, torch.int64)
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = tmp3 * tmp4
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp0 + tmp5
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.load(arg15_1, 64 * ModularIndexing(tmp6, 13, 12) + 768 * ModularIndexing(i3 + 64 * i2, 64, 64) + 49152 * ModularIndexing(tmp6, 1, 13) + ModularIndexing(i3, 1, 64))
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp7
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 192, 64],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_9}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))])
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 448, 64], stride=[28672, 64, 1]),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_13}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_13,
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_13}
V0614 00:48:40.166000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf98,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[53248, 832, 1]),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg12_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[64, 768, 1], offset=589824),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_14}
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg14_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[64, 1, 768]),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_14}
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_14,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_14}
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf99', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, _ = index
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf98, r0 + 832 * i2 + 53248 * i1)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg16_1, r0)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp9
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 1],
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[832],
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=max,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=amax_4,
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={mul_17, add_12, mul_4, amax_4, sub}
V0614 00:48:40.168000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf100', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf98, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.constant(0.125, torch.bfloat16)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 * tmp1
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.float32, src_dtype=torch.bfloat16)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = ops.constant(1.0, torch.float32)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(arg16_1, i3)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 - tmp5
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp7 = ops.constant(-10000.0, torch.float32)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp8 = tmp6 * tmp7
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp9 = tmp3 + tmp8
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp10 = ops.load(buf99, i2 + 64 * i1)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp11 = tmp9 - tmp10
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp12 = ops.exp(tmp11)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp12
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 832],
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=exp_4,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={exp_4, mul_17, add_12, mul_4, sub, sub_12}
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf101', layout=FixedLayout('cpu', torch.float32, size=[1, 12, 64, 1], stride=[768, 64, 1, 768]), data=Reduction(
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.float32,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index, rindex):
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, _ = index
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] r0 = rindex
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf100, r0 + 832 * i2 + 53248 * i1)
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 1],
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_ranges=[832],
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] reduction_type=sum,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=sum_5,
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={sum_5}
V0614 00:48:40.169000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf102', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf100, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf101, i2 + 64 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 832],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_48,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_6, convert_element_type_48}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ExternKernelOut(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name='extern_kernels.bmm',
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] name=buf103,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] layout=FixedLayout('cpu', torch.bfloat16, size=[12, 64, 64], stride=[4096, 64, 1]),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] inputs=[ReinterpretView(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ComputedBuffer(name='buf102', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 64, 832], stride=[638976, 53248, 832, 1]), data=Pointwise(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3 = index
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf100, i3 + 832 * i2 + 53248 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf101, i2 + 64 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 / tmp1
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.to_dtype(tmp2, torch.bfloat16, src_dtype=torch.float32)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp3
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 64, 832],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=convert_element_type_48,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={div_6, convert_element_type_48}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 64, 832], stride=[53248, 832, 1]),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_15}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ), ReinterpretView(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] StorageBox(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] InputBuffer(name='arg15_1', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 12, 832, 64], stride=[638976, 64, 768, 1]))
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] FixedLayout('cpu', torch.bfloat16, size=[12, 832, 64], stride=[64, 768, 1]),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_15}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] constant_args=(),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwargs={},
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] output_view=None,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] python_kernel_name=extern_kernels.bmm,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] cpp_kernel_name=at::bmm_out,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ordered_kwargs_for_cpp_kernel=(),
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] op_overload=None,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] arg_properties=[{}, {}],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] kwarg_properties=None,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] unbacked_bindings={},
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=bmm_15,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={bmm_15}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] )
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf104', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 1, 64, 64], stride=[638976, 53248, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, _, i3, i4 = index
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf5, i4 + 64 * i3 + 4096 * i1)
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 1, 64, 64],
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_12}
V0614 00:48:40.170000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf105', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 1, 64, 64], stride=[638976, 53248, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, _, i3, i4 = index
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf44, i4 + 64 * i3 + 4096 * i1)
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp0
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ,
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ranges=[1, 12, 1, 64, 64],
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origin_node=None,
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] origins={cat_12}
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] ))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] scheduling ComputedBuffer(name='buf106', layout=NonOwningLayout('cpu', torch.bfloat16, size=[1, 12, 9, 64, 64], stride=[638976, 53248, 4096, 64, 1]), data=Pointwise(
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] 'cpu',
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] torch.bfloat16,
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] def inner_fn(index):
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] _, i1, i2, i3, i4 = index
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp0 = ops.load(buf68, i4 + 64 * i3 + 4096 * ModularIndexing(ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9), 1, 9) + 36864 * ModularIndexing(9 * ModularIndexing(9 * ModularIndexing(i2 + 9 * i1, 9, 12) + ModularIndexing(i2, 1, 9), 9, 12) + ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9), 9, 12))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp1 = ops.load(buf70, i4 + 64 * i3 + 4096 * ModularIndexing(ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9), 1, 9) + 36864 * ModularIndexing(9 * ModularIndexing(9 * ModularIndexing(i2 + 9 * i1, 9, 12) + ModularIndexing(i2, 1, 9), 9, 12) + ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9), 9, 12))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp2 = tmp0 + tmp1
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp3 = ops.load(buf71, i4 + 64 * i3 + 4096 * ModularIndexing(ModularIndexing(i2, 1, 9), 1, 9) + 36864 * ModularIndexing(9 * ModularIndexing(i2 + 9 * i1, 9, 12) + ModularIndexing(i2, 1, 9), 9, 12))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp4 = tmp2 + tmp3
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp5 = ops.load(buf72, i4 + 64 * i3 + 4096 * ModularIndexing(i2, 1, 9) + 36864 * ModularIndexing(i2 + 9 * i1, 9, 12))
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] tmp6 = tmp4 + tmp5
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:1601] [15/0] return tmp6
V0614 00:48:40.171000 140616046391680 torch/_inductor/scheduler.py:160
View raw

(Sorry about that, but we can’t show files that are this big right now.)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment